From b547f93c1c07292cfcf5a6c5457093dca4be527c Mon Sep 17 00:00:00 2001 From: Jorgen Teig Date: Mon, 15 Aug 2022 10:27:52 +0200 Subject: [PATCH 001/509] Added command-line arguments and changeable defaults in the evaluation script --- tools/profiling/evaluation.py | 59 +++++++++++++++++++++++-------- tools/profiling/profileconfig.ini | 8 ++--- 2 files changed, 48 insertions(+), 19 deletions(-) diff --git a/tools/profiling/evaluation.py b/tools/profiling/evaluation.py index a3c7dfc8c1..4c2f5a6afa 100755 --- a/tools/profiling/evaluation.py +++ b/tools/profiling/evaluation.py @@ -14,11 +14,34 @@ #from matplotlib.offsetbox import TextArea, DrawingArea, OffsetImage, AnnotationBbox import seaborn as sns import configparser +import argparse import re import math +############################# +# # +# Argument defaults # +# # +############################# +physicsProcesses = ['ee_mumu', 'gg_tt', 'gg_ttg', 'gg_ttgg', 'gg_ttggg'] +reportPath = 'C:\\Users\\jteig\\cernbox\\Documents\\CERN\\reports\\v100s_Profiling'

+savePath = 'C:\\Users\\jteig\\cernbox\\Documents\\CERN\\Graphs\\'

+filePrefix = 'test_v100s_sycl-11.5'

+#############################

+parser = argparse.ArgumentParser(description='A program for profiling GPUs using MadGraph.')

+parser.add_argument("-p", help="Physics process used for making the graphs.", default=physicsProcesses[0], choices=physicsProcesses)
+parser.add_argument("-r", help="Path for the directory containing the reports.", default=reportPath)
+parser.add_argument("-s", help="Path for the directory where the graphs will be saved.", default=savePath)
+parser.add_argument("-n", help="The prefix in the name of the files of the reports e.g test_v100s_sycl-11.5.", default=filePrefix)

+args = parser.parse_args() class Evaluation: @@ -44,7 +67,7 @@ def load_df(self,path): listfolders = os.listdir() for datafolder in listfolders: - os.chdir(path+'/'+datafolder) #Jump in datafolder + os.chdir(path+'\\'+datafolder) #Jump in datafolder df_dict[datafolder]=pd.DataFrame() Data=pd.DataFrame() list_results =[] @@ -206,7 +229,9 @@ def plots(self,df,plotlist): #plt.rcParams['legend.title_fontsize']='large' #plt.text(16400, 250000, 'Here we have space for some\nfurther information like:\n\nCuda\nepoch2\ngg_ttgg',fontsize=25) plt.show() - fig.savefig('/home/andy/cernbox/data/raw/data'+'epoch2_ee_mumu_gcheck_float'+yaxis) + + # Savepath and physics process set by arguments + fig.savefig(args.s + args.p + '_' + yaxis) def data_compare(self,df_dict,compare_list,stat): #This function takes the dictionary of data frames and plots the selected df from the list @@ -368,28 +393,32 @@ def color(self,value): Ev.readConfig() #logo=mpimg.imread('/home/andy/cernbox/Madgraph/profiler/Logo/Logo_CERN.png') #imagebox=OffsetImage(logo) - path='/home/andy/cernbox/data/Andrea' + + # Gets directory containing the reports from -r argument + path = args.r + dataframes=Ev.load_df(path) #returns a directory that contains df for all data given in the path plotlist= [item for item in Ev.plot_confi['plots']if Ev.plot_confi['plots'][item] == 'on'] - - dataframes_conv=Ev.convertunits_2() #returns a df directory with converted units - dataframes_statisical=Ev.dataframes_statistical_transfomation(dataframes_conv,'max') - - ''' - Ev.plots(dataframes_conv['gcheck.exe_epoch1_cuda_ee_mumu_double'],plotlist) - ''' + + #''' Plot Graph + Ev.plots(dataframes_conv[args.n + '_' + args.p],plotlist) + #''' + + # Compare graphs + #dataframes_statisical=Ev.dataframes_statistical_transfomation(dataframes_conv,'max') +
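# (Sketch, not part of the patch: with the options introduced above, the script
# would be driven roughly as
#   python3 evaluation.py -p gg_ttgg -r /path/to/reports -s /path/to/graphs -n test_v100s_sycl-11.5
# where the report and graph paths are placeholders, not values from the
# repository; any option left out falls back to the hardcoded defaults at the
# top of the file.)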
#max(df_adj_units['EvtsPerSec[MatrixElems] (3)']) # To be done - list_to_compare=['gcheck.exe_epoch1_cuda_ee_mumu_float','gcheck.exe_epoch1_cuda_ee_mumu_double'] + #list_to_compare=['check.exe_epochx_cuda_ee_mumu_float','check.exe_epochx_cuda_ee_mumu_double'] #test_df=Ev.data_compare(dataframes_conv,list_to_compare,'max') - Ev.data_compare2(dataframes_statisical,list_to_compare) + #Ev.data_compare2(dataframes_statisical,list_to_compare) - dataframes_statisical[list(dataframes_statisical.keys())[0]] - dataframes_statisical[list(dataframes_statisical.keys())[0]]['gridsize'] - dataframes_statisical['gcheck.exe_epoch1_cuda_ee_mumu_float'].dtypes + #dataframes_statisical[list(dataframes_statisical.keys())[0]] + #dataframes_statisical[list(dataframes_statisical.keys())[0]]['gridsize'] + #dataframes_statisical['check.exe_epochx_cuda_ee_mumu_float'].dtypes \ No newline at end of file diff --git a/tools/profiling/profileconfig.ini b/tools/profiling/profileconfig.ini index a233430420..bd0557bae3 100755 --- a/tools/profiling/profileconfig.ini +++ b/tools/profiling/profileconfig.ini @@ -3,12 +3,12 @@ make = make #____________________________________________ #REMOVE # whether you want to execute gcheck.exe or ccheck.exe #sys = ccheck.exe -sys = gcheck.exe +#sys = gcheck.exe # check.exe still in development -#sys = check.exe +sys = check.exe #____________________________________________ -epoch = epoch2 -abstr_layer = cuda +epoch = epochx +abstr_layer = sycl process = ee_mumu sigma = P1_Sigma_sm_epem_mupmum #process = gg_ttgg From b1a6e5351438c5577f0e4659d960fc448f05d3c8 Mon Sep 17 00:00:00 2001 From: Jorgen Teig Date: Tue, 16 Aug 2022 14:51:31 +0200 Subject: [PATCH 002/509] Added functionality to switch between comparing graphs and plotting graphs; fixed the dataframe.append deprecation error --- tools/profiling/evaluation.py | 66 +++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/tools/profiling/evaluation.py b/tools/profiling/evaluation.py index 4c2f5a6afa..91aa4617c8 100755 --- a/tools/profiling/evaluation.py +++ b/tools/profiling/evaluation.py @@ -32,6 +32,16 @@ filePrefix = 'test_v100s_sycl-11.5' +############################# +# +# Compare graphs +# +############################# +compare = True +graphsToCompare = ['test_v100s_sycl-11.5_ee_mumu', 'test_v100s_sycl-11.5_gg_ttgg'] ############################# parser = argparse.ArgumentParser(description='A program for profiling GPUs using MadGraph.') @@ -40,9 +50,15 @@ parser.add_argument("-r", help="Path for the directory containing the reports.", default=reportPath) parser.add_argument("-s", help="Path for the directory where the graphs will be saved.", default=savePath) parser.add_argument("-n", help="The prefix in the name of the files of the reports e.g test_v100s_sycl-11.5.", default=filePrefix) +parser.add_argument("-c", help="Option for comparing graphs instead of plotting them.", default=compare) +parser.add_argument("-g", help="Graphs to use with the compare option.") args = parser.parse_args() +print(args.g, args.c) +#exit(0) class Evaluation: @@ -252,8 +268,8 @@ def data_compare(self,df_dict,compare_list,stat): if temp_df.empty: pass else: - df_to_be_plotted=df_to_be_plotted.append(temp_df[(temp_df['EvtsPerSec[MatrixElems] (3)'] - ==eval(stat)(temp_df['EvtsPerSec[MatrixElems] (3)']))]) + df_to_be_plotted = pd.concat([df_to_be_plotted, temp_df[(temp_df['EvtsPerSec[MatrixElems] (3)'] ==eval(stat)(temp_df['EvtsPerSec[MatrixElems] (3)']))]])
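# (Sketch, not part of the patch: DataFrame.append was deprecated in pandas 1.4
# and later removed in pandas 2.0, which is the deprecation this commit works
# around. The migration pattern applied here and in
# dataframes_statistical_transfomation below is, schematically,
#   df = df.append(rows)           # deprecated
#   df = pd.concat([df, rows])     # replacement
# where both calls return a new DataFrame rather than mutating df in place.)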
df_to_be_plotted=df_to_be_plotted.astype({'gridsize':int}) @@ -311,7 +327,7 @@ def data_compare2(self,df_dict,compare_list): #Add labels and title plt.ylabel('Throughput\nMatrix Elements [s-1]') plt.xlabel('Gridsize') - plt.title('Cuda throughput for ee_mumu on NVIDIA T4\n') + plt.title('SYCL throughput for ee_mumu VS gg_ttgg on NVIDIA v100s\n') #Change colormap. More info here https://matplotlib.org/stable/tutorials/colors/colormaps.html cmap=plt.get_cmap('Set1') @@ -339,9 +355,15 @@ def data_compare2(self,df_dict,compare_list): - ax1.legend(loc='upper left') + ax1.legend(loc='upper left') plt.show() + + graph1 = graphsToCompare[0].split('_') + + graph2 = graphsToCompare[1].split('_') + + fig.savefig(args.s + graph1[3] + '_' + graph1[4] + '_vs_' + graph2[3] + '_' + graph2[4]) def dataframes_statistical_transfomation(self,df_dict,stat): #This functions takes a dictionary of dataframes and returns a dictionary with dataframes @@ -359,8 +381,8 @@ def dataframes_statistical_transfomation(self,df_dict,stat): if temp_df.empty: pass else: - df_dict_to_return[df]=df_dict_to_return[df].append(temp_df[(temp_df['EvtsPerSec[MatrixElems] (3)'] - ==eval(stat)(temp_df['EvtsPerSec[MatrixElems] (3)']))]) + df_dict_to_return[df]=pd.concat([df_dict_to_return[df], temp_df[(temp_df['EvtsPerSec[MatrixElems] (3)'] + ==eval(stat)(temp_df['EvtsPerSec[MatrixElems] (3)']))]]) df_dict_to_return[df]=df_dict_to_return[df].astype({'gridsize':int}) return df_dict_to_return @@ -402,23 +424,23 @@ def color(self,value): dataframes_conv=Ev.convertunits_2() #returns a df directory with converted units - #''' Plot Graph - Ev.plots(dataframes_conv[args.n + '_' + args.p],plotlist) - #''' - - # Compare graphs - #dataframes_statisical=Ev.dataframes_statistical_transfomation(dataframes_conv,'max') + if not compare: - #max(df_adj_units['EvtsPerSec[MatrixElems] (3)']) - # To be done - #list_to_compare=['check.exe_epochx_cuda_ee_mumu_float','check.exe_epochx_cuda_ee_mumu_double'] - #test_df=Ev.data_compare(dataframes_conv,list_to_compare,'max') + # Plots the graphs in the supplied directories with the info from the config file + Ev.plots(dataframes_conv[args.n + '_' + args.p],plotlist) - #Ev.data_compare2(dataframes_statisical,list_to_compare) - - #dataframes_statisical[list(dataframes_statisical.keys())[0]] - #dataframes_statisical[list(dataframes_statisical.keys())[0]]['gridsize'] - #dataframes_statisical['check.exe_epochx_cuda_ee_mumu_float'].dtypes + else: + # Compare graphs + dataframes_statisical=Ev.dataframes_statistical_transfomation(dataframes_conv,'max') + + #max(df_adj_units['EvtsPerSec[MatrixElems] (3)']) + # To be done + #test_df=Ev.data_compare(dataframes_conv,list_to_compare,'max') + print(dataframes_statisical) + + Ev.data_compare2(dataframes_statisical,graphsToCompare) - \ No newline at end of file + #dataframes_statisical[list(dataframes_statisical.keys())[0]] + #dataframes_statisical[list(dataframes_statisical.keys())[0]]['gridsize'] + #dataframes_statisical['check.exe_epochx_cuda_ee_mumu_float'].dtypes \ No newline at end of file From 466affd81191649344fbe0b541b2e90edff3fa7a Mon Sep 17 00:00:00 2001 From: Jorgen Teig Date: Wed, 14 Sep 2022 09:52:24 +0200 Subject: [PATCH 003/509] Added arguments to choose if you want to compare data in a plot or make plots and added more argument/default functionality instead of hardcoding variables in code --- tools/profiling/evaluation.py | 62 ++++++++++++++++++++++--------- tools/profiling/profileconfig.ini | 1 + 2 files changed, 45 insertions(+), 18 deletions(-) diff --git 
a/tools/profiling/evaluation.py b/tools/profiling/evaluation.py index 91aa4617c8..f971a505a8 100755 --- a/tools/profiling/evaluation.py +++ b/tools/profiling/evaluation.py @@ -26,11 +26,15 @@ physicsProcesses = ['ee_mumu', 'gg_tt', 'gg_ttg', 'gg_ttgg', 'gg_ttggg'] -reportPath = 'C:\\Users\\jteig\\cernbox\\Documents\\CERN\\reports\\v100s_Profiling' +reportPath = 'C:\\Users\\jteig\\cernbox\\Documents\\CERN\\reports\\Sycl_v100s_Profiling_06.09.22_GCC10.3_CUDA11.5\\' savePath = 'C:\\Users\\jteig\\cernbox\\Documents\\CERN\\Graphs\\' -filePrefix = 'test_v100s_sycl-11.5' +filePrefix = 'test_v100s_sycl_11.5' + +# 'test_v100s_sycl_11.5' + +hardware = 'NVIDIA v100s' ############################# # @@ -38,9 +42,9 @@ # ############################# -compare = True +compare = False -graphsToCompare = ['test_v100s_sycl-11.5_ee_mumu', 'test_v100s_sycl-11.5_gg_ttgg'] +graphsToCompare = ['test_v100s_sycl_11.5_gg_ttggg', 'test_v100s_cuda_11.5_gg_ttggg'] ############################# @@ -51,12 +55,11 @@ parser.add_argument("-s", help="Path for the directory where the graphs will be saved.", default=savePath) parser.add_argument("-n", help="The prefix in the name of the files of the reports e.g test_v100s_sycl-11.5.", default=filePrefix) parser.add_argument("-c", help="Option for comparing graphs instead of plotting them.", default=compare) +parser.add_argument("-d", help="What device/hardware has been used in the profiling, used as a descriptor in the plots", default=hardware) parser.add_argument("-g", help="Graphs to use with the compare option.") args = parser.parse_args() -print(args.g, args.c) - #exit(0) class Evaluation: @@ -162,10 +165,10 @@ def convertunits_2(self): temp_df =pd.DataFrame() temp_df = dataframes[df][['NumIterations','NumThreadsPerBlock', 'NumBlocksPerGrid', 'EvtsPerSec[MatrixElems] (3)','EvtsPerSec[Rnd+Rmb+ME](123)', - 'EvtsPerSec[Rmb+ME] (23)']] + 'EvtsPerSec[Rmb+ME] (23)', 'EvtsPerSec[MECalcOnly] (3)']] columns_to_convert = ['EvtsPerSec[MatrixElems] (3)','EvtsPerSec[Rnd+Rmb+ME](123)', - 'EvtsPerSec[Rmb+ME] (23)'] + 'EvtsPerSec[Rmb+ME] (23)', 'EvtsPerSec[MECalcOnly] (3)'] for column in columns_to_convert: for val in range(len(temp_df[column])): @@ -297,6 +300,12 @@ def data_compare(self,df_dict,compare_list,stat): def data_compare2(self,df_dict,compare_list): + + # Get names of files to compare + + graph1 = graphsToCompare[0].split('_') + graph2 = graphsToCompare[1].split('_') + #Takes a dictionary with dataframes and plots it in the same scatter plot fig = plt.figure() @@ -319,31 +328,40 @@ def data_compare2(self,df_dict,compare_list): #setup y axis #get maximum value of all df for ylim - max_y = [max(df_dict[df]['EvtsPerSec[MatrixElems] (3)']) for df in df_dict] + #max_y = max(df_dict[compare_list[0]]['EvtsPerSec[MatrixElems] (3)'], df_dict[compare_list[1]]['EvtsPerSec[MatrixElems] (3)']) + + #print(max_y) + + #min_y = [min(df_dict[df]['EvtsPerSec[MatrixElems] (3)']) for df in df_dict] + #plt.ylim(-0.1*10**9,max(max_y)*1.3) - plt.ylim(10**5,max(max_y)*10) + #plt.ylim(min(min_y),max(max_y)*10) ax1.set_yscale('log') #Add labels and title - plt.ylabel('Throughput\nMatrix Elements [s-1]') + plt.ylabel('Throughput\nME Calc Only [s-1]') plt.xlabel('Gridsize') - plt.title('SYCL throughput for ee_mumu VS gg_ttgg on NVIDIA v100s\n') + plt.title("SYCL vs CUDA throughput for "+ graph1[4] + '_' + graph1[5] +" on " + hardware + "\n") #Change colormap. 
More info here https://matplotlib.org/stable/tutorials/colors/colormaps.html cmap=plt.get_cmap('Set1') i=1 for data in compare_list: + + tempVar = 'EvtsPerSec[MECalcOnly] (3)' + #tempVar2 = 'EvtsPerSec[MatrixElems] (3)' + #Get maximum values for each dataset - maxima_y=max(df_dict[data]['EvtsPerSec[MatrixElems] (3)']) - maxima_x=df_dict[data].loc[df_dict[data]['EvtsPerSec[MatrixElems] (3)']==maxima_y,'gridsize'].item() + maxima_y=max(df_dict[data][tempVar]) + maxima_x=df_dict[data].loc[df_dict[data][tempVar]==maxima_y,'gridsize'].item() #label maximum values length=len(str(maxima_y))-1 label_maximas=str(round(maxima_y*10**-(length),3))+'e'+str(length) #plot datasets - ax1.scatter(df_dict[data]['gridsize'].to_list(),df_dict[data]['EvtsPerSec[MatrixElems] (3)'].to_list(), + ax1.scatter(df_dict[data]['gridsize'].to_list(),df_dict[data][tempVar].to_list(), label=data+ ' (max = %s)'%label_maximas, color=cmap(i), s=150,alpha=0.9) @@ -355,7 +373,11 @@ def data_compare2(self,df_dict,compare_list): - ax1.legend(loc='upper left') + ax1.legend(loc='best') + + plt.tight_layout() + plt.autoscale() + plt.show() @@ -363,7 +385,9 @@ def data_compare2(self,df_dict,compare_list): graph2 = graphsToCompare[1].split('_') - fig.savefig(args.s + graph1[3] + '_' + graph1[4] + '_vs_' + graph2[3] + '_' + graph2[4]) + # args.s + graph1[3] + '_' + graph1[4] + '_vs_' + graph2[3] + '_' + graph2[4] + + fig.savefig(args.s + 'SYCL_' + graph1[4] + '_' + graph1[5] + '_vs_CUDA_' + graph2[4] + '_' + graph2[5] + 'MECalcOnly.png') def dataframes_statistical_transfomation(self,df_dict,stat): #This functions takes a dictionary of dataframes and returns a dictionary with dataframes @@ -426,6 +450,8 @@ def color(self,value): if not compare: + print(dataframes_conv) + # Plots the graphs in the supplied directories with the info from the config file Ev.plots(dataframes_conv[args.n + '_' + args.p],plotlist) @@ -437,7 +463,7 @@ def color(self,value): # To be done #test_df=Ev.data_compare(dataframes_conv,list_to_compare,'max') - print(dataframes_statisical) + #print(dataframes_statisical) Ev.data_compare2(dataframes_statisical,graphsToCompare) diff --git a/tools/profiling/profileconfig.ini b/tools/profiling/profileconfig.ini index bd0557bae3..122aa9abbf 100755 --- a/tools/profiling/profileconfig.ini +++ b/tools/profiling/profileconfig.ini @@ -30,6 +30,7 @@ threads_max = 4 EvtsPerSec[Rnd+Rmb+ME](123) = on EvtsPerSec[Rmb+ME] (23) = on EvtsPerSec[MatrixElems] (3) = on +EvtsPerSec[MECalcOnly] (3) = on NumMatrixElements(notNan) = off MatrixElemEventsPerSec = off NumIterations = off From 643f5cc2ae6f38267d2558c4f532226d0372ef7a Mon Sep 17 00:00:00 2001 From: Jorgen Teig Date: Tue, 20 Sep 2022 11:01:19 +0200 Subject: [PATCH 004/509] Added shell scripts for building cuda and sycl to be used in the python performance profiler --- tools/profiling/buildCUDAProcess.sh | 87 ++++++++++++++++++++++++++++ tools/profiling/buildSYCLProcess.sh | 88 +++++++++++++++++++++++++++++ tools/profiling/evaluation.py | 6 +- tools/profiling/profileconfig.ini | 4 +- 4 files changed, 180 insertions(+), 5 deletions(-) create mode 100644 tools/profiling/buildCUDAProcess.sh create mode 100644 tools/profiling/buildSYCLProcess.sh diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh new file mode 100644 index 0000000000..6739b0f3f0 --- /dev/null +++ b/tools/profiling/buildCUDAProcess.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +helpFunction() +{ + echo "" + echo "Usage: $0 -n gg_ttgg -b 1024 -t 128 -i 10" + echo -e "\t-n Name of the physics process 
being built and run" + echo -e "\t-b Blocks per grid" + echo -e "\t-t Threads per block" + echo -e "\t-i Iterations" + exit 1 # Exit script after printing help +} + +while getopts "n:b:t:i:" opt +do + case "$opt" in + n ) MG_PROC="$OPTARG" ;; #process to target + b ) blocksPerGrid="$OPTARG" ;; + t ) threadsPerBlock="$OPTARG" ;; + i ) iterations="$OPTARG" ;; + ? ) helpFunction ;; # Print helpFunction in case parameter is non-existent + esac +done + +# Print helpFunction in case parameters are empty + +if [ -z "${MG_PROC}" ] || [ -z "${blocksPerGrid}" ] || [ -z "${threadsPerBlock}" ] || [ -z "${iterations}" ] +then + echo "Some or all of the parameters are empty"; + helpFunction +fi + +# Begin script in case all parameters are correct + +################################################################## + +# Set user specific variables + +prefix=/afs/cern.ch/work/j/jteig +export CUDA_HOME=/usr/local/cuda-11.5/ +export FC=`which gfortran` + +# Set up compiler and compile options + +export USEBUILDDIR=1 +export NTPBMAX=1024 +export MG_EXE="./gcheck.exe" +export WORKSPACE=$prefix/workspace_mg4gpu +export NAME_PREFIX="cudacpp_v100s_cuda_11.5" + +################################################################## + +# Sets CUDA in PATH + +export PATH=$CUDA_HOME:$PATH + +# Finds correct subprocess + +case $MG_PROC in + ee_mumu ) export MG_SUBPROC="P1_Sigma_sm_epem_mupmum" ;; + gg_tt ) export MG_SUBPROC="P1_Sigma_sm_gg_ttx" ;; + gg_ttg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxg" ;; + gg_ttgg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxgg" ;; + gg_ttggg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxggg" ;; +esac + +# Makes workspace in correct folder + +export MG_PROC_DIR=$prefix/madgraph4gpu/epochX/cudacpp/$MG_PROC +export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/$MG_SUBPROC +export MG5AMC_CARD_PATH=$MG_PROC_DIR/Cards + +mkdir -p $MG_SP_DIR/perf/data + +# Build executable + +cd $MG_SP_DIR +make + +# Run executable + +$MG_EXE -j $blocksPerGrid $threadsPerBlock $iterations + +# Move JSON report to workspace + +cd $MG_SP_DIR/pref/data/ +mv 0-perf-test-run0.json ${WORKSPACE}/test_${NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json \ No newline at end of file diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh new file mode 100644 index 0000000000..d830f7ad9c --- /dev/null +++ b/tools/profiling/buildSYCLProcess.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +helpFunction() +{ + echo "" + echo "Usage: $0 -n gg_ttgg -b 1024 -t 128 -i 10" + echo -e "\t-n Name of the physics process being built and run" + echo -e "\t-b Blocks per grid" + echo -e "\t-t Threads per block" + echo -e "\t-i Iterations" + exit 1 # Exit script after printing help +} + +while getopts "n:b:t:i:" opt +do + case "$opt" in + n ) MG_PROC="$OPTARG" ;; #process to target + b ) blocksPerGrid="$OPTARG" ;; + t ) threadsPerBlock="$OPTARG" ;; + i ) iterations="$OPTARG" ;; + ? 
) helpFunction ;; # Print helpFunction in case parameter is non-existent + esac +done + +# Print helpFunction in case parameters are empty +if [ -z "${MG_PROC}" ] || [ -z "${blocksPerGrid}" ] || [ -z "${threadsPerBlock}" ] || [ -z "${iterations}" ] +then + echo "Some or all of the parameters are empty"; + helpFunction +fi + +# Begin script in case all parameters are correct + +# Set user/SYCL-flags variables +prefix=/p/project/prpb109 +export DPCPP_HOME=/p/project/prpb109/sycl_workspace +export CUDA_PATH=/p/software/juwelsbooster/stages/2022/software/CUDA/11.5 +# export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -target-backend '--cuda-gpu-arch=sm_80' -fgpu-rdc --cuda-path=$CUDA_PATH" +export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=sm_80' -fgpu-rdc --cuda-path=$CUDA_PATH" +export GPU_VERSION="sycl_v100_cuda_11.5_gcc_10.3" +export WORKSPACE=$prefix/workspace_mg4gpu + +# Finds correct subprocess +case $MG_PROC in + ee_mumu ) export MG_SUBPROC="P1_Sigma_sm_epem_mupmum" ;; + gg_tt ) export MG_SUBPROC="P1_Sigma_sm_gg_ttx" ;; + gg_ttg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxg" ;; + gg_ttgg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxgg" ;; + gg_ttggg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxggg" ;; +esac + +export DEVICE_ID=0 #if unknown set at the run step after running LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 1024 128 10 + +# Set up compiler and compile options +export USEBUILDDIR=1 +export NTPBMAX=1024 +export CXX=$DPCPP_HOME/llvm/build/bin/clang++ + +mkdir -p $WORKSPACE/mg4gpu/lib +mkdir -p $WORKSPACE/mg4gpu/bin + +export MG4GPU_LIB=$WORKSPACE/mg4gpu/lib +export MG4GPU_BIN=$WORKSPACE/mg4gpu/bin + +export MG_PROC_DIR=$prefix/madgraph4gpu/epochX/sycl/$MG_PROC +export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/$MG_SUBPROC + +export MG_LIBS_DIR="${MG4GPU_LIB}/build_${MG_PROC}_${MG_SUBPROC}_${GPU_VERSION}" +export MG_LIBS="$DPCPP_HOME/llvm/build/lib:$MG_LIBS_DIR" + +export MG_EXE_DIR="${MG4GPU_BIN}/build_${MG_PROC}_${MG_SUBPROC}_${GPU_VERSION}" +export MG_EXE="$MG_EXE_DIR/check.exe" +export MG5AMC_CARD_PATH=$MG_PROC_DIR/Cards + +# Build executable +cd $MG_SP_DIR +make +mv ../../lib/build.d_inl0/ $MG_LIBS_DIR #2>/dev/null; true +mv build.d_inl0/ $MG_EXE_DIR #2>/dev/null; true + +# Run executable +cd $WORKSPACE +#test=LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 1024 128 10 +#echo $test +LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file $WORKSPACE/test_${GPU_VERSION}_${MG_PROC}_${MG_SUBPROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/$ + +# View output +#nano $WORKSPACE/test_${GPU_VERSION}_${MG_PROC}_${MG_SUBPROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json-+ \ No newline at end of file diff --git a/tools/profiling/evaluation.py b/tools/profiling/evaluation.py index f971a505a8..fc152c7681 100755 --- a/tools/profiling/evaluation.py +++ b/tools/profiling/evaluation.py @@ -26,11 +26,11 @@ physicsProcesses = ['ee_mumu', 'gg_tt', 'gg_ttg', 'gg_ttgg', 'gg_ttggg'] -reportPath = 'C:\\Users\\jteig\\cernbox\\Documents\\CERN\\reports\\Sycl_v100s_Profiling_06.09.22_GCC10.3_CUDA11.5\\' +reportPath = 'C:\\Users\\jteig\\cernbox\\Documents\\CERN\\reports\\CUDA_v100s_Profiling_12.08_GCC10.3_CUDA11.5\\' savePath = 'C:\\Users\\jteig\\cernbox\\Documents\\CERN\\Graphs\\' -filePrefix = 'test_v100s_sycl_11.5' +filePrefix = 'test_v100s_cuda_11.5' # 'test_v100s_sycl_11.5' @@ 
-44,7 +44,7 @@ compare = False -graphsToCompare = ['test_v100s_sycl_11.5_gg_ttggg', 'test_v100s_cuda_11.5_gg_ttggg'] +graphsToCompare = ['test_v100s_cuda_11.5_gg_ttggg', 'test_v100s_cuda_11.5_gg_ttggg'] ############################# diff --git a/tools/profiling/profileconfig.ini b/tools/profiling/profileconfig.ini index 122aa9abbf..09ccf41895 100755 --- a/tools/profiling/profileconfig.ini +++ b/tools/profiling/profileconfig.ini @@ -27,8 +27,8 @@ threads_max = 4 # creats a plot with (NumThreadsPerBlock * BlocksPerGrid) # # on the x-axis # ################################################################## -EvtsPerSec[Rnd+Rmb+ME](123) = on -EvtsPerSec[Rmb+ME] (23) = on +EvtsPerSec[Rnd+Rmb+ME](123) = off +EvtsPerSec[Rmb+ME] (23) = off EvtsPerSec[MatrixElems] (3) = on EvtsPerSec[MECalcOnly] (3) = on NumMatrixElements(notNan) = off From f90dab06c5ff0703298d2c318cd26aa941c679ea Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Wed, 19 Oct 2022 11:49:54 +0200 Subject: [PATCH 005/509] Less hardcoding in evaluation script --- tools/profiling/evaluation.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/tools/profiling/evaluation.py b/tools/profiling/evaluation.py index fc152c7681..bce1a76e38 100755 --- a/tools/profiling/evaluation.py +++ b/tools/profiling/evaluation.py @@ -26,15 +26,16 @@ physicsProcesses = ['ee_mumu', 'gg_tt', 'gg_ttg', 'gg_ttgg', 'gg_ttggg'] -reportPath = 'C:\\Users\\jteig\\cernbox\\Documents\\CERN\\reports\\CUDA_v100s_Profiling_12.08_GCC10.3_CUDA11.5\\' +reportPath = 'C:\\Users\\jteig\\cernbox\\Documents\\CERN\\reports\\Sycl_v100s_Profiling_18.10.GCC11.3_CUDA11.6.2_MASTER\\' savePath = 'C:\\Users\\jteig\\cernbox\\Documents\\CERN\\Graphs\\' -filePrefix = 'test_v100s_cuda_11.5' +filePrefix = 'test_ATS-P_sycl_11.5' # 'test_v100s_sycl_11.5' -hardware = 'NVIDIA v100s' +hardware = 'ATS-P' +#hardware = 'NVIDIA v100s' ############################# # @@ -44,7 +45,10 @@ compare = False -graphsToCompare = ['test_v100s_cuda_11.5_gg_ttggg', 'test_v100s_cuda_11.5_gg_ttggg'] +graphsToCompare = ['test_v100s_cuda_11.5_gg_ttgg', 'test_v100s_sycl_11.5_gg_ttgg'] + +stat = 'MECalcOnly' +#stat = 'MatrixElems' ############################# @@ -198,8 +202,9 @@ def plots(self,df,plotlist): ax.spines['top'].set_visible(False) #enable grid - plt.rcParams['grid.linestyle']=':' - ax.yaxis.grid() + #plt.rcParams['grid.linestyle']=':' + #ax.yaxis.grid() + plt.grid(which='both',axis = 'y') #setup x-axis ax.set_xscale('log') @@ -217,7 +222,9 @@ def plots(self,df,plotlist): #Labels and titel plt.xlabel('Gridsize',fontsize=15) plt.ylabel('Troughput\n'+yaxis,fontsize=13.5) - plt.title(yaxis,fontsize=15) + + + plt.title('SYCL (GCC 11.3) on ATS-P',fontsize=15) # plt.ylabel(yaxis,fontsize=30) # plt.xlabel('NumThreadsPerBlock*NumBlocksPerGrid',fontsize=30) @@ -249,6 +256,10 @@ def plots(self,df,plotlist): #plt.text(16400, 250000, 'Here we have space for some\nfurther information like:\n\nCuda\nepoch2\ngg_ttgg',fontsize=25) plt.show() + # Adjusts labels to fit + plt.tight_layout() + plt.autoscale() + # Savepath and physics process set by arguments fig.savefig(args.s + args.p + '_' + yaxis) @@ -339,8 +350,8 @@ def data_compare2(self,df_dict,compare_list): ax1.set_yscale('log') #Add labels and title - plt.ylabel('Throughput\nME Calc Only [s-1]') - plt.xlabel('Gridsize') + plt.ylabel('Throughput\n'+ stat +' [s-1]') + plt.xlabel('Gridsize (nBlocksGPU * nThreadsGPU)') plt.title("SYCL vs CUDA throughput for "+ graph1[4] + '_' + graph1[5] +" on " + hardware + "\n") #Change 
colormap. More info here https://matplotlib.org/stable/tutorials/colors/colormaps.html @@ -349,8 +360,7 @@ def data_compare2(self,df_dict,compare_list): i=1 for data in compare_list: - tempVar = 'EvtsPerSec[MECalcOnly] (3)' - #tempVar2 = 'EvtsPerSec[MatrixElems] (3)' + tempVar = 'EvtsPerSec['+ stat +'] (3)' #Get maximum values for each dataset maxima_y=max(df_dict[data][tempVar]) @@ -387,7 +397,7 @@ def data_compare2(self,df_dict,compare_list): # args.s + graph1[3] + '_' + graph1[4] + '_vs_' + graph2[3] + '_' + graph2[4] - fig.savefig(args.s + 'SYCL_' + graph1[4] + '_' + graph1[5] + '_vs_CUDA_' + graph2[4] + '_' + graph2[5] + 'MECalcOnly.png') + fig.savefig(args.s + 'SYCL_' + graph1[4] + '_' + graph1[5] + '_vs_CUDA_' + graph2[4] + '_' + graph2[5] + '_' + stat +'.png') def dataframes_statistical_transfomation(self,df_dict,stat): #This functions takes a dictionary of dataframes and returns a dictionary with dataframes From fd7ed413ed60d8f43471a69839f0da7f5670e806 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Wed, 19 Oct 2022 11:51:36 +0200 Subject: [PATCH 006/509] Added profiler script to run the CUDA/SYCL scripts --- tools/profiling/performanceProfiler.py | 44 ++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 tools/profiling/performanceProfiler.py diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py new file mode 100644 index 0000000000..c37a3a0549 --- /dev/null +++ b/tools/profiling/performanceProfiler.py @@ -0,0 +1,44 @@ +import sys +import subprocess +import datetime + +# Required info +absLayer = "CUDA" +doublePrecisionConstant = 2560 +mgProcesses = ["ee_mumu", "gg_tt", "gg_ttg", "gg_ttgg", "gg_ttggg"] +iterations = 10 +threadsPerBlock = [32, 64, 128, 256] +blocksPerGrid = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] + +# How many runs in total the program made +count = 0 + +for process in mgProcesses: + for TPB in threadsPerBlock: + for BPG in blocksPerGrid: + if (TPB * BPG > doublePrecisionConstant): + + if absLayer.upper() == 'SYCL': + args = ["./buildMadGraphFile.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)] + + elif absLayer.upper() == 'CUDA': + + ### Used in br_golden_epochX4 branch + #if ".sa" not in process: + # process = process + ".sa" + ### + + args = ["./buildCUDAFile.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)] + + else: sys.exit("No abstraction layer selected") + + print(str(datetime.datetime.now().strftime("%H:%M:%S")) + " Started " + process + " with TPB("+ str(TPB) +") * BPG("+ str(BPG) +"): " + str(TPB * BPG) + "!") + + build = subprocess.run(args, stdout=subprocess.DEVNULL) + if build.returncode != 0: + print(str(datetime.datetime.now().strftime("%H:%M:%S")) + " " + process + " FAILED!, threadsPerBlock: " + str(TPB) + ", blocksPerGrid: " + str(BPG) + ", Product: " + str(TPB * BPG)) + else: + print(str(datetime.datetime.now().strftime("%H:%M:%S")) + " " + process + " COMPLETED!, threadsPerBlock: " + str(TPB) + ", blocksPerGrid: " + str(BPG) + ", Product: " + str(TPB * BPG)) + count += 1 + +print("Builded " + str(count) + " processes!") \ No newline at end of file From 03cf7a36dcc77a778ddf504b1d368885fea2830b Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Wed, 19 Oct 2022 11:53:18 +0200 Subject: [PATCH 007/509] Fixed bug in SYCL script --- tools/profiling/buildSYCLProcess.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 
d830f7ad9c..08343c37ce 100644 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -80,9 +80,12 @@ mv build.d_inl0/ $MG_EXE_DIR #2>/dev/null; true # Run executable cd $WORKSPACE -#test=LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 1024 128 10 -#echo $test -LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file $WORKSPACE/test_${GPU_VERSION}_${MG_PROC}_${MG_SUBPROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/$ + +# Display the devices +#LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10 + +# Add MG Libs to linker library path and run the executable +LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file $WORKSPACE/test_${GPU_VERSION}_${MG_PROC}_${MG_SUBPROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH /param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations # View output #nano $WORKSPACE/test_${GPU_VERSION}_${MG_PROC}_${MG_SUBPROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json-+ \ No newline at end of file From a3caa0cedbe422d8d1172cee86ea85a991615b2b Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 24 Oct 2022 14:06:18 +0200 Subject: [PATCH 008/509] [CI] Removed testing using epoch1/2 with GitHub Actions --- .github/workflows/c-cpp.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index a053a6f731..80a6d6fd0c 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum ] + folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg ] fail-fast: false steps: - uses: actions/checkout@v2 @@ -21,7 +21,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum ] + folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg ] precision: [ d , f ] fail-fast: false steps: @@ -38,7 +38,7 @@ jobs: FC: gfortran-11 strategy: matrix: - folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum, epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum ] + folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum, epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg ] precision: [ d ] fail-fast: false steps: @@ -56,7 +56,7 @@ jobs: REQUIRE_CUDA: 1 strategy: matrix: - folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum ] + folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg ] precision: 
[ d , f ] fail-fast: false steps: @@ -38,7 +38,7 @@ FC: gfortran-11 strategy: matrix: - folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum, epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum ] + folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum, epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg ] precision: [ d ] fail-fast: false steps: @@ -56,7 +56,7 @@ REQUIRE_CUDA: 1 strategy: matrix: - folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum ] + folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg ] precision: [ d , f ] fail-fast: false steps: From be11cee67f9f44cc018ccb37c683b8ab8508884a Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 24 Oct 2022 14:21:51 +0200 Subject: [PATCH 009/509] Added simple arguments to performanceProfiler.py so you can choose abstraction layer and git branch --- tools/profiling/performanceProfiler.py | 33 ++++++++++++++++++-------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py index c37a3a0549..e32fcceab0 100644 --- a/tools/profiling/performanceProfiler.py +++ b/tools/profiling/performanceProfiler.py @@ -1,15 +1,28 @@ import sys +import os import subprocess import datetime +import argparse + +# Parser arguments defaults +absLayer = "SYCL" +branch = "br_golden_epochX4" mgProcesses = ["ee_mumu", "gg_tt", "gg_ttg", "gg_ttgg", "gg_ttggg"] + +doublePrecisionConstant = 2560 iterations = 10 threadsPerBlock = [32, 64, 128, 256] blocksPerGrid = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] +# Parser parser = argparse.ArgumentParser(description='A program for profiling GPUs using MadGraph.') +
+parser.add_argument("-l", help="Choose which abstraction layer you want to use (CUDA/SYCL).", default=absLayer) +parser.add_argument("-b", help="Choose which branch the madgraph4gpu repo is in.", default=branch) +
+args = parser.parse_args() +
# How many runs in total the program made count = 0 @@ -18,19 +31,19 @@ for BPG in blocksPerGrid: if (TPB * BPG > doublePrecisionConstant): - if absLayer.upper() == 'SYCL': + if args.l.upper() == 'SYCL': args = ["./buildMadGraphFile.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)] - elif absLayer.upper() == 'CUDA': + elif args.l.upper() == 'CUDA': - ### Used in br_golden_epochX4 branch - #if ".sa" not in process: - # process = process + ".sa" - ### + # Used in br_golden_epochX4 branch + if args.b == 'br_golden_epochX4': + if ".sa" not in process: + process = process + ".sa" args = ["./buildCUDAFile.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)] - else: sys.exit("No abstraction layer selected") + else: sys.exit("No abstraction layer matching the supplied string!") From 13bbb7602d1983b879ec255c7bbe057a1043e51e Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 24 Oct 2022 14:50:36 +0200 Subject: [PATCH 010/509] Reverted the workflow back to how it looked in the main repo, to test --- .github/workflows/c-cpp.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 80a6d6fd0c..651b5ef646 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg ] + folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum ] fail-fast: false steps: - uses: actions/checkout@v2 @@ -21,7 +21,7 @@ runs-on: ubuntu-latest strategy: matrix: - folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum ,
epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg ] + folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum ] precision: [ d , f ] fail-fast: false steps: @@ -38,7 +38,7 @@ jobs: FC: gfortran-11 strategy: matrix: - folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum, epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg ] + folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum, epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum ] precision: [ d ] fail-fast: false steps: @@ -56,7 +56,7 @@ jobs: REQUIRE_CUDA: 1 strategy: matrix: - folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg ] + folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum ] precision: [ d , f ] fail-fast: false steps: @@ -66,4 +66,4 @@ jobs: - name: make run: source /opt/rh/gcc-toolset-10/enable; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} - name: make check - run: source /opt/rh/gcc-toolset-10/enable; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check + run: source /opt/rh/gcc-toolset-10/enable; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check \ No newline at end of file From e9a75b371e95b355c07ef6783b8937e75fdefea1 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 24 Oct 2022 15:22:41 +0200 Subject: [PATCH 011/509] Add make cleanall first to try to fix error --- .github/workflows/c-cpp.yml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 651b5ef646..0aacb600aa 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -14,8 +14,11 @@ jobs: folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum ] fail-fast: false steps: + - uses: actions/checkout@v2 - - name: make epoch1 + - name: make cleanall + run: make cleanall + - name: make epochX/1/2 run: make -C ${{ matrix.folder }} debug CPU: runs-on: ubuntu-latest @@ -26,6 +29,8 @@ jobs: fail-fast: false steps: - uses: actions/checkout@v2 + - name: make cleanall + run: make cleanall - name: make info run: make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info - name: make @@ -43,6 +48,8 @@ jobs: fail-fast: false steps: - uses: actions/checkout@v2 + - name: make cleanall + run: make cleanall - name: make info run: make AVX=none OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info - name: make @@ -61,6 +68,8 @@ jobs: fail-fast: false steps: - uses: actions/checkout@v2 + - name: make cleanall + run: make cleanall - name: make info run: source /opt/rh/gcc-toolset-10/enable; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info - name: make From 7ea42ab3a40b20f4b5c48496f20c7f18f84ddcd9 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 24 Oct 2022 15:25:24 +0200 Subject: [PATCH 012/509] Make cleanall now gets run in correct directory --- 
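(Annotation, not part of the commit: make -C dir changes into dir before reading the makefile, so the change below runs the cleanall target inside each matrix.folder subprocess directory; the repository's top level presumably has no cleanall target, which would explain why the bare make cleanall from the previous commit failed.)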
.github/workflows/c-cpp.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 0aacb600aa..2322155c38 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -17,7 +17,7 @@ jobs: - uses: actions/checkout@v2 - name: make cleanall - run: make cleanall + run: make -C ${{ matrix.folder }} cleanall - name: make epochX/1/2 run: make -C ${{ matrix.folder }} debug CPU: @@ -30,7 +30,7 @@ jobs: steps: - uses: actions/checkout@v2 - name: make cleanall - run: make cleanall + run: make -C ${{ matrix.folder }} cleanall - name: make info run: make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info - name: make @@ -49,7 +49,7 @@ jobs: steps: - uses: actions/checkout@v2 - name: make cleanall - run: make cleanall + run: make -C ${{ matrix.folder }} cleanall - name: make info run: make AVX=none OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info - name: make @@ -69,7 +69,7 @@ jobs: steps: - uses: actions/checkout@v2 - name: make cleanall - run: make cleanall + run: make -C ${{ matrix.folder }} cleanall - name: make info run: source /opt/rh/gcc-toolset-10/enable; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info - name: make From 060d32cd27f76016cbee4bb49d8cca7f0c3a2ddd Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Wed, 26 Oct 2022 10:54:18 +0200 Subject: [PATCH 013/509] Added profiler workflow --- .github/workflows/profiler.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 .github/workflows/profiler.yml diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml new file mode 100644 index 0000000000..41d2eee9b5 --- /dev/null +++ b/.github/workflows/profiler.yml @@ -0,0 +1,17 @@ +name: Profiler + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + a100s: + uses: actions/checkout@v2 + + - name: Getting prerequisites + run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh + + - name: Runs performanceProfiler.py script + run: python3 performanceProfiler.py \ No newline at end of file From 02b975182682d54b60c472e621333f47e179ad23 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Wed, 26 Oct 2022 11:06:42 +0200 Subject: [PATCH 014/509] Added CUDA 11.6 in path in self-hosted CI --- .github/workflows/c-cpp.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 2322155c38..1a88a2b328 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -68,6 +68,8 @@ jobs: fail-fast: false steps: - uses: actions/checkout@v2 + - name: Add CUDA 11.6 to PATH + run: PATH=/usr/local/cuda-11.6/:$PATH - name: make cleanall run: make -C ${{ matrix.folder }} cleanall - name: make info From 3de0d3ba68d3a1f5aa346d3d3e7700ae47aa633e Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Wed, 26 Oct 2022 11:08:28 +0200 Subject: [PATCH 015/509] Added CUDA 11.6 to CUDA_HOME in self-hosted CI instead of path --- .github/workflows/c-cpp.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 1a88a2b328..bac79f0016 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -68,8 +68,8 @@ jobs: fail-fast: false steps: - uses: actions/checkout@v2 - - name: Add CUDA 11.6 to PATH - run: PATH=/usr/local/cuda-11.6/:$PATH + - name: Add CUDA 11.6 to CUDA_HOME + run: CUDA_HOME=/usr/local/cuda-11.6/ - name: make cleanall run: make -C 
${{ matrix.folder }} cleanall - name: make info From 8185b6a84e45af9d4de3a79450db27f67e447bd6 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Wed, 26 Oct 2022 11:09:26 +0200 Subject: [PATCH 016/509] Added export to CUDA_HOME assignment --- .github/workflows/c-cpp.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index bac79f0016..610a667641 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -69,7 +69,7 @@ jobs: steps: - uses: actions/checkout@v2 - name: Add CUDA 11.6 to CUDA_HOME - run: CUDA_HOME=/usr/local/cuda-11.6/ + run: export CUDA_HOME=/usr/local/cuda-11.6/ - name: make cleanall run: make -C ${{ matrix.folder }} cleanall - name: make info From 5bfaca8057da0a48558c758ab10b95625b778eb5 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Wed, 26 Oct 2022 11:51:48 +0200 Subject: [PATCH 017/509] Added CUDA_HOME as a sub-command in make steps instead and removed runs with epoch1/2 --- .github/workflows/c-cpp.yml | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 610a667641..b9d636967f 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum ] + folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg ] fail-fast: false steps: @@ -24,7 +24,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum ] + folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg ] precision: [ d , f ] fail-fast: false steps: @@ -43,7 +43,7 @@ jobs: FC: gfortran-11 strategy: matrix: - folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum, epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum ] + folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum, epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg ] precision: [ d ] fail-fast: false steps: @@ -63,18 +63,16 @@ jobs: REQUIRE_CUDA: 1 strategy: matrix: - folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum ] + folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg ] precision: [ d , f ] fail-fast: false steps: - uses: actions/checkout@v2 - - name: Add CUDA 11.6 to CUDA_HOME - run: export CUDA_HOME=/usr/local/cuda-11.6/ - name: make cleanall run: make -C ${{ matrix.folder }} cleanall - name: make info - run: source /opt/rh/gcc-toolset-10/enable; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info + run: source /opt/rh/gcc-toolset-10/enable; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision 
}} -C ${{ matrix.folder }} info - name: make run: source /opt/rh/gcc-toolset-10/enable; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} - name: make check run: source /opt/rh/gcc-toolset-10/enable; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check \ No newline at end of file From 98cbeaf1faa6cd5ca6b51d569bc3795abc57c641 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Wed, 26 Oct 2022 11:57:07 +0200 Subject: [PATCH 018/509] Added the specific GPU as a label on the GPU job --- .github/workflows/c-cpp.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index b9d636967f..8f5bef5449 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -57,7 +57,7 @@ - name: make check run: make AVX=none OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check GPU: - runs-on: self-hosted + runs-on: [self-hosted, linux, a100s] env: FC: gfortran REQUIRE_CUDA: 1 From dec2a60913134c66bf91ff0a77f52882c74eee4e Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Wed, 26 Oct 2022 12:15:55 +0200 Subject: [PATCH 019/509] Fixed some issues with profiler workflow --- .github/workflows/profiler.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml index 41d2eee9b5..cfceefeb46 100644 --- a/.github/workflows/profiler.yml +++ b/.github/workflows/profiler.yml @@ -7,11 +7,11 @@ on: branches: [ master ] jobs: - a100s_Profiling: + name: A100S Profiling runs-on: [self-hosted, linux, a100s] - uses: actions/checkout@v2 + steps: + uses: actions/checkout@v2 - - name: Runs performanceProfiler.py script - run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py + - name: Runs performanceProfiler.py script + run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py \ No newline at end of file From 4f770ef9a23790eb63c89d447a26f71f71eba82b Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Wed, 26 Oct 2022 12:30:56 +0200 Subject: [PATCH 020/509] Testing different name on job in Profiler workflow --- .github/workflows/profiler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml index cfceefeb46..54ec011985 100644 --- a/.github/workflows/profiler.yml +++ b/.github/workflows/profiler.yml @@ -7,8 +7,8 @@ jobs: - a100s_Profiling: - - name: A100S Profiling
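# (Annotation, not part of the patch: with self-hosted runners, every label in
# the runs-on list below must match a label registered on the runner; 'a100s'
# is the label that routes this job to the A100 machine.)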
    runs-on: [self-hosted, linux, a100s]
    uses: actions/checkout@v2

From a47d90b0e1173fc9eae237dbcf6c0b0bae1a11b7 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Wed, 26 Oct 2022 12:37:24 +0200
Subject: [PATCH 022/509] Added steps to profiler job

---
 .github/workflows/profiler.yml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index 5c4ed2074f..7017920e7f 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -11,7 +11,8 @@ jobs:
 
   - name: A100S Profiling
     runs-on: [self-hosted, linux, a100s]
-    uses: actions/checkout@v2
+    steps:
+      uses: actions/checkout@v2
 
-  - name: Runs performanceProfiler.py script
-    run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py
\ No newline at end of file
+  - name: Runs performanceProfiler.py script
+    run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py
\ No newline at end of file

From cb1153d4d037aeedfbf45d8d5a972bb987a4213a Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Wed, 26 Oct 2022 12:39:27 +0200
Subject: [PATCH 023/509] Added more dashes in profiler workflow

---
 .github/workflows/profiler.yml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index 7017920e7f..dae9631ead 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -8,11 +8,9 @@ on:
 
 jobs:
   a100s_Profiling:
-
-  - name: A100S Profiling
+    name: A100S Profiling
     runs-on: [self-hosted, linux, a100s]
     steps:
-      uses: actions/checkout@v2
-
+      - uses: actions/checkout@v2
       - name: Runs performanceProfiler.py script
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py
\ No newline at end of file

From ad464f241ebdc5da64b2ecb71ad74edf8558bbca Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Wed, 26 Oct 2022 14:43:11 +0200
Subject: [PATCH 024/509] Removed actions/checkoutv2 line

---
 .github/workflows/profiler.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index dae9631ead..7c7f70c582 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -11,6 +11,5 @@ jobs:
     name: A100S Profiling
     runs-on: [self-hosted, linux, a100s]
     steps:
-      - uses: actions/checkout@v2
       - name: Runs performanceProfiler.py script
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py
\ No newline at end of file

From c0e0ea2ec3feffa30b91cd2fd5376ff62dcd6a10 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 27 Oct 2022 12:57:09 +0200
Subject: [PATCH 025/509] Reworked some functionality to make it better work
 with CI

---
 tools/profiling/buildCUDAProcess.sh | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index 6739b0f3f0..2bdb53f705 100644
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -36,8 +36,8 @@ fi
 
 # Set user specific variables
 
-prefix=/afs/cern.ch/work/j/jteig
-export CUDA_HOME=/usr/local/cuda-11.5/
+prefix=$(pwd)
+export CUDA_HOME=/usr/local/cuda-11.6/
 export FC=`which gfortran`
 
 # Set up compiler and compile options
@@ -46,7 +46,7 @@ export USEBUILDDIR=1
 export NTPBMAX=1024
 export MG_EXE="./gcheck.exe"
 export WORKSPACE=$prefix/workspace_mg4gpu
-export NAME_PREFIX="cudacpp_v100s_cuda_11.5"
+export NAME_PREFIX="cudacpp_v100s_cuda_11.6"
 
 ##################################################################
@@ -68,9 +68,6 @@ esac
 
 export MG_PROC_DIR=$prefix/madgraph4gpu/epochX/cudacpp/$MG_PROC
 export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/$MG_SUBPROC
-export MG5AMC_CARD_PATH=$MG_PROC_DIR/Cards
-
-mkdir -p $MG_SP_DIR/perf/data
 
 # Build executable
@@ -79,9 +76,4 @@ make
 
 # Run executable
 
-$MG_EXE -j $blocksPerGrid $threadsPerBlock $iterations
-
-# Move JSON report to workspace
-
-cd $MG_SP_DIR/pref/data/
-mv 0-perf-test-run0.json ${WORKSPACE}/test_${NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json
\ No newline at end of file
+$MG_EXE -j --json_file ${WORKSPACE}/test_${NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json $blocksPerGrid $threadsPerBlock $iterations
\ No newline at end of file

From 73d17b68a313e140ad34b4de136ae3acea79e2b6 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 27 Oct 2022 12:57:26 +0200
Subject: [PATCH 026/509] Reworked some functionality to make it better work
 with CI

---
 tools/profiling/buildSYCLProcess.sh | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index 08343c37ce..be48ab1b27 100644
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -32,12 +32,11 @@ fi
 # Begin script in case all parameters are correct
 
 # Set user/SYCL-flags variables
-prefix=/p/project/prpb109
-export DPCPP_HOME=/p/project/prpb109/sycl_workspace
-export CUDA_PATH=/p/software/juwelsbooster/stages/2022/software/CUDA/11.5
-# export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -target-backend '--cuda-gpu-arch=sm_80' -fgpu-rdc --cuda-path=$CUDA_PATH"
-export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=sm_80' -fgpu-rdc --cuda-path=$CUDA_PATH"
-export GPU_VERSION="sycl_v100_cuda_11.5_gcc_10.3"
+prefix=$(pwd)
+#export DPCPP_HOME=/p/project/prpb109/sycl_workspace
+export CUDA_PATH=/usr/local/cuda-11.6
+export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=sm_70' -fgpu-rdc --cuda-path=$CUDA_PATH"
+export NAME_PREFIX="sycl_v100_cuda11.6_gcc11.3"
 export WORKSPACE=$prefix/workspace_mg4gpu
 
 # Finds correct subprocess
@@ -54,7 +53,7 @@ export DEVICE_ID=0 #if unknown set at the run step after running LD_LIBRARY_PATH
 # Set up compiler and compile options
 export USEBUILDDIR=1
 export NTPBMAX=1024
-export CXX=$DPCPP_HOME/llvm/build/bin/clang++
+export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/compiler/2022.2.0/linux/bin/dpcpp
 
 mkdir -p $WORKSPACE/mg4gpu/lib
 mkdir -p $WORKSPACE/mg4gpu/bin
@@ -65,10 +64,11 @@ export MG4GPU_BIN=$WORKSPACE/mg4gpu/bin
 export MG_PROC_DIR=$prefix/madgraph4gpu/epochX/sycl/$MG_PROC
 export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/$MG_SUBPROC
 
-export MG_LIBS_DIR="${MG4GPU_LIB}/build_${MG_PROC}_${MG_SUBPROC}_${GPU_VERSION}"
-export MG_LIBS="$DPCPP_HOME/llvm/build/lib:$MG_LIBS_DIR"
+export MG_LIBS_DIR="${MG4GPU_LIB}/build_${MG_PROC}_${MG_SUBPROC}_${NAME_PREFIX}"
+# export MG_LIBS="$DPCPP_HOME/llvm/build/lib:$MG_LIBS_DIR"
+export MG_LIBS=$MG_LIBS_DIR
 
-export MG_EXE_DIR="${MG4GPU_BIN}/build_${MG_PROC}_${MG_SUBPROC}_${GPU_VERSION}"
+export MG_EXE_DIR="${MG4GPU_BIN}/build_${MG_PROC}_${MG_SUBPROC}_${NAME_PREFIX}"
 export MG_EXE="$MG_EXE_DIR/check.exe"
 
 export MG5AMC_CARD_PATH=$MG_PROC_DIR/Cards
@@ -85,7 +85,7 @@ cd $WORKSPACE
 #LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10
 
 # Add MG Libs to linker library path and run the executable
-LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file $WORKSPACE/test_${GPU_VERSION}_${MG_PROC}_${MG_SUBPROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file $WORKSPACE/test_${NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations
 
 # View output
-#nano $WORKSPACE/test_${GPU_VERSION}_${MG_PROC}_${MG_SUBPROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json-+
\ No newline at end of file
+#nano $WORKSPACE/test_${NAME_PREFIX}_${MG_PROC}_${MG_SUBPROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json-+
\ No newline at end of file

From 57b0681961c01fb76e630e457ce058040262f32a Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 27 Oct 2022 12:57:56 +0200
Subject: [PATCH 027/509] Reworked some functionality to make it better work
 with CI

---
 tools/profiling/performanceProfiler.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py
index e32fcceab0..7e71cb905b 100644
--- a/tools/profiling/performanceProfiler.py
+++ b/tools/profiling/performanceProfiler.py
@@ -18,8 +18,8 @@
 # Parser
 parser = argparse.ArgumentParser(description='A program for profiling GPUs using MadGraph.')
 
-parser.add_argument("-l", help="Choose which abstraction layer you want to use (CUDA/SYCL).", default=absLayer)
-parser.add_argument("-b", help="Choose which branch the madgraph4gpu repo is in.", default=branch)
+parser.add_argument("-l", "--layer", help="Choose which abstraction layer you want to use (CUDA/SYCL).", default=absLayer)
+parser.add_argument("-b", "--branch", help="Choose which branch the madgraph4gpu repo is in.", default=branch)
 
 args = parser.parse_args()
@@ -32,7 +32,7 @@
         if (TPB * BPG > doublePrecisionConstant):
             if args.l.upper() == 'SYCL':
-                args = ["./buildMadGraphFile.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
+                args = ["./buildSYCLProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
 
             elif args.l.upper() == 'CUDA':
@@ -41,7 +41,7 @@
                 if ".sa" not in process:
                     process = process + ".sa"
 
-                args = ["./buildCUDAFile.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
+                args = ["./buildCUDAProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
 
             else:
                 sys.exit("No abstraction layer matching the supplied string!")
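The switch to long options in [PATCH 027] has a side effect worth flagging: argparse derives the attribute name on the parsed namespace from the first long option, so once "--layer" exists the value is stored as args.layer, while the rest of the script still reads args.l. Presumably this is why the long options are dropped again in [PATCH 036] below. A minimal standalone sketch of the behaviour (not code from the repo):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--layer", default="CUDA")  # dest becomes "layer"
    args = parser.parse_args([])

    print(args.layer)  # "CUDA"
    print(args.l)      # AttributeError: 'Namespace' object has no attribute 'l'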
From f48b1397f470adcd272179ec0e9936b8d1cdae7c Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 27 Oct 2022 12:59:44 +0200
Subject: [PATCH 028/509] Changed workflow so it is using oneAPI from cvmfs

---
 .github/workflows/profiler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index 7c7f70c582..f39b6ab235 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -12,4 +12,4 @@ jobs:
     runs-on: [self-hosted, linux, a100s]
     steps:
       - name: Runs performanceProfiler.py script
-        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py
\ No newline at end of file
+        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 performanceProfiler.py
\ No newline at end of file

From ba47e3c083fe908a25c02d1c2d183da2923faebb Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 27 Oct 2022 13:01:50 +0200
Subject: [PATCH 029/509] Added correct path to profiler script and added
 arguments for correct execution

---
 .github/workflows/profiler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index f39b6ab235..cbb16028cc 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -12,4 +12,4 @@ jobs:
     runs-on: [self-hosted, linux, a100s]
     steps:
       - name: Runs performanceProfiler.py script
-        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 performanceProfiler.py
\ No newline at end of file
+        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py --layer CUDA --branch master
\ No newline at end of file

From 368103f28db83de63c701f15a6e334018bb8a535 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 27 Oct 2022 14:48:20 +0200
Subject: [PATCH 030/509] Changed path to the profiler script in the workflow

---
 .github/workflows/profiler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index cbb16028cc..f18f43ab92 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -12,4 +12,4 @@ jobs:
     runs-on: [self-hosted, linux, a100s]
     steps:
       - name: Runs performanceProfiler.py script
-        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py --layer CUDA --branch master
\ No newline at end of file
+        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 madgraph4gpu/tools/profiling/performanceProfiler.py --layer CUDA --branch master
\ No newline at end of file

From 2bfc1ff03593983a0ef156c7bc8d659c30e7a998 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 27 Oct 2022 15:12:23 +0200
Subject: [PATCH 031/509] Added checkout action to get repository

---
 .github/workflows/profiler.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index f18f43ab92..0feaab49fa 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -11,5 +11,6 @@ jobs:
     name: A100S Profiling
     runs-on: [self-hosted, linux, a100s]
     steps:
-      - name: Runs performanceProfiler.py script
-        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 madgraph4gpu/tools/profiling/performanceProfiler.py --layer CUDA --branch master
\ No newline at end of file
+      - uses: actions/checkout@v2
+      - name: Runs performanceProfiler.py script
+        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py --layer CUDA --branch master
\ No newline at end of file

From aaef0184c29aed9f7c52e1bf13839d4167c4d511 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 27 Oct 2022 15:23:38 +0200
Subject: [PATCH 032/509] Removed whitespace to fix error

---
 .github/workflows/profiler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index 0feaab49fa..9efab8e39e 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -13,4 +13,4 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - name: Runs performanceProfiler.py script
-        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py --layer CUDA --branch master
\ No newline at end of file
+        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py --layer CUDA --branch master
\ No newline at end of file

From 1a322cf7df9ab62087c33db78eefe23caae1691a Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 27 Oct 2022 15:26:33 +0200
Subject: [PATCH 033/509] Added str to abstraction layer variables

---
 tools/profiling/performanceProfiler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py
index 7e71cb905b..c624399bb9 100644
--- a/tools/profiling/performanceProfiler.py
+++ b/tools/profiling/performanceProfiler.py
@@ -31,10 +31,10 @@
     for BPG in blocksPerGrid:
         if (TPB * BPG > doublePrecisionConstant):
-            if args.l.upper() == 'SYCL':
+            if str(args.l).upper() == 'SYCL':
                 args = ["./buildSYCLProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
 
-            elif args.l.upper() == 'CUDA':
+            elif str(args.l).upper() == 'CUDA':
 
                 # Used in br_golden_epochX4 branch
                 if args.b == 'br_golden_epochX4':

From e7120e025e587f3f34dce5bee4771868b195e16d Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 27 Oct 2022 15:37:24 +0200
Subject: [PATCH 034/509] Added arguments to script as strings

---
 .github/workflows/profiler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index 9efab8e39e..0f9e2aa2d8 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -13,4 +13,4 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - name: Runs performanceProfiler.py script
-        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py --layer CUDA --branch master
\ No newline at end of file
+        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py --layer 'CUDA' --branch 'master'
\ No newline at end of file

From 21cf8edd46c57276a2cf69c37d4bbfb6afaca257 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 27 Oct 2022 15:39:27 +0200
Subject: [PATCH 035/509] Changed arguments to the script in the workflow

---
 .github/workflows/profiler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index 0f9e2aa2d8..bad2bc75be 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -13,4 +13,4 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - name: Runs performanceProfiler.py script
-        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py --layer 'CUDA' --branch 'master'
\ No newline at end of file
+        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file

From f49dbf07172f44cda000054b66bb0e19303d6872 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 27 Oct 2022 15:45:48 +0200
Subject: [PATCH 036/509] Altered arguments to make script work

---
 tools/profiling/performanceProfiler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py
index c624399bb9..7136d9dbe3 100644
--- a/tools/profiling/performanceProfiler.py
+++ b/tools/profiling/performanceProfiler.py
@@ -18,8 +18,8 @@
 # Parser
 parser = argparse.ArgumentParser(description='A program for profiling GPUs using MadGraph.')
 
-parser.add_argument("-l", "--layer", help="Choose which abstraction layer you want to use (CUDA/SYCL).", default=absLayer)
-parser.add_argument("-b", "--branch", help="Choose which branch the madgraph4gpu repo is in.", default=branch)
+parser.add_argument("-l", help="Choose which abstraction layer you want to use (CUDA/SYCL).", default=absLayer)
+parser.add_argument("-b", help="Choose which branch the madgraph4gpu repo is in.", default=branch)
 
 args = parser.parse_args()

From 5d9e9cf7f07421c3849a29308b816bd2fe1fc15d Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 27 Oct 2022 15:54:48 +0200
Subject: [PATCH 037/509] Added +x permissions to make the scripts executable

---
 .github/workflows/profiler.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index bad2bc75be..823b4ceb09 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -12,5 +12,7 @@ jobs:
     runs-on: [self-hosted, linux, a100s]
     steps:
       - uses: actions/checkout@v2
+      - name: Change permissions on files
+        run: chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
       - name: Runs performanceProfiler.py script
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file

From 46267bf751c3aa22ed9da119b71cb3d0128b69b0 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 27 Oct 2022 15:57:22 +0200
Subject: [PATCH 038/509] Added actual path to the executables

---
 .github/workflows/profiler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index 823b4ceb09..cf23498928 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -13,6 +13,6 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - name: Change permissions on files
-        run: chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
+        run: chmod +x tools/profiling/buildCUDAProcess.sh tools/profiling/buildSYCLProcess.sh
       - name: Runs performanceProfiler.py script
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file

From d8ccbc431155381a2be57cb21dd0151db391e1f0 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 27 Oct 2022 15:59:39 +0200
Subject: [PATCH 039/509] Added step to change the working directory to
 profiling

---
 .github/workflows/profiler.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index cf23498928..94915f724d 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -12,7 +12,9 @@ jobs:
     runs-on: [self-hosted, linux, a100s]
     steps:
       - uses: actions/checkout@v2
+      - name: Change working directory to profiling
+        run: cd tools/profiling/
       - name: Change permissions on files
-        run: chmod +x tools/profiling/buildCUDAProcess.sh tools/profiling/buildSYCLProcess.sh
+        run: chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
       - name: Runs performanceProfiler.py script
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file

From 807470b58d3916fe8af25b391e369e5c8c1e50c4 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 27 Oct 2022 16:02:48 +0200
Subject: [PATCH 040/509] Test to see if wd carries over steps

---
 .github/workflows/profiler.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index 94915f724d..d18bb931db 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -13,8 +13,8 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - name: Change working directory to profiling
-        run: cd tools/profiling/
+        run: cd tools/profiling/; pwd
       - name: Change permissions on files
-        run: chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
+        run: chmod +x buildCUDAProcess.sh buildSYCLProcess.sh; pwd
       - name: Runs performanceProfiler.py script
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file

From f9dd060f2a1dccc2fdfee92f3fc1e135297c2409 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 27 Oct 2022 16:07:22 +0200
Subject: [PATCH 041/509] Changed position of pwd command

---
 .github/workflows/profiler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index d18bb931db..910306426a 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -15,6 +15,6 @@ jobs:
       - name: Change working directory to profiling
         run: cd tools/profiling/; pwd
       - name: Change permissions on files
-        run: chmod +x buildCUDAProcess.sh buildSYCLProcess.sh; pwd
+        run: pwd; chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
       - name: Runs performanceProfiler.py script
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file

From a21180351a660993e795e16483183e57d03d35b8 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 27 Oct 2022 16:10:03 +0200
Subject: [PATCH 042/509] Changes WD to correct folder on execution now

---
 .github/workflows/profiler.yml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index 910306426a..3d7e347cdd 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -12,9 +12,7 @@ jobs:
     runs-on: [self-hosted, linux, a100s]
     steps:
       - uses: actions/checkout@v2
-      - name: Change working directory to profiling
-        run: cd tools/profiling/; pwd
       - name: Change permissions on files
-        run: pwd; chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
+        run: cd tools/profiling/; chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
       - name: Runs performanceProfiler.py script
-        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file
+        run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file

From c12eb872eee2063a2a510c058b4a5b2f7cce83f0 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 27 Oct 2022 16:12:29 +0200
Subject: [PATCH 043/509] Removed reference to subfolder when running script

---
 .github/workflows/profiler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index 3d7e347cdd..9c086df761 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -15,4 +15,4 @@ jobs:
       - name: Change permissions on files
         run: cd tools/profiling/; chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
       - name: Runs performanceProfiler.py script
-        run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file
+        run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file

From 9cb871911d2ecc95a08a1f13508e969303e981fb Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 10:18:09 +0200
Subject: [PATCH 044/509] Removed CD in running the performance profiler step

---
 .github/workflows/profiler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index 9c086df761..a011f06633 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -15,4 +15,4 @@ jobs:
       - name: Change permissions on files
         run: cd tools/profiling/; chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
       - name: Runs performanceProfiler.py script
-        run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file
+        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file
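The cd churn in the last several patches comes down to two facts: every run: step in a GitHub Actions job starts a fresh shell in the workspace root, so a cd in one step does not carry over to the next, and the build scripts resolve all their paths from $(pwd). An alternative (a sketch only, not what the repo does) would be to pin the working directory inside performanceProfiler.py itself, so the workflow could invoke it from anywhere:

    import subprocess
    from pathlib import Path

    # directory containing this script, i.e. tools/profiling in the checkout
    profiling_dir = Path(__file__).resolve().parent

    # cwd= makes the call independent of where the workflow invokes python3
    subprocess.run(["./buildCUDAProcess.sh", "-n", "gg_ttgg.sa"], cwd=profiling_dir)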
From 372a7ccd86786c56b962df2ec7c530e4cc0f8511 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 10:21:28 +0200
Subject: [PATCH 045/509] Added full path to performance profiler scripts

---
 tools/profiling/performanceProfiler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py
index 7136d9dbe3..3daa4b1c62 100644
--- a/tools/profiling/performanceProfiler.py
+++ b/tools/profiling/performanceProfiler.py
@@ -32,7 +32,7 @@
         if (TPB * BPG > doublePrecisionConstant):
             if str(args.l).upper() == 'SYCL':
-                args = ["./buildSYCLProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
+                args = ["./tools/profiling/buildSYCLProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
 
             elif str(args.l).upper() == 'CUDA':
@@ -41,7 +41,7 @@
                 if ".sa" not in process:
                     process = process + ".sa"
 
-                args = ["./buildCUDAProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
+                args = ["./tools/profiling/buildCUDAProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
 
             else:
                 sys.exit("No abstraction layer matching the supplied string!")

From 954b7bc054728cbb53a8f17e922562ad4ffbc30d Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 10:27:56 +0200
Subject: [PATCH 046/509] Added some testing commands

---
 .github/workflows/profiler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index a011f06633..319643b6b5 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -15,4 +15,4 @@ jobs:
       - name: Change permissions on files
         run: cd tools/profiling/; chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
       - name: Runs performanceProfiler.py script
-        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file
+        run: cd; pwd; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file

From 6f03107e00236e2bd3b01974bf01c1f623c2356b Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 10:28:52 +0200
Subject: [PATCH 047/509] Changed some testing commands

---
 .github/workflows/profiler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index 319643b6b5..4397143ceb 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -15,4 +15,4 @@ jobs:
       - name: Change permissions on files
         run: cd tools/profiling/; chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
       - name: Runs performanceProfiler.py script
-        run: cd; pwd; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file
+        run: cd $GITHUB_WORKSPACE; pwd; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file

From af9fa15c98239bb36491ddad18302d1c995fd625 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 10:34:27 +0200
Subject: [PATCH 048/509] Removed the checkout action

---
 .github/workflows/profiler.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index 4397143ceb..230892b1a6 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -11,8 +11,7 @@ jobs:
     name: A100S Profiling
     runs-on: [self-hosted, linux, a100s]
     steps:
-      - uses: actions/checkout@v2
       - name: Change permissions on files
-        run: cd tools/profiling/; chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
+        run: pwd; cd madgraph4gpu/tools/profiling/; chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
       - name: Runs performanceProfiler.py script
         run: cd $GITHUB_WORKSPACE; pwd; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file

From 21b652fe6391262e2398127926e713c8bbc00721 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 10:50:45 +0200
Subject: [PATCH 049/509] Reworked workflow so it hopefully works

---
 .github/workflows/profiler.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index 230892b1a6..c35456f960 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -12,6 +12,6 @@ jobs:
     runs-on: [self-hosted, linux, a100s]
     steps:
       - name: Change permissions on files
-        run: pwd; cd madgraph4gpu/tools/profiling/; chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
+        run: cd tools/profiling/; chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
       - name: Runs performanceProfiler.py script
-        run: cd $GITHUB_WORKSPACE; pwd; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file
+        run: cd ../; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 madgraph4hpu/tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file

From cb6cdab75b90b508b6998ba723b25ab4ff0366e2 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 11:16:41 +0200
Subject: [PATCH 050/509] Added PWD

---
 .github/workflows/profiler.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index c35456f960..1a71db20a7 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -12,6 +12,6 @@ jobs:
     runs-on: [self-hosted, linux, a100s]
     steps:
       - name: Change permissions on files
-        run: cd tools/profiling/; chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
+        run: pwd; cd tools/profiling/; chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
       - name: Runs performanceProfiler.py script
-        run: cd ../; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 madgraph4hpu/tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file
+        run: pwd; cd ../; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 madgraph4hpu/tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file

From 7056793c8391ca4ccce5ec8fe93829e0f0b02756 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 13:58:27 +0200
Subject: [PATCH 051/509] Added checkout action

---
 .github/workflows/profiler.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index 1a71db20a7..5d661f7d44 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -11,6 +11,7 @@ jobs:
     name: A100S Profiling
     runs-on: [self-hosted, linux, a100s]
     steps:
+      - uses: actions/checkout@v2
       - name: Change permissions on files
         run: pwd; cd tools/profiling/; chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
       - name: Runs performanceProfiler.py script

From 75c503f402f566df754727f3113937b29d6a5f8f Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 14:16:51 +0200
Subject: [PATCH 052/509] Removed typo in workflow

---
 .github/workflows/profiler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index 5d661f7d44..535eae1a1b 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -15,4 +15,4 @@ jobs:
       - name: Change permissions on files
         run: pwd; cd tools/profiling/; chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
       - name: Runs performanceProfiler.py script
-        run: pwd; cd ../; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 madgraph4hpu/tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file
+        run: pwd; cd ../; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 madgraph4gpu/tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file

From da67d17ad098a96432a9bb89e20fe8b3ce46b763 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 14:18:11 +0200
Subject: [PATCH 053/509] Altered path to execution scripts

---
 tools/profiling/performanceProfiler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py
index 3daa4b1c62..23ccdc4c82 100644
--- a/tools/profiling/performanceProfiler.py
+++ b/tools/profiling/performanceProfiler.py
@@ -32,7 +32,7 @@
         if (TPB * BPG > doublePrecisionConstant):
             if str(args.l).upper() == 'SYCL':
-                args = ["./tools/profiling/buildSYCLProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
+                args = ["./madgraph4gpu/tools/profiling/buildSYCLProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
 
             elif str(args.l).upper() == 'CUDA':
@@ -41,7 +41,7 @@
                 if ".sa" not in process:
                     process = process + ".sa"
 
-                args = ["./tools/profiling/buildCUDAProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
+                args = ["./madgraph4gpu/tools/profiling/buildCUDAProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
 
             else:
                 sys.exit("No abstraction layer matching the supplied string!")

From 0a5b2a918f40df3d7420cfc43f04f9ceae095427 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 14:33:29 +0200
Subject: [PATCH 054/509] Altered CUDA script to work better with CI and
 different directories across systems

---
 tools/profiling/buildCUDAProcess.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index 2bdb53f705..a41677480d 100644
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -36,11 +36,13 @@ fi
 
 # Set user specific variables
 
+# Assumes that this is run from profiling directory in the repo
 prefix=$(pwd)
+
 export CUDA_HOME=/usr/local/cuda-11.6/
 export FC=`which gfortran`
 
-# Set up compiler and compile options
+# Set up compiler and compile options and makes workspace
 
 export USEBUILDDIR=1
 export NTPBMAX=1024
@@ -64,9 +66,7 @@ case $MG_PROC in
     gg_ttggg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxggg" ;;
 esac
 
-# Makes workspace in correct folder
-
-export MG_PROC_DIR=$prefix/madgraph4gpu/epochX/cudacpp/$MG_PROC
+export MG_PROC_DIR=$prefix/../../epochX/cudacpp/$MG_PROC
 export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/$MG_SUBPROC
 
 # Build executable

From 2674b0fed6d476f288979958c72ada7b920f8e39 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 14:56:19 +0200
Subject: [PATCH 055/509] Fixed bug in profiler script and made it work better
 with CI

---
 tools/profiling/performanceProfiler.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py
index 23ccdc4c82..17ae848f6f 100644
--- a/tools/profiling/performanceProfiler.py
+++ b/tools/profiling/performanceProfiler.py
@@ -32,16 +32,16 @@
         if (TPB * BPG > doublePrecisionConstant):
             if str(args.l).upper() == 'SYCL':
-                args = ["./madgraph4gpu/tools/profiling/buildSYCLProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
+                args = ["./buildSYCLProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
 
             elif str(args.l).upper() == 'CUDA':
 
-                # Used in br_golden_epochX4 branch
-                if args.b == 'br_golden_epochX4':
+                # There is no .sa in br_golden_epochX4 so it makes sure that .sa is included in everything other than that branch
+                if args.b != 'br_golden_epochX4':
                     if ".sa" not in process:
                         process = process + ".sa"
 
-                args = ["./madgraph4gpu/tools/profiling/buildCUDAProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
+                args = ["./buildCUDAProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
 
             else:
                 sys.exit("No abstraction layer matching the supplied string!")
From f2b202ac985dedac39fa9ed549d3efba08953856 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 14:57:03 +0200
Subject: [PATCH 056/509] Altered profiler workflow to make it work better
 with CI

---
 .github/workflows/profiler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index 535eae1a1b..d689a71859 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -15,4 +15,4 @@ jobs:
       - name: Change permissions on files
         run: pwd; cd tools/profiling/; chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
       - name: Runs performanceProfiler.py script
-        run: pwd; cd ../; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 madgraph4gpu/tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file
+        run: pwd; cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file

From 49d13526a57b6e66ea4f15426cea084b4623a794 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 15:01:48 +0200
Subject: [PATCH 057/509] Removed STR to make it work better with CI

---
 tools/profiling/performanceProfiler.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py
index 17ae848f6f..4dd8104637 100644
--- a/tools/profiling/performanceProfiler.py
+++ b/tools/profiling/performanceProfiler.py
@@ -1,3 +1,4 @@
+import string
 import sys
 import os
 import subprocess
@@ -19,8 +19,8 @@
 # Parser
 parser = argparse.ArgumentParser(description='A program for profiling GPUs using MadGraph.')
 
-parser.add_argument("-l", help="Choose which abstraction layer you want to use (CUDA/SYCL).", default=absLayer)
-parser.add_argument("-b", help="Choose which branch the madgraph4gpu repo is in.", default=branch)
+parser.add_argument("-l", help="Choose which abstraction layer you want to use (CUDA/SYCL).", default=absLayer, type=string)
+parser.add_argument("-b", help="Choose which branch the madgraph4gpu repo is in.", default=branch, type=string)
 
 args = parser.parse_args()
@@ -32,10 +32,10 @@
     for BPG in blocksPerGrid:
         if (TPB * BPG > doublePrecisionConstant):
-            if str(args.l).upper() == 'SYCL':
+            if args.l.upper() == 'SYCL':
                 args = ["./buildSYCLProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
 
-            elif str(args.l).upper() == 'CUDA':
+            elif args.l.upper() == 'CUDA':
 
                 # There is no .sa in br_golden_epochX4 so it makes sure that .sa is included in everything other than that branch
                 if args.b != 'br_golden_epochX4':

From 07063e4aad458f9e38cc5fe8bc32504d136d602e Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 15:04:22 +0200
Subject: [PATCH 058/509] Removed default type to try to make it work better
 with CI

---
 tools/profiling/performanceProfiler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py
index 4dd8104637..441b54bb41 100644
--- a/tools/profiling/performanceProfiler.py
+++ b/tools/profiling/performanceProfiler.py
@@ -19,8 +19,8 @@
 # Parser
 parser = argparse.ArgumentParser(description='A program for profiling GPUs using MadGraph.')
 
-parser.add_argument("-l", help="Choose which abstraction layer you want to use (CUDA/SYCL).", default=absLayer, type=string)
-parser.add_argument("-b", help="Choose which branch the madgraph4gpu repo is in.", default=branch, type=string)
+parser.add_argument("-l", help="Choose which abstraction layer you want to use (CUDA/SYCL).", default=absLayer)
+parser.add_argument("-b", help="Choose which branch the madgraph4gpu repo is in.", default=branch)
 
 args = parser.parse_args()

From 430a8ff80fa928bea3f2eb87051d0a9b74da9387 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 15:11:13 +0200
Subject: [PATCH 059/509] Separated the python and bash args into separate
 variables

---
 tools/profiling/performanceProfiler.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py
index 441b54bb41..e56964cec8 100644
--- a/tools/profiling/performanceProfiler.py
+++ b/tools/profiling/performanceProfiler.py
@@ -22,7 +22,7 @@
 parser.add_argument("-l", help="Choose which abstraction layer you want to use (CUDA/SYCL).", default=absLayer)
 parser.add_argument("-b", help="Choose which branch the madgraph4gpu repo is in.", default=branch)
 
-args = parser.parse_args()
+pyArgs = parser.parse_args()
 
 # How many runs in total the program made
 count = 0
@@ -32,23 +32,23 @@
     for BPG in blocksPerGrid:
         if (TPB * BPG > doublePrecisionConstant):
-            if args.l.upper() == 'SYCL':
-                args = ["./buildSYCLProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
+            if pyArgs.l.upper() == 'SYCL':
+                bashArgs = ["./buildSYCLProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
 
-            elif args.l.upper() == 'CUDA':
+            elif pyArgs.l.upper() == 'CUDA':
 
                 # There is no .sa in br_golden_epochX4 so it makes sure that .sa is included in everything other than that branch
-                if args.b != 'br_golden_epochX4':
+                if pyArgs.b != 'br_golden_epochX4':
                     if ".sa" not in process:
                         process = process + ".sa"
 
-                args = ["./buildCUDAProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
+                bashArgs = ["./buildCUDAProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)]
 
             else:
                 sys.exit("No abstraction layer matching the supplied string!")
 
             print(str(datetime.datetime.now().strftime("%H:%M:%S")) + " Started " + process + " with TPB("+ str(TPB) +") * BPG("+ str(BPG) +"): " + str(TPB * BPG) + "!")
 
-            build = subprocess.run(args, stdout=subprocess.DEVNULL)
+            build = subprocess.run(bashArgs, stdout=subprocess.DEVNULL)
 
             if build.returncode != 0:
                 print(str(datetime.datetime.now().strftime("%H:%M:%S")) + " " + process + " FAILED!, threadsPerBlock: " + str(TPB) + ", blocksPerGrid: " + str(BPG) + ", Product: " + str(TPB * BPG))
             else:
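The renaming in [PATCH 059] fixes a real bug rather than style: the loop body rebound the single name args, first to the argparse namespace and then to the subprocess argument list, so from the second (TPB, BPG) combination onwards args.l was looked up on a plain list. A minimal reproduction of that failure mode (standalone sketch, not code from the repo):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-l", default="CUDA")
    args = parser.parse_args([])

    for i in range(2):
        layer = args.l.upper()                 # fine on iteration 0 only
        args = ["./build.sh", "-n", "gg_tt"]   # rebinds args to a list
    # iteration 1 raises: AttributeError: 'list' object has no attribute 'l'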
From 034cd27e04925cd9ac70b01b580794f36727a108 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 15:26:24 +0200
Subject: [PATCH 060/509] Removed line that hides subprocess output

---
 tools/profiling/performanceProfiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py
index e56964cec8..1b0f2a1a15 100644
--- a/tools/profiling/performanceProfiler.py
+++ b/tools/profiling/performanceProfiler.py
@@ -48,7 +48,7 @@
 
             print(str(datetime.datetime.now().strftime("%H:%M:%S")) + " Started " + process + " with TPB("+ str(TPB) +") * BPG("+ str(BPG) +"): " + str(TPB * BPG) + "!")
 
-            build = subprocess.run(bashArgs, stdout=subprocess.DEVNULL)
+            build = subprocess.run(bashArgs)#, stdout=subprocess.DEVNULL)
 
             if build.returncode != 0:
                 print(str(datetime.datetime.now().strftime("%H:%M:%S")) + " " + process + " FAILED!, threadsPerBlock: " + str(TPB) + ", blocksPerGrid: " + str(BPG) + ", Product: " + str(TPB * BPG))
             else:

From 94d0b18da8791f269a81bae8b6fbb8652b69c27b Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 15:29:32 +0200
Subject: [PATCH 061/509] Added PWD for debugging

---
 tools/profiling/buildCUDAProcess.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index a41677480d..b7dd7500bf 100644
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -38,6 +38,7 @@ fi
 # Assumes that this is run from profiling directory in the repo
 prefix=$(pwd)
+pwd
 
 export CUDA_HOME=/usr/local/cuda-11.6/
 export FC=`which gfortran`
@@ -67,11 +68,13 @@ case $MG_PROC in
 esac
 
 export MG_PROC_DIR=$prefix/../../epochX/cudacpp/$MG_PROC
+echo $MG_PROC_DIR
 export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/$MG_SUBPROC
 
 # Build executable
 
 cd $MG_SP_DIR
+pwd
 make
 
 # Run executable

From 8f3b87e91d705d5a18a4f2a28f68082a579d6918 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 15:35:15 +0200
Subject: [PATCH 062/509] Added more cases to handle .sa folders

---
 tools/profiling/buildCUDAProcess.sh | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index b7dd7500bf..af2a253618 100644
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -38,7 +38,6 @@ fi
 # Assumes that this is run from profiling directory in the repo
 prefix=$(pwd)
-pwd
 
 export CUDA_HOME=/usr/local/cuda-11.6/
 export FC=`which gfortran`
@@ -61,20 +60,23 @@ export PATH=$CUDA_HOME:$PATH
 
 case $MG_PROC in
     ee_mumu ) export MG_SUBPROC="P1_Sigma_sm_epem_mupmum" ;;
+    ee_mumu.sa ) export MG_SUBPROC="P1_Sigma_sm_epem_mupmum" ;;
     gg_tt ) export MG_SUBPROC="P1_Sigma_sm_gg_ttx" ;;
+    gg_tt.sa ) export MG_SUBPROC="P1_Sigma_sm_gg_ttx" ;;
    gg_ttg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxg" ;;
+    gg_ttg.sa ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxg" ;;
     gg_ttgg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxgg" ;;
+    gg_ttgg.sa ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxgg" ;;
     gg_ttggg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxggg" ;;
+    gg_ttggg.sa ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxggg" ;;
 esac
 
 export MG_PROC_DIR=$prefix/../../epochX/cudacpp/$MG_PROC
-echo $MG_PROC_DIR
 export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/$MG_SUBPROC
 
 # Build executable
 
 cd $MG_SP_DIR
-pwd
 make
 
 # Run executable
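Doubling every case arm in [PATCH 062] works, but the mapping only depends on the base process name, so the optional .sa suffix could be stripped once before matching; in the script itself that would be a single case ${MG_PROC%.sa} in, using bash suffix removal. Expressed as a sketch in Python (illustration only, not code from the repo):

    SUBPROC = {
        "ee_mumu":  "P1_Sigma_sm_epem_mupmum",
        "gg_tt":    "P1_Sigma_sm_gg_ttx",
        "gg_ttg":   "P1_Sigma_sm_gg_ttxg",
        "gg_ttgg":  "P1_Sigma_sm_gg_ttxgg",
        "gg_ttggg": "P1_Sigma_sm_gg_ttxggg",
    }

    def subprocess_dir(proc: str) -> str:
        base = proc[:-3] if proc.endswith(".sa") else proc  # drop optional ".sa"
        return SUBPROC[base]

    assert subprocess_dir("gg_ttgg.sa") == "P1_Sigma_sm_gg_ttxgg"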
From 4b728bdc91bb8f83a281b80d066199569d332bed Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 15:48:06 +0200
Subject: [PATCH 063/509] CD to correct folder for execution

---
 tools/profiling/buildCUDAProcess.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index af2a253618..7e67ba34fc 100644
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -81,4 +81,5 @@ make
 
 # Run executable
 
+cd build*
 $MG_EXE -j --json_file ${WORKSPACE}/test_${NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json $blocksPerGrid $threadsPerBlock $iterations
\ No newline at end of file

From b1da6ef094dedbcf5a28a5d9c42d61bcc003c98d Mon Sep 17 00:00:00 2001
From: Jorgen Teig
Date: Fri, 28 Oct 2022 16:18:39 +0200
Subject: [PATCH 064/509] Added +x to execution scripts

---
 tools/profiling/buildCUDAProcess.sh | 0
 tools/profiling/buildSYCLProcess.sh | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 tools/profiling/buildCUDAProcess.sh
 mode change 100644 => 100755 tools/profiling/buildSYCLProcess.sh

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
old mode 100644
new mode 100755
diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
old mode 100644
new mode 100755

From 86ac9d191b1a5a88d2fbb498f6003824c801b866 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 16:32:00 +0200
Subject: [PATCH 065/509] Added variable to card path

---
 tools/profiling/buildCUDAProcess.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index 7e67ba34fc..71abb0a781 100644
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -73,6 +73,7 @@ esac
 
 export MG_PROC_DIR=$prefix/../../epochX/cudacpp/$MG_PROC
 export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/$MG_SUBPROC
+export MG5AMC_CARD_PATH=$MG_PROC_DIR/Cards
 
 # Build executable

From cb868fbb7afb02bf9800d939931cf3e5abbdf9b2 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 28 Oct 2022 16:41:18 +0200
Subject: [PATCH 066/509] Added way to transport json file generated to
 workspace

---
 tools/profiling/buildCUDAProcess.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index 71abb0a781..591d5e214e 100755
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -83,4 +83,8 @@ make
 # Run executable
 
 cd build*
-$MG_EXE -j --json_file ${WORKSPACE}/test_${NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json $blocksPerGrid $threadsPerBlock $iterations
\ No newline at end of file
+$MG_EXE -j $blocksPerGrid $threadsPerBlock $iterations
+
+mkdir -p $MG_SP_DIR/pref/data/
+cd $MG_SP_DIR/pref/data/
+mv 0-perf-test-run0.json ${WORKSPACE}/test_${NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json
\ No newline at end of file
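With [PATCH 066] the report again travels under a name that encodes the whole run configuration, test_${NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json, which is what the downstream evaluation keys on. Note also the pref/data spelling here: the executable writes its report under perf/data, which a later patch in this series corrects. Since both the prefix and the process name can themselves contain underscores, the numeric fields are safest to recover from the right; a hypothetical parsing helper, for illustration only:

    def parse_report_name(filename: str):
        # "test_<prefix>_<process>_<blocksPerGrid>_<threadsPerBlock>_<iterations>.json"
        stem = filename[:-len(".json")]
        head, bpg, tpb, iters = stem.rsplit("_", 3)
        return head, int(bpg), int(tpb), int(iters)

    head, bpg, tpb, iters = parse_report_name(
        "test_cudacpp_v100s_cuda_11.6_gg_ttgg.sa_1024_128_10.json")
    # head still bundles prefix and process, since both may contain underscores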
From 8933662399a5e932da2de71c015d7fdec89dc03f Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Mon, 31 Oct 2022 09:54:04 +0100
Subject: [PATCH 067/509] Removed step changing file permissions

---
 .github/workflows/profiler.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index d689a71859..bad2bc75be 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -12,7 +12,5 @@ jobs:
     runs-on: [self-hosted, linux, a100s]
     steps:
       - uses: actions/checkout@v2
-      - name: Change permissions on files
-        run: pwd; cd tools/profiling/; chmod +x buildCUDAProcess.sh buildSYCLProcess.sh
       - name: Runs performanceProfiler.py script
-        run: pwd; cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file
+        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file

From 855c505c6719afc53ddc5ab384ac7416767331f8 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Mon, 31 Oct 2022 10:13:19 +0100
Subject: [PATCH 068/509] Changed working directory in workflow

---
 .github/workflows/profiler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index bad2bc75be..f94479e165 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -13,4 +13,4 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - name: Runs performanceProfiler.py script
-        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 tools/profiling/performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file
+        run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master'
\ No newline at end of file

From 5de385a27049e0c4b94f9a7fc88a47d29ccd335f Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Mon, 31 Oct 2022 11:10:59 +0100
Subject: [PATCH 069/509] Changed position of mkdir so json-file gets created
 in the right spot

---
 tools/profiling/buildCUDAProcess.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index 591d5e214e..2fa049786c 100755
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -83,8 +83,8 @@ make
 # Run executable
 
 cd build*
+mkdir -p pref/data/
 $MG_EXE -j $blocksPerGrid $threadsPerBlock $iterations
 
-mkdir -p $MG_SP_DIR/pref/data/
-cd $MG_SP_DIR/pref/data/
+cd pref/data/
 mv 0-perf-test-run0.json ${WORKSPACE}/test_${NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json
\ No newline at end of file

From cb5029864328ca89cc5ef34017186748af259c19 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Mon, 31 Oct 2022 11:25:56 +0100
Subject: [PATCH 070/509] Added a mkdir for the workspace

---
 tools/profiling/buildCUDAProcess.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index 2fa049786c..79c31e6eb7 100755
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -50,6 +50,8 @@ export MG_EXE="./gcheck.exe"
 export WORKSPACE=$prefix/workspace_mg4gpu
 export NAME_PREFIX="cudacpp_v100s_cuda_11.6"
 
+mkdir WORKSPACE
+
 ##################################################################

From 122b9e96c7f84ec14a20266e5a82249a21112d78 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Mon, 31 Oct 2022 11:49:52 +0100
Subject: [PATCH 071/509] Fixed typos

---
 tools/profiling/buildCUDAProcess.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index 79c31e6eb7..f69a606a05 100755
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -50,7 +50,7 @@ export MG_EXE="./gcheck.exe"
 export WORKSPACE=$prefix/workspace_mg4gpu
 export NAME_PREFIX="cudacpp_v100s_cuda_11.6"
 
-mkdir WORKSPACE
+mkdir $WORKSPACE
 
 ##################################################################
@@ -85,8 +85,8 @@ make
 # Run executable
 
 cd build*
-mkdir -p pref/data/
+mkdir -p perf/data/
 $MG_EXE -j $blocksPerGrid $threadsPerBlock $iterations
 
-cd pref/data/
+cd perf/data/
 mv 0-perf-test-run0.json ${WORKSPACE}/test_${NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json
\ No newline at end of file

From 3653a983097e90938117c35eb88faffb35013232 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Mon, 31 Oct 2022 14:08:31 +0100
Subject: [PATCH 072/509] Added all the different physics processes and
 changed GCC version to 11

---
 .github/workflows/c-cpp.yml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml
index 8f5bef5449..391c32ba23 100644
--- a/.github/workflows/c-cpp.yml
+++ b/.github/workflows/c-cpp.yml
@@ -11,20 +11,20 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg ]
+        folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
       fail-fast: false
     steps:
 
       - uses: actions/checkout@v2
       - name: make cleanall
         run: make -C ${{ matrix.folder }} cleanall
-      - name: make epochX/1/2
+      - name: make epochX
         run: make -C ${{ matrix.folder }} debug
   CPU:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg ]
+        folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
         precision: [ d , f ]
       fail-fast: false
     steps:
@@ -43,7 +43,7 @@ jobs:
       FC: gfortran-11
     strategy:
       matrix:
-        folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum, epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg ]
+        folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
         precision: [ d ]
       fail-fast: false
     steps:
@@ -63,7 +63,7 @@ jobs:
       REQUIRE_CUDA: 1
     strategy:
       matrix:
-        folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg ]
+        folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
         precision: [ d , f ]
       fail-fast: false
     steps:
@@ -71,8 +71,8 @@ jobs:
       - name: make cleanall
         run: make -C ${{ matrix.folder }} cleanall
       - name: make info
-        run: source /opt/rh/gcc-toolset-10/enable; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
+        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
       - name: make
-        run: source /opt/rh/gcc-toolset-10/enable; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
+        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
       - name: make check
-        run: source /opt/rh/gcc-toolset-10/enable; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check
\ No newline at end of file
+        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check
\ No newline at end of file

From 50e647fbb3cecda344046e70d7dc405b1ffa6a79 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Mon, 31 Oct 2022 14:14:04 +0100
Subject: [PATCH 073/509] Added SYCL CI

---
 .github/workflows/sycl.yml | 78 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 .github/workflows/sycl.yml

diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml
new file mode 100644
index 0000000000..977c1165ef
--- /dev/null
+++ b/.github/workflows/sycl.yml
@@ -0,0 +1,78 @@
+name: SYCL CI
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  debug_builds:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        folder: [ epochX/sycl/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/sycl/gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/sycl/gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/sycl/gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/sycl/gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
+      fail-fast: false
+    steps:
+
+      - uses: actions/checkout@v2
+      - name: make cleanall
+        run: make -C ${{ matrix.folder }} cleanall
+      - name: make epochX
+        run: make -C ${{ matrix.folder }} debug
+  CPU:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        folder: [ epochX/sycl/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/sycl/gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/sycl/gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/sycl/gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/sycl/gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
+        precision: [ d , f ]
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v2
+      - name: make cleanall
+        run: make -C ${{ matrix.folder }} cleanall
+      - name: make info
+        run: make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
+      - name: make
+        run: make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
+      - name: make check
+        run: make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check
+  CPU_MAC:
+    runs-on: macos-latest
+    env:
+      FC: gfortran-11
+    strategy:
+      matrix:
+        folder: [ epochX/sycl/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/sycl/gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/sycl/gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/sycl/gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/sycl/gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
+        precision: [ d ]
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v2
+      - name: make cleanall
+        run: make -C ${{ matrix.folder }} cleanall
+      - name: make info
+        run: make AVX=none OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
+      - name: make
+        run: make AVX=none OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
+      - name: make check
+        run: make AVX=none OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check
+  GPU:
+    runs-on: [self-hosted, linux, a100s]
+    env:
+      FC: gfortran
+      REQUIRE_CUDA: 1
+    strategy:
+      matrix:
+        folder: [ epochX/sycl/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/sycl/gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/sycl/gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/sycl/gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/sycl/gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
+        precision: [ d , f ]
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v2
+      - name: make cleanall
+        run: make -C ${{ matrix.folder }} cleanall
+      - name: make info
+        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
+      - name: make
+        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
+      - name: make check
+        run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check
\ No newline at end of file

From 5e220ef6852538e15d23ce0f61a1c09278972b83 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Mon, 31 Oct 2022 14:24:27 +0100
Subject: [PATCH 074/509] Added SYCLFLAGS to env

---
 .github/workflows/sycl.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml
index 977c1165ef..aee54656dd 100644
--- a/.github/workflows/sycl.yml
+++ b/.github/workflows/sycl.yml
@@ -9,6 +9,8 @@ on:
 jobs:
   debug_builds:
     runs-on: ubuntu-latest
+    env:
+      SYCLFLAGS: -fsycl
     strategy:
       matrix:
         folder: [ epochX/sycl/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/sycl/gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/sycl/gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/sycl/gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/sycl/gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
@@ -22,6 +24,8 @@ jobs:
         run: make -C ${{ matrix.folder }} debug
   CPU:
     runs-on: ubuntu-latest
+    env:
+      SYCLFLAGS: -fsycl
     strategy:
       matrix:
         folder: [ epochX/sycl/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/sycl/gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/sycl/gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/sycl/gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/sycl/gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
@@ -41,6 +45,7 @@ jobs:
     runs-on: macos-latest
     env:
       FC: gfortran-11
+      SYCLFLAGS: -fsycl
     strategy:
       matrix:
         folder: [ epochX/sycl/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/sycl/gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/sycl/gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/sycl/gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/sycl/gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
@@ -61,6 +66,7 @@ jobs:
     env:
       FC: gfortran
       REQUIRE_CUDA: 1
+      SYCLFLAGS: -fsycl
     strategy:
       matrix:
         folder: [ epochX/sycl/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/sycl/gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/sycl/gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/sycl/gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/sycl/gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg ]

From a464c1d180cf3bec948e5399bb3820cf50b6ef16 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 17 Nov 2022 13:58:28 +0100
Subject: [PATCH 075/509] Removed 32 and 64 TPB to speed up profiling

---
 tools/profiling/performanceProfiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py
index 1b0f2a1a15..f3b74fbd75 100644
--- a/tools/profiling/performanceProfiler.py
+++ b/tools/profiling/performanceProfiler.py
@@ -13,7 +13,7 @@
 doublePrecisionConstant = 2560
 iterations = 10
-threadsPerBlock = [32, 64, 128, 256]
+threadsPerBlock = [128, 256]
 blocksPerGrid = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384]
 
 # Parser

From fdd5f85e0bb831a8d4a9fa85ab7ede756b50732e Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 17 Nov 2022 13:59:51 +0100
Subject: [PATCH 076/509] Changed indent length in script

---
 tools/profiling/buildSYCLProcess.sh | 32 ++++++++++++------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index be48ab1b27..9c848d57bc 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -2,31 +2,31 @@
 
 helpFunction()
 {
-    echo ""
-    echo "Usage: $0 -n gg_ttgg -b 1024 -t 128 -i 10"
-    echo -e "\t-n Name of the physics process being built and run"
-    echo -e
"\t-b Blocks per grid" - echo -e "\t-t Threads per block" - echo -e "\t-i Iterations" - exit 1 # Exit script after printing help + echo "" + echo "Usage: $0 -n gg_ttgg -b 1024 -t 128 -i 10" + echo -e "\t-n Name of the physics process being built and run" + echo -e "\t-b Blocks per grid" + echo -e "\t-t Threads per block" + echo -e "\t-i Iterations" + exit 1 # Exit script after printing help } while getopts "n:b:t:i:" opt do - case "$opt" in - n ) MG_PROC="$OPTARG" ;; #process to target - b ) blocksPerGrid="$OPTARG" ;; - t ) threadsPerBlock="$OPTARG" ;; - i ) iterations="$OPTARG" ;; - ? ) helpFunction ;; # Print helpFunction in case parameter is non-existent - esac + case "$opt" in + n ) MG_PROC="$OPTARG" ;; #process to target + b ) blocksPerGrid="$OPTARG" ;; + t ) threadsPerBlock="$OPTARG" ;; + i ) iterations="$OPTARG" ;; + ? ) helpFunction ;; # Print helpFunction in case parameter is non-existent + esac done # Print helpFunction in case parameters are empty if [ -z "${MG_PROC}" ] || [ -z "${blocksPerGrid}" ] || [ -z "${threadsPerBlock}" ] || [ -z "${iterations}" ] then - echo "Some or all of the parameters are empty"; - helpFunction + echo "Some or all of the parameters are empty"; + helpFunction fi # Begin script in case all parameters are correct From 6fa760428c04f0cac748aebd74b2e78e3bf0acaa Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Thu, 17 Nov 2022 14:51:29 +0100 Subject: [PATCH 077/509] Changed structure in build scripts to be clearer --- tools/profiling/buildCUDAProcess.sh | 20 ++++++-------------- tools/profiling/buildSYCLProcess.sh | 22 +++++++++++++--------- 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh index f69a606a05..4b0c5654cc 100755 --- a/tools/profiling/buildCUDAProcess.sh +++ b/tools/profiling/buildCUDAProcess.sh @@ -23,7 +23,6 @@ do done # Print helpFunction in case parameters are empty - if [ -z "${MG_PROC}" ] || [ -z "${blocksPerGrid}" ] || [ -z "${threadsPerBlock}" ] || [ -z "${iterations}" ] then echo "Some or all of the parameters are empty"; @@ -34,32 +33,25 @@ fi ################################################################## -# Set user specific variables +# Set variables for later use # Assumes that this is run from profiling directory in the repo prefix=$(pwd) -export CUDA_HOME=/usr/local/cuda-11.6/ -export FC=`which gfortran` - -# Set up compiler and compile options and makes workspace - export USEBUILDDIR=1 export NTPBMAX=1024 export MG_EXE="./gcheck.exe" +export CUDA_HOME=/usr/local/cuda-11.6/ +export FC=`which gfortran` export WORKSPACE=$prefix/workspace_mg4gpu -export NAME_PREFIX="cudacpp_v100s_cuda_11.6" - -mkdir $WORKSPACE - -################################################################## +export NAME_PREFIX="cudacpp_v100s_cuda_11.6.2_gcc_11.3" # Sets CUDA in PATH - export PATH=$CUDA_HOME:$PATH -# Finds correct subprocess +mkdir $WORKSPACE +# Finds correct subprocess case $MG_PROC in ee_mumu ) export MG_SUBPROC="P1_Sigma_sm_epem_mupmum" ;; ee_mumu.sa ) export MG_SUBPROC="P1_Sigma_sm_epem_mupmum" ;; diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 9c848d57bc..d30a99f334 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -31,13 +31,24 @@ fi # Begin script in case all parameters are correct -# Set user/SYCL-flags variables +################################################################## + +# Set variables for later use + +# Assumes that this is run from profiling 
directory in the repo prefix=$(pwd) + #export DPCPP_HOME=/p/project/prpb109/sycl_workspace +export USEBUILDDIR=1 +export NTPBMAX=1024 +export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/compiler/2022.2.0/linux/bin/dpcpp export CUDA_PATH=/usr/local/cuda-11.6 export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=sm_70' -fgpu-rdc --cuda-path=$CUDA_PATH" -export NAME_PREFIX="sycl_v100_cuda11.6_gcc11.3" export WORKSPACE=$prefix/workspace_mg4gpu +export NAME_PREFIX="sycl_v100s_cuda_11.6.2_gcc_11.3" + +# If unknown set at the run step after running LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 1024 128 10 +export DEVICE_ID=0 # Finds correct subprocess case $MG_PROC in @@ -48,13 +59,6 @@ case $MG_PROC in gg_ttggg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxggg" ;; esac -export DEVICE_ID=0 #if unknown set at the run step after running LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 1024 128 10 - -# Set up compiler and compile options -export USEBUILDDIR=1 -export NTPBMAX=1024 -export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/compiler/2022.2.0/linux/bin/dpcpp - mkdir -p $WORKSPACE/mg4gpu/lib mkdir -p $WORKSPACE/mg4gpu/bin From a50b48ad726e203e19bf565734f713c94f0f0f9e Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Fri, 18 Nov 2022 10:56:51 +0100 Subject: [PATCH 078/509] Fixed bug in SYCL build script and added a daily folder for JSON reports to scripts --- tools/profiling/buildCUDAProcess.sh | 6 +++++- tools/profiling/buildSYCLProcess.sh | 12 ++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh index 4b0c5654cc..a2028d6a6c 100755 --- a/tools/profiling/buildCUDAProcess.sh +++ b/tools/profiling/buildCUDAProcess.sh @@ -51,6 +51,10 @@ export PATH=$CUDA_HOME:$PATH mkdir $WORKSPACE +mkdir -p $WORKSPACE/$(date +"%y-%m-%d")_${NAME_PREFIX} + +export REPORT_FOLDER=$WORKSPACE/$(date +"%y-%m-%d")_${NAME_PREFIX} + # Finds correct subprocess case $MG_PROC in ee_mumu ) export MG_SUBPROC="P1_Sigma_sm_epem_mupmum" ;; @@ -81,4 +85,4 @@ mkdir -p perf/data/ $MG_EXE -j $blocksPerGrid $threadsPerBlock $iterations cd perf/data/ -mv 0-perf-test-run0.json ${WORKSPACE}/test_${NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json \ No newline at end of file +mv 0-perf-test-run0.json ${REPORT_FOLDER}/test_${NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json \ No newline at end of file diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index d30a99f334..b5f1034372 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -62,6 +62,10 @@ esac mkdir -p $WORKSPACE/mg4gpu/lib mkdir -p $WORKSPACE/mg4gpu/bin +mkdir -p $WORKSPACE/$(date +"%y-%m-%d")_${NAME_PREFIX} + +export REPORT_FOLDER=$WORKSPACE/$(date +"%y-%m-%d")_${NAME_PREFIX} + export MG4GPU_LIB=$WORKSPACE/mg4gpu/lib export MG4GPU_BIN=$WORKSPACE/mg4gpu/bin @@ -79,8 +83,8 @@ export MG5AMC_CARD_PATH=$MG_PROC_DIR/Cards # Build executable cd $MG_SP_DIR make -mv ../../lib/build.d_inl0/ $MG_LIBS_DIR #2>/dev/null; true -mv build.d_inl0/ $MG_EXE_DIR #2>/dev/null; true +cp ../../lib/build.d_inl0/ $MG_LIBS_DIR #2>/dev/null; true +cp build.d_inl0/ $MG_EXE_DIR #2>/dev/null; true # Run executable cd $WORKSPACE @@ -89,7 +93,7 @@ cd $WORKSPACE 
#LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10 # Add MG Libs to linker library path and run the executable -LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file $WORKSPACE/test_${NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH /param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations +LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file $REPORT_FOLDER/test_${NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations # View output -#nano $WORKSPACE/test_${NAME_PREFIX}_${MG_PROC}_${MG_SUBPROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json-+ \ No newline at end of file +#nano $REPORT_FOLDER/test_${NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json-+ \ No newline at end of file From 65ee72a088b06fe745fc42a1b06d6f15b86c4163 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 22 Nov 2022 17:04:17 +0100 Subject: [PATCH 079/509] Fixed issues with SYCL build script --- tools/profiling/buildSYCLProcess.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index b5f1034372..b7e662c7d3 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -41,7 +41,7 @@ prefix=$(pwd) #export DPCPP_HOME=/p/project/prpb109/sycl_workspace export USEBUILDDIR=1 export NTPBMAX=1024 -export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/compiler/2022.2.0/linux/bin/dpcpp +export CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++ export CUDA_PATH=/usr/local/cuda-11.6 export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=sm_70' -fgpu-rdc --cuda-path=$CUDA_PATH" export WORKSPACE=$prefix/workspace_mg4gpu @@ -62,7 +62,7 @@ esac mkdir -p $WORKSPACE/mg4gpu/lib mkdir -p $WORKSPACE/mg4gpu/bin -mkdir -p $WORKSPACE/$(date +"%y-%m-%d")_${NAME_PREFIX} +mkdir $WORKSPACE/$(date +"%y-%m-%d")_${NAME_PREFIX} export REPORT_FOLDER=$WORKSPACE/$(date +"%y-%m-%d")_${NAME_PREFIX} @@ -72,19 +72,19 @@ export MG4GPU_BIN=$WORKSPACE/mg4gpu/bin export MG_PROC_DIR=$prefix/madgraph4gpu/epochX/sycl/$MG_PROC export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/$MG_SUBPROC -export MG_LIBS_DIR="${MG4GPU_LIB}/build_${MG_PROC}_${MG_SUBPROC}_${NAME_PREFIX}" +export MG_LIBS_DIR="${MG4GPU_LIB}/build_${MG_PROC}_${NAME_PREFIX}" # export MG_LIBS="$DPCPP_HOME/llvm/build/lib:$MG_LIBS_DIR" export MG_LIBS=$MG_LIBS_DIR -export MG_EXE_DIR="${MG4GPU_BIN}/build_${MG_PROC}_${MG_SUBPROC}_${NAME_PREFIX}" +export MG_EXE_DIR="${MG4GPU_BIN}/build_${MG_PROC}_${NAME_PREFIX}" export MG_EXE="$MG_EXE_DIR/check.exe" export MG5AMC_CARD_PATH=$MG_PROC_DIR/Cards # Build executable cd $MG_SP_DIR make -cp ../../lib/build.d_inl0/ $MG_LIBS_DIR #2>/dev/null; true -cp build.d_inl0/ $MG_EXE_DIR #2>/dev/null; true +mv ../../lib/build.d_inl0/ $MG_LIBS_DIR #2>/dev/null; true +mv build.d_inl0/ $MG_EXE_DIR #2>/dev/null; true # Run executable cd $WORKSPACE From 084c26919b2dbd99113ceceb174ace1d3e02ecae Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 22 Nov 2022 17:15:37 +0100 Subject: [PATCH 080/509] Fixed bug in path to physics process --- tools/profiling/buildSYCLProcess.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index b7e662c7d3..2716400502 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -69,7 +69,7 @@ export REPORT_FOLDER=$WORKSPACE/$(date +"%y-%m-%d")_${NAME_PREFIX} export MG4GPU_LIB=$WORKSPACE/mg4gpu/lib export MG4GPU_BIN=$WORKSPACE/mg4gpu/bin -export MG_PROC_DIR=$prefix/madgraph4gpu/epochX/sycl/$MG_PROC +export MG_PROC_DIR=$prefix/../../epochX/sycl/$MG_PROC export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/$MG_SUBPROC export MG_LIBS_DIR="${MG4GPU_LIB}/build_${MG_PROC}_${NAME_PREFIX}" From 97b80f70a161cc8f2b136cbb2da035296f95c08b Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Thu, 24 Nov 2022 12:11:33 +0100 Subject: [PATCH 081/509] Changed variables for testing --- tools/profiling/performanceProfiler.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py index f3b74fbd75..f27d7985e0 100644 --- a/tools/profiling/performanceProfiler.py +++ b/tools/profiling/performanceProfiler.py @@ -7,14 +7,18 @@ # Parser arguments defaults absLayer = "SYCL" -branch = "br_golden_epochX4" +branch = "master" mgProcesses = ["ee_mumu", "gg_tt", "gg_ttg", "gg_ttgg", "gg_ttggg"] -doublePrecisionConstant = 2560 +#doublePrecisionConstant = 2560 +doublePrecisionConstant = 1 -iterations = 10 +#iterations = 10 +iterations = 1 -threadsPerBlock = [128, 256] +#threadsPerBlock = [128, 256] +threadsPerBlock = [32] -blocksPerGrid = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] +#blocksPerGrid = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] +blocksPerGrid = [32] # Parser parser = argparse.ArgumentParser(description='A program for profiling GPUs using MadGraph.') From 16f8c40bc2188654aff82711770b2383db825dfc Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Thu, 24 Nov 2022 15:16:48 +0100 Subject: [PATCH 082/509] Refactored build scripts and made minor improvements to working with CI --- tools/profiling/buildCUDAProcess.sh | 19 ++++++++++--------- tools/profiling/buildSYCLProcess.sh | 27 +++++++++++++++++---------- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh index a2028d6a6c..2b4d88fd12 100755 --- a/tools/profiling/buildCUDAProcess.sh +++ b/tools/profiling/buildCUDAProcess.sh @@ -40,20 +40,21 @@ prefix=$(pwd) export USEBUILDDIR=1 export NTPBMAX=1024 -export MG_EXE="./gcheck.exe" +export CXX=/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/bin/g++ +export MG_EXE="./gcheck.exe" #GPU +#export MG_EXE="./check.exe" #CPU export CUDA_HOME=/usr/local/cuda-11.6/ export FC=`which gfortran` export WORKSPACE=$prefix/workspace_mg4gpu -export NAME_PREFIX="cudacpp_v100s_cuda_11.6.2_gcc_11.3" +#export NAME_PREFIX="cudacpp_v100s_cuda_11.6.2_gcc_11.3" + +REPORT_FOLDER_PREFIX="${WORKSPACE}/$(date +"%y-%m-%d")_${CUDA_NAME_PREFIX}_master" # Sets CUDA in PATH export PATH=$CUDA_HOME:$PATH -mkdir $WORKSPACE - -mkdir -p $WORKSPACE/$(date +"%y-%m-%d")_${NAME_PREFIX} - -export REPORT_FOLDER=$WORKSPACE/$(date +"%y-%m-%d")_${NAME_PREFIX} +mkdir $WORKSPACE 2>/dev/null; true +mkdir $REPORT_FOLDER_PREFIX 2>/dev/null; true # Finds correct subprocess case $MG_PROC in @@ -81,8 +82,8 @@ make # Run executable cd build* -mkdir -p perf/data/ +mkdir -p perf/data/ 2>/dev/null; true $MG_EXE -j $blocksPerGrid $threadsPerBlock $iterations cd perf/data/ -mv 0-perf-test-run0.json
${REPORT_FOLDER}/test_${NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json \ No newline at end of file +mv 0-perf-test-run0.json ${REPORT_FOLDER}/test_${MG_PROC}_${CUDA_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json \ No newline at end of file diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 2716400502..fb8cccac15 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -45,7 +45,10 @@ export CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++ export CUDA_PATH=/usr/local/cuda-11.6 export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=sm_70' -fgpu-rdc --cuda-path=$CUDA_PATH" export WORKSPACE=$prefix/workspace_mg4gpu -export NAME_PREFIX="sycl_v100s_cuda_11.6.2_gcc_11.3" +#export NAME_PREFIX="sycl_v100s_cuda_11.6.2_gcc_11.3" +#export NAME_PREFIX="sycl_Xeon-Silver-4216_a100s_cuda-11.6.2_gcc-11.3" + +REPORT_FOLDER_PREFIX="${WORKSPACE}/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX}_master" # If unknown set at the run step after running LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 1024 128 10 export DEVICE_ID=0 @@ -53,18 +56,22 @@ export DEVICE_ID=0 # Finds correct subprocess case $MG_PROC in ee_mumu ) export MG_SUBPROC="P1_Sigma_sm_epem_mupmum" ;; + ee_mumu.sa ) export MG_SUBPROC="P1_Sigma_sm_epem_mupmum" ;; gg_tt ) export MG_SUBPROC="P1_Sigma_sm_gg_ttx" ;; + gg_tt.sa ) export MG_SUBPROC="P1_Sigma_sm_gg_ttx" ;; gg_ttg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxg" ;; + gg_ttg.sa ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxg" ;; gg_ttgg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxgg" ;; + gg_ttgg.sa ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxgg" ;; gg_ttggg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxggg" ;; + gg_ttggg.sa ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxggg" ;; esac -mkdir -p $WORKSPACE/mg4gpu/lib -mkdir -p $WORKSPACE/mg4gpu/bin - -mkdir $WORKSPACE/$(date +"%y-%m-%d")_${NAME_PREFIX} +mkdir -p $WORKSPACE/mg4gpu/lib 2>/dev/null; true +mkdir -p $WORKSPACE/mg4gpu/bin 2>/dev/null; true +mkdir $REPORT_FOLDER_PREFIX 2>/dev/null; true -export REPORT_FOLDER=$WORKSPACE/$(date +"%y-%m-%d")_${NAME_PREFIX} +export REPORT_FOLDER=$WORKSPACE/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX} export MG4GPU_LIB=$WORKSPACE/mg4gpu/lib export MG4GPU_BIN=$WORKSPACE/mg4gpu/bin @@ -72,11 +79,11 @@ export MG4GPU_BIN=$WORKSPACE/mg4gpu/bin export MG_PROC_DIR=$prefix/../../epochX/sycl/$MG_PROC export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/$MG_SUBPROC -export MG_LIBS_DIR="${MG4GPU_LIB}/build_${MG_PROC}_${NAME_PREFIX}" +export MG_LIBS_DIR="${MG4GPU_LIB}/build_${MG_PROC}_${SYCL_NAME_PREFIX}" # export MG_LIBS="$DPCPP_HOME/llvm/build/lib:$MG_LIBS_DIR" export MG_LIBS=$MG_LIBS_DIR -export MG_EXE_DIR="${MG4GPU_BIN}/build_${MG_PROC}_${NAME_PREFIX}" +export MG_EXE_DIR="${MG4GPU_BIN}/build_${MG_PROC}_${SYCL_NAME_PREFIX}" export MG_EXE="$MG_EXE_DIR/check.exe" export MG5AMC_CARD_PATH=$MG_PROC_DIR/Cards @@ -93,7 +100,7 @@ cd $WORKSPACE #LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10 # Add MG Libs to linker library path and run the executable -LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file $REPORT_FOLDER/test_${NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations 
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file ${REPORT_FOLDER}/test_${MG_PROC}_${SYCL_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations # View output -#nano $REPORT_FOLDER/test_${NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json-+ \ No newline at end of file +#nano $REPORT_FOLDER/test_${SYCL_NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json-+ \ No newline at end of file From fbdb0d0ea66ccc0efb17ad4a7b572ffe28f28e20 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Thu, 24 Nov 2022 15:19:26 +0100 Subject: [PATCH 083/509] Added script for sending JSON reports to database --- tools/profiling/sendData.py | 124 ++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 tools/profiling/sendData.py diff --git a/tools/profiling/sendData.py b/tools/profiling/sendData.py new file mode 100644 index 0000000000..eec84bbb6b --- /dev/null +++ b/tools/profiling/sendData.py @@ -0,0 +1,124 @@ +import os +import glob +import json +import re +import logging +import subprocess +import datetime +import argparse + +import sys + +# Parameter defaults +URL = 'https://dbod-madgraph4gpu-db.cern.ch:8082/api/v2/write?bucket=ProfilerData' +secret = os.getenv('MADGRAPH4GPU_DB_SECRET') +Auth = ['db_user', secret] +GPU = 'a100s' +physicsProcesses = ['ee_mumu', 'gg_ttggg', 'gg_ttgg', 'gg_ttg', 'gg_tt'] +absLayer = ['SYCL', 'CUDA', 'sycl', 'cuda'] +GCCVersion = '11.3' +CUDAVersion = '11.6.2' +fields = ['EvtsPerSec[MatrixElems] (3)', 'EvtsPerSec[MECalcOnly] (3)'] +reportPath = 'C:\\Users\\jteig\\cernbox\\Documents\\CERN\\reports\\CUDA_v100s_Profiling_16.09_GCC10.3_CUDA11.5_GOLDEN_EPOCHX4' + +# Argument parser +parser = argparse.ArgumentParser(description='A script for sending data from profiler to InfluxDB.') + +parser.add_argument('-r', '--reportPath', help="Path for the reports that is being put into the database.", default=reportPath) +parser.add_argument('-f', '--fields', help="Fields in the JSON to be put into the database.", default=fields) +parser.add_argument('-g', '--gpu', help="GPU used when profiling.", default=GPU) +parser.add_argument('--GCCVersion', help="GCC version used when profiling.", default=GCCVersion) +parser.add_argument('--CUDAVersion', help="CUDA version used when profiling.", default=CUDAVersion) +parser.add_argument('-a', '--absLayer', help="Abstraction layer used when profiling.", default=absLayer[0], choices=absLayer) +parser.add_argument('-p', '--profiler', help="Enable CI profiling defaults.", action=argparse.BooleanOptionalAction) + +args = parser.parse_args() + +os.environ['SYCL_NAME_PREFIX'] = 'sycl_Xeon-Silver-4216_a100s_gcc-11.3_cuda-11.6.2' +os.environ['CUDA_NAME_PREFIX'] = 'cudacpp_Xeon-Silver-4216_a100s_gcc-11.3_cuda-11.6.2' + +# +# Main +# +if __name__=='__main__': + + if args.profiler == True: + + if args.absLayer.upper() == "SYCL": + + syclNamePrefix = os.getenv('SYCL_NAME_PREFIX') + + if syclNamePrefix == None: + logging.error('Sycl name prefix has not been set!') + sys.exit(1) + + reportfolder= "workspace_mg4gpu/" + datetime.datetime.now().strftime('%y-%m-%d') + '_' + syclNamePrefix + + if not os.path.exists(reportfolder): + logging.error('SYCL report path does not exist!') + sys.exit(1) + + elif args.absLayer.upper() == "CUDA": + + cudaNamePrefix = os.getenv('CUDA_NAME_PREFIX') + if cudaNamePrefix == None: + logging.error('Cuda name prefix has not 
been set!') + sys.exit(1) + + reportfolder= "workspace_mg4gpu/" + datetime.datetime.now().strftime('%y-%m-%d') + '_' + cudaNamePrefix + + if not os.path.exists(reportfolder): + logging.error('CUDA report path does not exist!') + sys.exit(1) + + else: + logging.error('No abstraction layer that is supported has been selected!') + sys.exit(1) + + else: + reportfolder = args.reportPath + + filePath = [] + filePath.append(glob.glob(reportfolder + '/test_*.json')) + filePath.append(glob.glob(reportfolder + '/*/test_*.json')) + + # Flatten the list + files = [p for sublist in filePath for p in sublist] + + for file in files: + f = open(file, 'r') + + fileName = (os.path.basename(file)) + + for process in physicsProcesses: + if process in fileName: + physicsProcess = process + break + + f = f.read() + + if f != '': + data = json.loads(f) + + gridsize = data[0]["NumThreadsPerBlock"] * data[0]["NumBlocksPerGrid"] + + DBdata = f'{physicsProcess},GPU={args.gpu},AbstractionLayer={args.absLayer.upper()},GCCVersion={args.GCCVersion},CUDAVersion={args.CUDAVersion},NumThreadsPerBlock={data[0]["NumThreadsPerBlock"]},NumBlocksPerGrid={data[0]["NumBlocksPerGrid"]},NumIterations={data[0]["NumIterations"]} Gridsize={gridsize}' + + for field in fields: + value = float(re.findall(r'[\d.]+',data[0][field])[0]) + + DBdata = DBdata + ',' + args.absLayer + "_" + field.replace(" ", "_") + '=' + str(value) + + #DBdata = DBdata + ' 1668164400' + + requestInfo = ["curl", "-i", '-XPOST', "-i", URL, "--header", "Authorization: Token "+Auth[0]+":"+Auth[1], "--data-raw", DBdata] + + request = subprocess.run(requestInfo, stdout=subprocess.DEVNULL) + + if request.returncode != 0: + print(str(datetime.datetime.now().strftime("%H:%M:%S")) + " Request FAILED! Data: " + DBdata) + else: + print(str(datetime.datetime.now().strftime("%H:%M:%S")) + " Request COMPLETED! 
Data: " + DBdata) + + + else: logging.error('No information/fields in the JSON report!') \ No newline at end of file From f91455eca13cf719e52483488098a4ba18d93543 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Thu, 24 Nov 2022 15:47:31 +0100 Subject: [PATCH 084/509] Updated profiler workflow to work with CUDA and SYCL --- .github/workflows/profiler.yml | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml index f94479e165..2364b856ba 100644 --- a/.github/workflows/profiler.yml +++ b/.github/workflows/profiler.yml @@ -6,11 +6,25 @@ on: pull_request: branches: [ master ] +env: + NAME_PREFIX: sycl + jobs: a100s_Profiling: name: A100S Profiling + env: + SYCL_NAME_PREFIX: sycl_Xeon-Silver-4216_a100s_gcc-11.3_cuda-11.6.2 + CUDA_NAME_PREFIX: cudacpp_Xeon-Silver-4216_a100s_gcc-11.3_cuda-11.6.2 + + MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} runs-on: [self-hosted, linux, a100s] steps: - uses: actions/checkout@v2 - - name: Runs performanceProfiler.py script - run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2022/setvars.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master' \ No newline at end of file + - name: Runs SYCL performanceProfiler.py script + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master' + - name: Uploads SYCL JSON files to DB + run: ./tools/profiling/sendData.py --profiler --absLayer "SYCL" + - name: Runs CUDA performanceProfiler.py script + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master' + - name: Uploads CUDA JSON files to DB + run: ./tools/profiling/sendData.py --profiler --absLayer "CUDA" \ No newline at end of file From a07d8e2b0277ad8a849c0636e03de4b322a93e6c Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Thu, 24 Nov 2022 17:43:06 +0100 Subject: [PATCH 085/509] Added the correct GCC toolchain for compilation of SYCL in the makefiles in each process --- epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk | 2 +- epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk | 2 +- epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk | 2 +- epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk | 2 +- epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk b/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk index badb5bee57..f5b0fcf8d5 100644 --- a/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk @@ -42,7 +42,7 @@ INCFLAGS += -I$(TOOLSDIR) #=== Configure the C++ compiler -CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra +CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" CXXFLAGS+= -ffast-math # see issue #117 ifndef SYCLFLAGS $(error SYCLFLAGS not set) diff --git a/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk index badb5bee57..f5b0fcf8d5 100644 --- a/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk @@ -42,7 +42,7 @@ INCFLAGS += -I$(TOOLSDIR) #=== Configure the C++ compiler -CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra +CXXFLAGS = $(OPTFLAGS) 
-std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" CXXFLAGS+= -ffast-math # see issue #117 ifndef SYCLFLAGS $(error SYCLFLAGS not set) diff --git a/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk index badb5bee57..f5b0fcf8d5 100644 --- a/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk @@ -42,7 +42,7 @@ INCFLAGS += -I$(TOOLSDIR) #=== Configure the C++ compiler -CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra +CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" CXXFLAGS+= -ffast-math # see issue #117 ifndef SYCLFLAGS $(error SYCLFLAGS not set) diff --git a/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk index badb5bee57..f5b0fcf8d5 100644 --- a/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk @@ -42,7 +42,7 @@ INCFLAGS += -I$(TOOLSDIR) #=== Configure the C++ compiler -CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra +CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" CXXFLAGS+= -ffast-math # see issue #117 ifndef SYCLFLAGS $(error SYCLFLAGS not set) diff --git a/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk index badb5bee57..f5b0fcf8d5 100644 --- a/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk @@ -42,7 +42,7 @@ INCFLAGS += -I$(TOOLSDIR) #=== Configure the C++ compiler -CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra +CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" CXXFLAGS+= -ffast-math # see issue #117 ifndef SYCLFLAGS $(error SYCLFLAGS not set) From c2c0a95e45c1b0a0d61979fca35c59e170d9aa52 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Thu, 24 Nov 2022 17:53:21 +0100 Subject: [PATCH 086/509] Fix for new folder structure in repo --- tools/profiling/performanceProfiler.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py index f27d7985e0..9e23e2f8e4 100644 --- a/tools/profiling/performanceProfiler.py +++ b/tools/profiling/performanceProfiler.py @@ -37,14 +37,18 @@ if (TPB * BPG > doublePrecisionConstant): if pyArgs.l.upper() == 'SYCL': + + if ".sa" not in process: + process = process + ".sa" + bashArgs = ["./buildSYCLProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)] elif pyArgs.l.upper() == 'CUDA': # There is no .sa in br_golden_epochX4 so it makes sure that .sa is included in everything other than that branch - if pyArgs.b != 'br_golden_epochX4': - if ".sa" not in process: - process = process + ".sa" + #if pyArgs.b != 'br_golden_epochX4': + if ".sa" not in process: + process = process + ".sa" bashArgs = ["./buildCUDAProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)] From eaf7e6b014ea2a31737fda54977a5e29e7ceadff Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Fri, 25 Nov 2022 15:24:23 +0100 Subject: [PATCH 087/509] Fixed bug in SYCL build script --- tools/profiling/buildSYCLProcess.sh | 3 +-- 1 file changed, 1 insertion(+), 2 
deletions(-) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index fb8cccac15..7dba8b5001 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -80,8 +80,7 @@ export MG_PROC_DIR=$prefix/../../epochX/sycl/$MG_PROC export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/$MG_SUBPROC export MG_LIBS_DIR="${MG4GPU_LIB}/build_${MG_PROC}_${SYCL_NAME_PREFIX}" -# export MG_LIBS="$DPCPP_HOME/llvm/build/lib:$MG_LIBS_DIR" -export MG_LIBS=$MG_LIBS_DIR +export MG_LIBS="$DPCPP_HOME/llvm/build/lib:$MG_LIBS_DIR" export MG_EXE_DIR="${MG4GPU_BIN}/build_${MG_PROC}_${SYCL_NAME_PREFIX}" export MG_EXE="$MG_EXE_DIR/check.exe" From 801cab25b25855f7f97ecc585a55e00ed559086a Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Fri, 25 Nov 2022 15:49:12 +0100 Subject: [PATCH 088/509] Added correct DPCPP path --- tools/profiling/buildSYCLProcess.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 7dba8b5001..be28a0878b 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -39,6 +39,7 @@ fi prefix=$(pwd) #export DPCPP_HOME=/p/project/prpb109/sycl_workspace +export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace export USEBUILDDIR=1 export NTPBMAX=1024 export CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++ From 4dd6ba3f5d2df40604622cf90064c080a8625cc7 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Fri, 25 Nov 2022 15:56:39 +0100 Subject: [PATCH 089/509] Added testing command --- .github/workflows/profiler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml index 2364b856ba..94f27081a9 100644 --- a/.github/workflows/profiler.yml +++ b/.github/workflows/profiler.yml @@ -21,7 +21,7 @@ jobs: steps: - uses: actions/checkout@v2 - name: Runs SYCL performanceProfiler.py script - run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master' + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; g++ --version; python3 performanceProfiler.py -l 'SYCL' -b 'master' - name: Uploads SYCL JSON files to DB run: ./tools/profiling/sendData.py --profiler --absLayer "SYCL" - name: Runs CUDA performanceProfiler.py script From c7587be70dd8477865f73c9ac81f8c8a2f0cd4c9 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 28 Nov 2022 09:23:38 +0100 Subject: [PATCH 090/509] Changed position of GCC toolkit --- epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk | 4 ++-- epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk | 4 ++-- epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk | 4 ++-- epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk | 4 ++-- epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk | 5 ++--- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk b/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk index f5b0fcf8d5..ca69e5897b 100644 --- a/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk @@ -27,7 +27,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and SYCL INCFLAGS = -I. 
-OPTFLAGS = -O3 -march=native +OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -42,7 +42,7 @@ INCFLAGS += -I$(TOOLSDIR) #=== Configure the C++ compiler -CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" +CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra CXXFLAGS+= -ffast-math # see issue #117 ifndef SYCLFLAGS $(error SYCLFLAGS not set) diff --git a/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk index f5b0fcf8d5..ca69e5897b 100644 --- a/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk @@ -27,7 +27,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and SYCL INCFLAGS = -I. -OPTFLAGS = -O3 -march=native +OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -42,7 +42,7 @@ INCFLAGS += -I$(TOOLSDIR) #=== Configure the C++ compiler -CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" +CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra CXXFLAGS+= -ffast-math # see issue #117 ifndef SYCLFLAGS $(error SYCLFLAGS not set) diff --git a/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk index f5b0fcf8d5..ca69e5897b 100644 --- a/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk @@ -27,7 +27,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and SYCL INCFLAGS = -I. -OPTFLAGS = -O3 -march=native +OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -42,7 +42,7 @@ INCFLAGS += -I$(TOOLSDIR) #=== Configure the C++ compiler -CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" +CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra CXXFLAGS+= -ffast-math # see issue #117 ifndef SYCLFLAGS $(error SYCLFLAGS not set) diff --git a/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk index f5b0fcf8d5..ca69e5897b 100644 --- a/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk @@ -27,7 +27,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and SYCL INCFLAGS = -I. 
-OPTFLAGS = -O3 -march=native +OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -42,7 +42,7 @@ INCFLAGS += -I$(TOOLSDIR) #=== Configure the C++ compiler -CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" +CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra CXXFLAGS+= -ffast-math # see issue #117 ifndef SYCLFLAGS $(error SYCLFLAGS not set) diff --git a/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk index f5b0fcf8d5..f42f35bf5e 100644 --- a/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk @@ -27,7 +27,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and SYCL INCFLAGS = -I. -OPTFLAGS = -O3 -march=native +OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -42,8 +42,7 @@ INCFLAGS += -I$(TOOLSDIR) #=== Configure the C++ compiler -CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" -CXXFLAGS+= -ffast-math # see issue #117 +CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra ifndef SYCLFLAGS $(error SYCLFLAGS not set) endif From a56cbb7af2482b3878584b4de2c4538382c48e69 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 28 Nov 2022 11:03:19 +0100 Subject: [PATCH 091/509] Added correct GCC toolset in src Makefile --- epochX/sycl/ee_mumu.sa/src/sycl_src.mk | 2 +- epochX/sycl/gg_tt.sa/src/sycl_src.mk | 2 +- epochX/sycl/gg_ttg.sa/src/sycl_src.mk | 2 +- epochX/sycl/gg_ttgg.sa/src/sycl_src.mk | 2 +- epochX/sycl/gg_ttggg.sa/src/sycl_src.mk | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/epochX/sycl/ee_mumu.sa/src/sycl_src.mk b/epochX/sycl/ee_mumu.sa/src/sycl_src.mk index 504c2d4dd8..69d6b659d9 100644 --- a/epochX/sycl/ee_mumu.sa/src/sycl_src.mk +++ b/epochX/sycl/ee_mumu.sa/src/sycl_src.mk @@ -1,7 +1,7 @@ #=== Configure common compiler flags for SYCL build INCFLAGS = -I. -OPTFLAGS = -O3 -march=native # this ends up in +OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" # this ends up in #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_tt.sa/src/sycl_src.mk b/epochX/sycl/gg_tt.sa/src/sycl_src.mk index 504c2d4dd8..69d6b659d9 100644 --- a/epochX/sycl/gg_tt.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_tt.sa/src/sycl_src.mk @@ -1,7 +1,7 @@ #=== Configure common compiler flags for SYCL build INCFLAGS = -I. -OPTFLAGS = -O3 -march=native # this ends up in +OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" # this ends up in #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_ttg.sa/src/sycl_src.mk b/epochX/sycl/gg_ttg.sa/src/sycl_src.mk index 504c2d4dd8..69d6b659d9 100644 --- a/epochX/sycl/gg_ttg.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_ttg.sa/src/sycl_src.mk @@ -1,7 +1,7 @@ #=== Configure common compiler flags for SYCL build INCFLAGS = -I. 
-OPTFLAGS = -O3 -march=native # this ends up in +OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" # this ends up in #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk b/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk index 504c2d4dd8..69d6b659d9 100644 --- a/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk @@ -1,7 +1,7 @@ #=== Configure common compiler flags for SYCL build INCFLAGS = -I. -OPTFLAGS = -O3 -march=native # this ends up in +OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" # this ends up in #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk b/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk index 504c2d4dd8..69d6b659d9 100644 --- a/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk @@ -1,7 +1,7 @@ #=== Configure common compiler flags for SYCL build INCFLAGS = -I. -OPTFLAGS = -O3 -march=native # this ends up in +OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" # this ends up in #------------------------------------------------------------------------------- From 9d02a2c6ef285d7286f947a6973600c1b3959154 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 28 Nov 2022 11:14:20 +0100 Subject: [PATCH 092/509] Reverted changes to src Makefiles --- epochX/sycl/ee_mumu.sa/src/sycl_src.mk | 2 +- epochX/sycl/gg_tt.sa/src/sycl_src.mk | 2 +- epochX/sycl/gg_ttg.sa/src/sycl_src.mk | 2 +- epochX/sycl/gg_ttgg.sa/src/sycl_src.mk | 2 +- epochX/sycl/gg_ttggg.sa/src/sycl_src.mk | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/epochX/sycl/ee_mumu.sa/src/sycl_src.mk b/epochX/sycl/ee_mumu.sa/src/sycl_src.mk index 69d6b659d9..504c2d4dd8 100644 --- a/epochX/sycl/ee_mumu.sa/src/sycl_src.mk +++ b/epochX/sycl/ee_mumu.sa/src/sycl_src.mk @@ -1,7 +1,7 @@ #=== Configure common compiler flags for SYCL build INCFLAGS = -I. -OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" # this ends up in +OPTFLAGS = -O3 -march=native # this ends up in #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_tt.sa/src/sycl_src.mk b/epochX/sycl/gg_tt.sa/src/sycl_src.mk index 69d6b659d9..504c2d4dd8 100644 --- a/epochX/sycl/gg_tt.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_tt.sa/src/sycl_src.mk @@ -1,7 +1,7 @@ #=== Configure common compiler flags for SYCL build INCFLAGS = -I. -OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" # this ends up in +OPTFLAGS = -O3 -march=native # this ends up in #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_ttg.sa/src/sycl_src.mk b/epochX/sycl/gg_ttg.sa/src/sycl_src.mk index 69d6b659d9..504c2d4dd8 100644 --- a/epochX/sycl/gg_ttg.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_ttg.sa/src/sycl_src.mk @@ -1,7 +1,7 @@ #=== Configure common compiler flags for SYCL build INCFLAGS = -I. 
-OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" # this ends up in +OPTFLAGS = -O3 -march=native # this ends up in #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk b/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk index 69d6b659d9..504c2d4dd8 100644 --- a/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk @@ -1,7 +1,7 @@ #=== Configure common compiler flags for SYCL build INCFLAGS = -I. -OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" # this ends up in +OPTFLAGS = -O3 -march=native # this ends up in #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk b/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk index 69d6b659d9..504c2d4dd8 100644 --- a/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk @@ -1,7 +1,7 @@ #=== Configure common compiler flags for SYCL build INCFLAGS = -I. -OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" # this ends up in +OPTFLAGS = -O3 -march=native # this ends up in #------------------------------------------------------------------------------- From 10ac7336df6295de40afa68f7450cbaf894e42a6 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 28 Nov 2022 11:41:22 +0100 Subject: [PATCH 093/509] Adding CXXFLAGS to the compilation of fcheck in SYCL --- epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk b/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk index ca69e5897b..78f7e8a8b5 100644 --- a/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk @@ -255,7 +255,7 @@ $(BUILDDIR)/%.o : %.f *.inc $(fsycl_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fsycl_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so - $(CXX) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs + $(CXX) $(CXXFLAGS) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs #------------------------------------------------------------------------------- From b683fa823cf90c1480753d09c2a64c60997ca047 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 28 Nov 2022 11:52:36 +0100 Subject: [PATCH 094/509] Added option in Makefile to enable CI profiler to add correct GCC toolchain for CI use --- epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk | 8 +++++++- epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk | 8 +++++++- epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk | 8 +++++++- epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk | 8 +++++++- epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk | 8 +++++++- 5 files changed, 35 insertions(+), 5 deletions(-) diff --git a/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk b/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk index 78f7e8a8b5..395c679825 100644 --- a/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk @@ -27,7 +27,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and SYCL INCFLAGS 
= -I. -OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" +OPTFLAGS = -O3 -march=native # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -94,6 +94,12 @@ export NTPBMAX #=== Set the SYCL/C++ compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD, NTPBMAX +# Add option to enable CI profiler use +$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) +ifeq ($(ENABLE_CI_PROFILER),1) + CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) diff --git a/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk index ca69e5897b..7fbdae2c11 100644 --- a/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk @@ -27,7 +27,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and SYCL INCFLAGS = -I. -OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" +OPTFLAGS = -O3 -march=native # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -94,6 +94,12 @@ export NTPBMAX #=== Set the SYCL/C++ compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD, NTPBMAX +# Add option to enable CI profiler use +$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) +ifeq ($(ENABLE_CI_PROFILER),1) + CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) diff --git a/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk index ca69e5897b..7fbdae2c11 100644 --- a/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk @@ -27,7 +27,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and SYCL INCFLAGS = -I. -OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" +OPTFLAGS = -O3 -march=native # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -94,6 +94,12 @@ export NTPBMAX #=== Set the SYCL/C++ compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD, NTPBMAX +# Add option to enable CI profiler use +$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) +ifeq ($(ENABLE_CI_PROFILER),1) + CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) diff --git a/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk index ca69e5897b..7fbdae2c11 100644 --- a/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk @@ -27,7 +27,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and SYCL INCFLAGS = -I. 
-OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" +OPTFLAGS = -O3 -march=native # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -94,6 +94,12 @@ export NTPBMAX #=== Set the SYCL/C++ compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD, NTPBMAX +# Add option to enable CI profiler use +$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) +ifeq ($(ENABLE_CI_PROFILER),1) + CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) diff --git a/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk index f42f35bf5e..4eb79a45e9 100644 --- a/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk @@ -27,7 +27,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and SYCL INCFLAGS = -I. -OPTFLAGS = -O3 -march=native --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" +OPTFLAGS = -O3 -march=native # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -93,6 +93,12 @@ export NTPBMAX #=== Set the SYCL/C++ compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD, NTPBMAX +# Add option to enable CI profiler use +$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) +ifeq ($(ENABLE_CI_PROFILER),1) + CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) From a56cbb7af2482b3878584b4de2c4538382c48e69 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 28 Nov 2022 14:51:39 +0100 Subject: [PATCH 095/509] Added CI Profiler flag in SYCL build script for makefiles --- tools/profiling/buildSYCLProcess.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index be28a0878b..35d454371e 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -8,16 +8,18 @@ helpFunction() echo -e "\t-b Blocks per grid" echo -e "\t-t Threads per block" echo -e "\t-i Iterations" + echo -e "\t-p Profiler flag" exit 1 # Exit script after printing help } -while getopts "n:b:t:i:" opt +while getopts "n:b:t:i:p:" opt do case "$opt" in n ) MG_PROC="$OPTARG" ;; #process to target b ) blocksPerGrid="$OPTARG" ;; t ) threadsPerBlock="$OPTARG" ;; i ) iterations="$OPTARG" ;; + p ) profilerFlag="$OPTARG" ;; ?
) helpFunction ;; # Print helpFunction in case parameter is non-existent esac done @@ -38,11 +40,13 @@ fi # Assumes that this is run from profiling directory in the repo prefix=$(pwd) +export ENABLE_CI_PROFILER=1 + #export DPCPP_HOME=/p/project/prpb109/sycl_workspace export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace export USEBUILDDIR=1 export NTPBMAX=1024 -export CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++ +export CXX=$DPCPP_HOME/llvm/build/bin/clang++ export CUDA_PATH=/usr/local/cuda-11.6 export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=sm_70' -fgpu-rdc --cuda-path=$CUDA_PATH" export WORKSPACE=$prefix/workspace_mg4gpu @@ -90,8 +94,8 @@ export MG5AMC_CARD_PATH=$MG_PROC_DIR/Cards # Build executable cd $MG_SP_DIR make -mv ../../lib/build.d_inl0/ $MG_LIBS_DIR #2>/dev/null; true -mv build.d_inl0/ $MG_EXE_DIR #2>/dev/null; true +mv ../../lib/build.d_inl0*/ $MG_LIBS_DIR #2>/dev/null; true +mv build.d_inl0*/ $MG_EXE_DIR #2>/dev/null; true # Run executable cd $WORKSPACE From 7f4dd19f85d71d2b9c06a697d1b0a115e02508a6 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 28 Nov 2022 15:00:33 +0100 Subject: [PATCH 096/509] Added CXXFLAGS variable in fcheck compilation --- epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk | 2 +- epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk | 2 +- epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk | 2 +- epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk index 7fbdae2c11..395c679825 100644 --- a/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk @@ -261,7 +261,7 @@ $(BUILDDIR)/%.o : %.f *.inc $(fsycl_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fsycl_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so - $(CXX) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs + $(CXX) $(CXXFLAGS) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk index 7fbdae2c11..395c679825 100644 --- a/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk @@ -261,7 +261,7 @@ $(BUILDDIR)/%.o : %.f *.inc $(fsycl_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fsycl_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so - $(CXX) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs + $(CXX) $(CXXFLAGS) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk index 7fbdae2c11..395c679825 100644 --- 
a/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk @@ -261,7 +261,7 @@ $(BUILDDIR)/%.o : %.f *.inc $(fsycl_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fsycl_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so - $(CXX) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs + $(CXX) $(CXXFLAGS) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk index 4eb79a45e9..2c59c642d9 100644 --- a/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk @@ -212,7 +212,7 @@ cxx_objects_exe= $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) - $(CXX) $(CXXFLAGS) $(SYCLFLAGS) -fPIC -shared -o $@ $(cxx_objects_lib) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(CXX) $(CXXFLAGS) $(CXXFLAGS) $(SYCLFLAGS) -fPIC -shared -o $@ $(cxx_objects_lib) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) # Target (and build rules): C++ and SYCL static libraries $(LIBDIR)/lib$(MG5AMC_CXXLIB).a: $(BUILDDIR)/CPPProcess.o $(BUILDDIR)/fbridge.o From 968198d29fa347bd83b8327ef3c275bd33ebd0da Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 28 Nov 2022 15:27:26 +0100 Subject: [PATCH 097/509] Fixed bug in execution of python scripts --- .github/workflows/profiler.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml index 94f27081a9..2ea7798a15 100644 --- a/.github/workflows/profiler.yml +++ b/.github/workflows/profiler.yml @@ -23,8 +23,8 @@ jobs: - name: Runs SYCL performanceProfiler.py script run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; g++ --version; python3 performanceProfiler.py -l 'SYCL' -b 'master' - name: Uploads SYCL JSON files to DB - run: ./tools/profiling/sendData.py --profiler --absLayer "SYCL" + run: python3 /tools/profiling/sendData.py --profiler --absLayer "SYCL" - name: Runs CUDA performanceProfiler.py script run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master' - name: Uploads CUDA JSON files to DB - run: ./tools/profiling/sendData.py --profiler --absLayer "CUDA" \ No newline at end of file + run: python3 /tools/profiling/sendData.py --profiler --absLayer "CUDA" \ No newline at end of file From d75c64acc9dac614e2024cd780f6ea8d406f03e0 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 28 Nov 2022 16:15:32 +0100 Subject: [PATCH 098/509] Fixed bug in profiler workflow --- .github/workflows/profiler.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml index 2ea7798a15..20ccd93db8 100644 --- a/.github/workflows/profiler.yml +++ b/.github/workflows/profiler.yml @@ -23,8 +23,8 @@ jobs: - name: Runs SYCL performanceProfiler.py script run: cd tools/profiling/; source 
/cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; g++ --version; python3 performanceProfiler.py -l 'SYCL' -b 'master' - name: Uploads SYCL JSON files to DB - run: python3 /tools/profiling/sendData.py --profiler --absLayer "SYCL" + run: python3 tools/profiling/sendData.py --profiler --absLayer "SYCL" - name: Runs CUDA performanceProfiler.py script run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master' - name: Uploads CUDA JSON files to DB - run: python3 /tools/profiling/sendData.py --profiler --absLayer "CUDA" \ No newline at end of file + run: python3 tools/profiling/sendData.py --profiler --absLayer "CUDA" \ No newline at end of file From ac32bac82674b59796fee6b6064c27e3df73b831 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 28 Nov 2022 16:59:05 +0100 Subject: [PATCH 099/509] Fixed bug in makefile --- epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk index 2c59c642d9..7b13f8a0f8 100644 --- a/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk @@ -212,7 +212,7 @@ cxx_objects_exe= $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) - $(CXX) $(CXXFLAGS) $(CXXFLAGS) $(SYCLFLAGS) -fPIC -shared -o $@ $(cxx_objects_lib) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(CXX) $(CXXFLAGS) $(SYCLFLAGS) -fPIC -shared -o $@ $(cxx_objects_lib) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) # Target (and build rules): C++ and SYCL static libraries $(LIBDIR)/lib$(MG5AMC_CXXLIB).a: $(BUILDDIR)/CPPProcess.o $(BUILDDIR)/fbridge.o @@ -260,7 +260,7 @@ $(BUILDDIR)/%.o : %.f *.inc $(fsycl_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fsycl_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so - $(CXX) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs + $(CXX) $(CXXFLAGS) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs #------------------------------------------------------------------------------- From dd93b988b68493641e09770428b2a2b3e53f0b73 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 28 Nov 2022 17:06:29 +0100 Subject: [PATCH 100/509] Added branch as argument for script --- tools/profiling/sendData.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/profiling/sendData.py b/tools/profiling/sendData.py index eec84bbb6b..338a93ee61 100644 --- a/tools/profiling/sendData.py +++ b/tools/profiling/sendData.py @@ -16,6 +16,7 @@ GPU = 'a100s' physicsProcesses = ['ee_mumu', 'gg_ttggg', 'gg_ttgg', 'gg_ttg', 'gg_tt'] absLayer = ['SYCL', 'CUDA', 'sycl', 'cuda'] +branch = 'master' GCCVersion = '11.3' CUDAVersion = '11.6.2' fields = ['EvtsPerSec[MatrixElems] (3)', 'EvtsPerSec[MECalcOnly] (3)'] @@ -30,6 +31,7 @@ parser.add_argument('--GCCVersion', help="GCC version used when profiling.", default=GCCVersion) parser.add_argument('--CUDAVersion', help="CUDA version used when profiling.", default=CUDAVersion) 
parser.add_argument('-a', '--absLayer', help="Abstraction layer used when profiling.", default=absLayer[0], choices=absLayer) +parser.add_argument('-b', '--branch', help="Branch the profiler data is in.", default=branch) parser.add_argument('-p', '--profiler', help="Enable CI profiling defaults.", action=argparse.BooleanOptionalAction) args = parser.parse_args() @@ -52,7 +54,7 @@ logging.error('Sycl name prefix has not been set!') sys.exit(1) - reportfolder= "workspace_mg4gpu/" + datetime.datetime.now().strftime('%y-%m-%d') + '_' + syclNamePrefix + reportfolder= "workspace_mg4gpu/" + datetime.datetime.now().strftime('%y-%m-%d') + '_' + syclNamePrefix + '_' + branch if not os.path.exists(reportfolder): logging.error('SYCL report path does not exist!') From 0343eeb672069aafdd95dcd1d6836cab95e4d3b3 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 28 Nov 2022 17:09:15 +0100 Subject: [PATCH 101/509] Added comments --- tools/profiling/buildSYCLProcess.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 35d454371e..4217d56861 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -53,6 +53,7 @@ export WORKSPACE=$prefix/workspace_mg4gpu #export NAME_PREFIX="sycl_v100s_cuda_11.6.2_gcc_11.3" #export NAME_PREFIX="sycl_Xeon-Silver-4216_a100s_cuda-11.6.2_gcc-11.3" +# Branch should be an environment variable in the main script and then passed down; if none, it is not displayed in the prefix REPORT_FOLDER_PREFIX="${WORKSPACE}/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX}_master" # If unknown set at the run step after running LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 1024 128 10 From b823b614b6e9ef10b6829a232dfe4877983f4a46 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 28 Nov 2022 17:13:51 +0100 Subject: [PATCH 102/509] Added an env variable to control whether the CI profiler option kicks in in the Makefiles --- tools/profiling/buildSYCLProcess.sh | 2 -- tools/profiling/performanceProfiler.py | 9 +++++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 4217d56861..c518f7b17d 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -40,8 +40,6 @@ fi # Assumes that this is run from profiling directory in the repo prefix=$(pwd) -export ENABLE_CI_PROFILER=1 - #export DPCPP_HOME=/p/project/prpb109/sycl_workspace export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace export USEBUILDDIR=1 export NTPBMAX=1024 diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py index 9e23e2f8e4..0b26f85304 100644 --- a/tools/profiling/performanceProfiler.py +++ b/tools/profiling/performanceProfiler.py @@ -25,9 +25,18 @@ parser.add_argument("-l", help="Choose which abstraction layer you want to use (CUDA/SYCL).", default=absLayer) parser.add_argument("-b", help="Choose which branch the madgraph4gpu repo is in.", default=branch) +parser.add_argument("-b", help="Choose which branch the madgraph4gpu repo is in.", default=branch) +parser.add_argument("--profiler", help="Enables an argument in the build script to specify which GCC toolchain the compiler will use.", action=argparse.BooleanOptionalAction) + +#Add profiler option in python and build scripts so that correct gcc toolchain can be set through makefile and still not disturb the compilation on Github machines pyArgs = parser.parse_args() +if pyArgs.profiler
== True: + os.environ['ENABLE_CI_PROFILER'] = 1 +else: + os.environ['ENABLE_CI_PROFILER'] = 0 + # How many runs in total the program made count = 0 From daf80bb3c6a04870c08bb31d3918648e44d316fa Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 28 Nov 2022 17:37:50 +0100 Subject: [PATCH 103/509] Removed unnecessary lines and added branch option to profiler workflow --- .github/workflows/profiler.yml | 4 ++-- tools/profiling/buildSYCLProcess.sh | 2 -- tools/profiling/performanceProfiler.py | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml index 20ccd93db8..70fe351a9f 100644 --- a/.github/workflows/profiler.yml +++ b/.github/workflows/profiler.yml @@ -23,8 +23,8 @@ jobs: - name: Runs SYCL performanceProfiler.py script run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; g++ --version; python3 performanceProfiler.py -l 'SYCL' -b 'master' - name: Uploads SYCL JSON files to DB - run: python3 tools/profiling/sendData.py --profiler --absLayer "SYCL" + run: python3 tools/profiling/sendData.py --profiler --absLayer "SYCL" --branch master - name: Runs CUDA performanceProfiler.py script run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master' - name: Uploads CUDA JSON files to DB - run: python3 tools/profiling/sendData.py --profiler --absLayer "CUDA" + run: python3 tools/profiling/sendData.py --profiler --absLayer "CUDA" --branch master \ No newline at end of file diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index c518f7b17d..6cd7e0bf34 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -8,7 +8,6 @@ helpFunction() echo -e "\t-b Blocks per grid" echo -e "\t-t Threads per block" echo -e "\t-i Iterations" - echo -e "\t-p Profiler flag" exit 1 # Exit script after printing help } @@ -19,7 +18,6 @@ do b ) blocksPerGrid="$OPTARG" ;; t ) threadsPerBlock="$OPTARG" ;; i ) iterations="$OPTARG" ;; - p ) profilerFlag="$OPTARG" ;; ?
) helpFunction ;; # Print helpFunction in case parameter is non-existent esac done diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py index 0b26f85304..30b45aad1e 100644 --- a/tools/profiling/performanceProfiler.py +++ b/tools/profiling/performanceProfiler.py @@ -25,13 +25,13 @@ parser.add_argument("-l", help="Choose which abstraction layer you want to use (CUDA/SYCL).", default=absLayer) parser.add_argument("-b", help="Choose which branch the madgraph4gpu repo is in.", default=branch) -parser.add_argument("-b", help="Choose which branch the madgraph4gpu repo is in.", default=branch) parser.add_argument("--profiler", help="Enables an argument in the build script to specify which GCC toolchain the compiler will use.", action=argparse.BooleanOptionalAction) #Add profiler option in python and build scripts so that correct gcc toolchain can be set through makefile and still not disturb the compilation on Github machines pyArgs = parser.parse_args() +# Sets enable CI profiler flag to be picked up in makefiles to set correct GCC toolchain if pyArgs.profiler == True: os.environ['ENABLE_CI_PROFILER'] = 1 else: os.environ['ENABLE_CI_PROFILER'] = 0 From 563d54c0f7067856bbf486561136b6eb27fedfb7 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 28 Nov 2022 17:40:21 +0100 Subject: [PATCH 104/509] Changed env variable to string --- tools/profiling/performanceProfiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py index 30b45aad1e..f7283d1dc3 100644 --- a/tools/profiling/performanceProfiler.py +++ b/tools/profiling/performanceProfiler.py @@ -33,9 +33,9 @@ # Sets enable CI profiler flag to be picked up in makefiles to set correct GCC toolchain if pyArgs.profiler == True: - os.environ['ENABLE_CI_PROFILER'] = 1 + os.environ['ENABLE_CI_PROFILER'] = '1' else: - os.environ['ENABLE_CI_PROFILER'] = 0 + os.environ['ENABLE_CI_PROFILER'] = '0' # How many runs in total the program made count = 0 From 77e5f88552c86afc2b6ab37b22dd95c04a74f47f Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 29 Nov 2022 10:56:54 +0100 Subject: [PATCH 105/509] Removed unnecessary code and added an environment variable in workflow --- .github/workflows/profiler.yml | 1 + tools/profiling/performanceProfiler.py | 7 ------- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml index 70fe351a9f..4a3905726c 100644 --- a/.github/workflows/profiler.yml +++ b/.github/workflows/profiler.yml @@ -15,6 +15,7 @@ jobs: env: SYCL_NAME_PREFIX: sycl_Xeon-Silver-4216_a100s_gcc-11.3_cuda-11.6.2 CUDA_NAME_PREFIX: cudacpp_Xeon-Silver-4216_a100s_gcc-11.3_cuda-11.6.2 + ENABLE_CI_PROFILER: 1 MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} runs-on: [self-hosted, linux, a100s] steps: - uses: actions/checkout@v2 diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py index f7283d1dc3..89306522ea 100644 --- a/tools/profiling/performanceProfiler.py +++ b/tools/profiling/performanceProfiler.py @@ -25,18 +25,11 @@ parser.add_argument("-l", help="Choose which abstraction layer you want to use (CUDA/SYCL).", default=absLayer) parser.add_argument("-b", help="Choose which branch the madgraph4gpu repo is in.", default=branch) -parser.add_argument("--profiler", help="Enables an argument in the build script to specify which GCC toolchain the compiler will use.", action=argparse.BooleanOptionalAction) #Add profiler option in python and build scripts so that correct gcc toolchain can be
set through makefile and still not disturb the compilation on Github machines pyArgs = parser.parse_args() -# Sets enable CI profiler flag to be picked up in makefiles to set correct GCC toolchain -if pyArgs.profiler == True: - os.environ['ENABLE_CI_PROFILER'] = '1' -else: - os.environ['ENABLE_CI_PROFILER'] = '0' - # How many runs in total the program made count = 0 From daf80bb3c6a04870c08bb31d3918648e44d316fa Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 29 Nov 2022 13:40:21 +0100 Subject: [PATCH 106/509] Made some changes for testing --- tools/profiling/performanceProfiler.py | 2 +- tools/profiling/sendData.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py index 89306522ea..0d5b2cfacb 100644 --- a/tools/profiling/performanceProfiler.py +++ b/tools/profiling/performanceProfiler.py @@ -9,7 +9,7 @@ absLayer = "SYCL" branch = "master" -mgProcesses = ["ee_mumu", "gg_tt", "gg_ttg", "gg_ttgg", "gg_ttggg"] +mgProcesses = ["ee_mumu", "gg_tt", "gg_ttg", "gg_ttgg"] #"gg_ttggg"] #doublePrecisionConstant = 2560 doublePrecisionConstant = 1 diff --git a/tools/profiling/sendData.py b/tools/profiling/sendData.py index 338a93ee61..140fc6ffab 100644 --- a/tools/profiling/sendData.py +++ b/tools/profiling/sendData.py @@ -36,9 +36,6 @@ args = parser.parse_args() -os.environ['SYCL_NAME_PREFIX'] = 'sycl_Xeon-Silver-4216_a100s_gcc-11.3_cuda-11.6.2' -os.environ['CUDA_NAME_PREFIX'] = 'cudacpp_Xeon-Silver-4216_a100s_gcc-11.3_cuda-11.6.2' - # # Main # @@ -55,6 +52,7 @@ sys.exit(1) reportfolder= "workspace_mg4gpu/" + datetime.datetime.now().strftime('%y-%m-%d') + '_' + syclNamePrefix + '_' + branch + print(reportfolder) if not os.path.exists(reportfolder): logging.error('SYCL report path does not exist!') From 96268c832f3efdd61ffabe5d211f441bc023394d Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 29 Nov 2022 13:51:24 +0100 Subject: [PATCH 107/509] Changed working directory of the steps --- .github/workflows/profiler.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml index 4a3905726c..34b2ba19f7 100644 --- a/.github/workflows/profiler.yml +++ b/.github/workflows/profiler.yml @@ -24,8 +24,8 @@ jobs: - name: Runs SYCL performanceProfiler.py script run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; g++ --version; python3 performanceProfiler.py -l 'SYCL' -b 'master' - name: Uploads SYCL JSON files to DB - run: python3 tools/profiling/sendData.py --profiler --absLayer "SYCL" --branch master + run: cd tools/profiling/; python3 sendData.py --profiler --absLayer "SYCL" --branch master - name: Runs CUDA performanceProfiler.py script run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master' - name: Uploads CUDA JSON files to DB - run: python3 tools/profiling/sendData.py --profiler --absLayer "CUDA" --branch master \ No newline at end of file + run: cd tools/profiling/; python3 sendData.py --profiler --absLayer "CUDA" --branch master \ No newline at end of file From 01f05bb7466c17448ceed8caf88d36b72e77bc70 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 29 Nov 2022 14:22:27 +0100 Subject: [PATCH 108/509] Added correct branch assignment --- tools/profiling/buildSYCLProcess.sh | 10 +++++----- tools/profiling/performanceProfiler.py | 6 +++++- 2 files changed, 10 insertions(+), 6 
deletions(-) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 6cd7e0bf34..8063c1f5f6 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -8,16 +8,18 @@ helpFunction() echo -e "\t-b Blocks per grid" echo -e "\t-t Threads per block" echo -e "\t-i Iterations" + echo -e "\t--branch Branch" exit 1 # Exit script after printing help } -while getopts "n:b:t:i:" opt +while getopts "n:b:t:i:" opt do case "$opt" in n ) MG_PROC="$OPTARG" ;; #process to target b ) blocksPerGrid="$OPTARG" ;; t ) threadsPerBlock="$OPTARG" ;; i ) iterations="$OPTARG" ;; + branch ) branch="$OPTARG" ;; ? ) helpFunction ;; # Print helpFunction in case parameter is non-existent esac done @@ -50,7 +52,7 @@ export WORKSPACE=$prefix/workspace_mg4gpu #export NAME_PREFIX="sycl_v100s_cuda_11.6.2_gcc_11.3" #export NAME_PREFIX="sycl_Xeon-Silver-4216_a100s_cuda-11.6.2_gcc-11.3" # Branch should be an environment variable in the main script and then passed down; if none, it is not displayed in the prefix -REPORT_FOLDER_PREFIX="${WORKSPACE}/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX}_master" +export REPORT_FOLDER=$WORKSPACE/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX}_${branch} # If unknown set at the run step after running LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 1024 128 10 export DEVICE_ID=0 @@ -71,9 +73,7 @@ esac mkdir -p $WORKSPACE/mg4gpu/lib 2>/dev/null; true mkdir -p $WORKSPACE/mg4gpu/bin 2>/dev/null; true -mkdir $REPORT_FOLDER_PREFIX 2>/dev/null; true - -export REPORT_FOLDER=$WORKSPACE/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX} +mkdir $REPORT_FOLDER 2>/dev/null; true export MG4GPU_LIB=$WORKSPACE/mg4gpu/lib export MG4GPU_BIN=$WORKSPACE/mg4gpu/bin diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py index 0d5b2cfacb..fbacce75a7 100644 --- a/tools/profiling/performanceProfiler.py +++ b/tools/profiling/performanceProfiler.py @@ -52,7 +52,11 @@ if ".sa" not in process: process = process + ".sa" - bashArgs = ["./buildCUDAProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)] + bashArgs = ["./buildCUDAProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG), "--branch", str(pyArgs.b).lower] + + #if len(pyArgs.b) > 0: + # bashArgs.append('--branch') + # bashArgs.append(str(pyArgs.b).lower()) else: sys.exit("No abstraction layer matching the supplied string!") From cd90f8894e4db946b1b7e55ba0cd0a5816cc4c9b Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 29 Nov 2022 14:35:00 +0100 Subject: [PATCH 109/509] Added correct parameter to build script --- tools/profiling/buildSYCLProcess.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 8063c1f5f6..e547a7ab96 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -12,7 +12,7 @@ helpFunction() exit 1 # Exit script after printing help } -while getopts "n:b:t:i:" opt +while getopts "n:b:t:i:branch:" opt do case "$opt" in n ) MG_PROC="$OPTARG" ;; #process to target From c036aaa45033e64bdc38b78e70ea9ae7c9b2bcd2 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 29 Nov 2022 14:50:02 +0100 Subject: [PATCH 110/509] Fixed parameters in SYCL build script --- tools/profiling/buildSYCLProcess.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index e547a7ab96..a019ab6bc1 100755 ---
a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -8,18 +8,18 @@ helpFunction() echo -e "\t-b Blocks per grid" echo -e "\t-t Threads per block" echo -e "\t-i Iterations" - echo -e "\t--branch Branch" + echo -e "\t-r Branch" exit 1 # Exit script after printing help } -while getopts "n:b:t:i:branch:" opt +while getopts "n:b:t:i:r:" opt do case "$opt" in n ) MG_PROC="$OPTARG" ;; #process to target b ) blocksPerGrid="$OPTARG" ;; t ) threadsPerBlock="$OPTARG" ;; i ) iterations="$OPTARG" ;; - branch ) branch="$OPTARG" ;; + r ) branch="$OPTARG" ;; ? ) helpFunction ;; # Print helpFunction in case parameter is non-existent esac done From 6f1482f677a34d261e00a98a87c1a3ac3d9651a4 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 29 Nov 2022 14:57:55 +0100 Subject: [PATCH 111/509] Added correct branch in profiler script --- .github/workflows/profiler.yml | 2 +- tools/profiling/performanceProfiler.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml index 34b2ba19f7..ea78fa9d18 100644 --- a/.github/workflows/profiler.yml +++ b/.github/workflows/profiler.yml @@ -22,7 +22,7 @@ jobs: steps: - uses: actions/checkout@v2 - name: Runs SYCL performanceProfiler.py script - run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; g++ --version; python3 performanceProfiler.py -l 'SYCL' -b 'master' + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master' - name: Uploads SYCL JSON files to DB run: cd tools/profiling/; python3 sendData.py --profiler --absLayer "SYCL" --branch master - name: Runs CUDA performanceProfiler.py script diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py index fbacce75a7..00b6f486b7 100644 --- a/tools/profiling/performanceProfiler.py +++ b/tools/profiling/performanceProfiler.py @@ -43,7 +43,7 @@ if ".sa" not in process: process = process + ".sa" - bashArgs = ["./buildSYCLProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG)] + bashArgs = ["./buildSYCLProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG), "-r", str(pyArgs.b).lower] elif pyArgs.l.upper() == 'CUDA': @@ -52,7 +52,7 @@ if ".sa" not in process: process = process + ".sa" - bashArgs = ["./buildCUDAProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG), "--branch", str(pyArgs.b).lower] + bashArgs = ["./buildCUDAProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG), "-r", str(pyArgs.b).lower] #if len(pyArgs.b) > 0: # bashArgs.append('--branch') From 61d7877e8e6a8489eea15171021ba728c65b05b2 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 29 Nov 2022 15:04:31 +0100 Subject: [PATCH 112/509] Fixed bug --- tools/profiling/performanceProfiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py index 00b6f486b7..93174822c1 100644 --- a/tools/profiling/performanceProfiler.py +++ b/tools/profiling/performanceProfiler.py @@ -43,7 +43,7 @@ if ".sa" not in process: process = process + ".sa" - bashArgs = ["./buildSYCLProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG), "-r", str(pyArgs.b).lower] + bashArgs = ["./buildSYCLProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG), "-r", 
str(pyArgs.b).lower()] elif pyArgs.l.upper() == 'CUDA': @@ -52,7 +52,7 @@ if ".sa" not in process: process = process + ".sa" - bashArgs = ["./buildCUDAProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG), "-r", str(pyArgs.b).lower] + bashArgs = ["./buildCUDAProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG), "-r", str(pyArgs.b).lower()] #if len(pyArgs.b) > 0: # bashArgs.append('--branch') # bashArgs.append(str(pyArgs.b).lower()) From 39a9483b5a6271b07798d3e28d56d88b87590e35 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 29 Nov 2022 15:29:01 +0100 Subject: [PATCH 113/509] Added the same changes from the SYCL build script to the CUDA build script --- tools/profiling/buildCUDAProcess.sh | 40 +++++++++++++++-------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh index 2b4d88fd12..63ba671f7d 100755 --- a/tools/profiling/buildCUDAProcess.sh +++ b/tools/profiling/buildCUDAProcess.sh @@ -2,31 +2,33 @@ helpFunction() { - echo "" - echo "Usage: $0 -n gg_ttgg -b 1024 -t 128 -i 10" - echo -e "\t-n Name of the physics process being built and run" - echo -e "\t-b Blocks per grid" - echo -e "\t-t Threads per block" - echo -e "\t-i Iterations" - exit 1 # Exit script after printing help + echo "" + echo "Usage: $0 -n gg_ttgg -b 1024 -t 128 -i 10" + echo -e "\t-n Name of the physics process being built and run" + echo -e "\t-b Blocks per grid" + echo -e "\t-t Threads per block" + echo -e "\t-i Iterations" + echo -e "\t-r Branch" + exit 1 # Exit script after printing help } -while getopts "n:b:t:i:" opt +while getopts "n:b:t:i:r:" opt do - case "$opt" in - n ) MG_PROC="$OPTARG" ;; #process to target - b ) blocksPerGrid="$OPTARG" ;; - t ) threadsPerBlock="$OPTARG" ;; - i ) iterations="$OPTARG" ;; - ? ) helpFunction ;; # Print helpFunction in case parameter is non-existent - esac + case "$opt" in + n ) MG_PROC="$OPTARG" ;; #process to target + b ) blocksPerGrid="$OPTARG" ;; + t ) threadsPerBlock="$OPTARG" ;; + i ) iterations="$OPTARG" ;; + r ) branch="$OPTARG" ;; + ?
) helpFunction ;; # Print helpFunction in case parameter is non-existent + esac done # Print helpFunction in case parameters are empty if [ -z "${MG_PROC}" ] || [ -z "${blocksPerGrid}" ] || [ -z "${threadsPerBlock}" ] || [ -z "${iterations}" ] then - echo "Some or all of the parameters are empty"; - helpFunction + echo "Some or all of the parameters are empty"; + helpFunction fi # Begin script in case all parameters are correct @@ -48,13 +50,13 @@ export FC=`which gfortran` export WORKSPACE=$prefix/workspace_mg4gpu #export NAME_PREFIX="cudacpp_v100s_cuda_11.6.2_gcc_11.3" -REPORT_FOLDER_PREFIX="${WORKSPACE}/$(date +"%y-%m-%d")_${CUDA_NAME_PREFIX}_master" +REPORT_FOLDER="${WORKSPACE}/$(date +"%y-%m-%d")_${CUDA_NAME_PREFIX}_${branch}" # Sets CUDA in PATH export PATH=$CUDA_HOME:$PATH mkdir $WORKSPACE 2>/dev/null; true -mkdir $REPORT_FOLDER_PREFIX 2>/dev/null; true +mkdir $REPORT_FOLDER 2>/dev/null; true # Finds correct subprocess case $MG_PROC in From b045c7c10648b1315bbdcb1f951c0fa73cd02683 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 29 Nov 2022 15:51:15 +0100 Subject: [PATCH 114/509] Fixed bug in the JSON database sending script --- tools/profiling/sendData.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/profiling/sendData.py b/tools/profiling/sendData.py index 140fc6ffab..6f5f7f297c 100644 --- a/tools/profiling/sendData.py +++ b/tools/profiling/sendData.py @@ -65,7 +65,7 @@ logging.error('Cuda name prefix has not been set!') sys.exit(1) - reportfolder= "workspace_mg4gpu/" + datetime.datetime.now().strftime('%y-%m-%d') + '_' + cudaNamePrefix + reportfolder= "workspace_mg4gpu/" + datetime.datetime.now().strftime('%y-%m-%d') + '_' + cudaNamePrefix + '_' + branch if not os.path.exists(reportfolder): logging.error('CUDA report path does not exist!') From 31b1c2aa3cc27ce6b60f4a040c77b71b4be13d69 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 29 Nov 2022 16:54:47 +0100 Subject: [PATCH 115/509] Added back real numbers for testing and changed profiler workflow to come alive at 00:00 --- .github/workflows/profiler.yml | 6 ++---- tools/profiling/performanceProfiler.py | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml index ea78fa9d18..528e863ada 100644 --- a/.github/workflows/profiler.yml +++ b/.github/workflows/profiler.yml @@ -1,10 +1,8 @@ name: Profiler on: - push: - branches: [ master ] - pull_request: - branches: [ master ] + schedule: + - cron: '00 00 * * *' env: NAME_PREFIX: sycl diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py index 93174822c1..5d21376e1c 100644 --- a/tools/profiling/performanceProfiler.py +++ b/tools/profiling/performanceProfiler.py @@ -9,16 +9,16 @@ absLayer = "SYCL" branch = "master" -mgProcesses = ["ee_mumu", "gg_tt", "gg_ttg", "gg_ttgg"] #"gg_ttggg"] - -#doublePrecisionConstant = 2560 -doublePrecisionConstant = 1 -#iterations = 10 -iterations = 1 -#threadsPerBlock = [128, 256] -threadsPerBlock = [32] -#blocksPerGrid = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] -blocksPerGrid = [32] +mgProcesses = ["ee_mumu", "gg_tt", "gg_ttg", "gg_ttgg", "gg_ttggg"] + +doublePrecisionConstant = 2560 +#doublePrecisionConstant = 1 +iterations = 10 +#iterations = 1 +threadsPerBlock = [128, 256] +#threadsPerBlock = [32] +blocksPerGrid = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] +#blocksPerGrid = [32] # Parser parser = argparse.ArgumentParser(description='A program for profiling 
GPUs using MadGraph.') From 28910728450410641e53c03838aff086b1659566 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Wed, 30 Nov 2022 01:16:24 +0100 Subject: [PATCH 116/509] Updated workflow run requirement --- .github/workflows/profiler.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml index 528e863ada..eed9626d19 100644 --- a/.github/workflows/profiler.yml +++ b/.github/workflows/profiler.yml @@ -1,11 +1,10 @@ name: Profiler on: - schedule: - - cron: '00 00 * * *' - -env: - NAME_PREFIX: sycl + push: + branches: [ master ] + pull_request: + branches: [ master ] jobs: a100s_Profiling: From 9ce05cd21be2d39764f72e90405700658a128c7d Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Wed, 30 Nov 2022 10:27:10 +0100 Subject: [PATCH 117/509] Changed some test numbers for profiling --- tools/profiling/performanceProfiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py index 5d21376e1c..96eb008caa 100644 --- a/tools/profiling/performanceProfiler.py +++ b/tools/profiling/performanceProfiler.py @@ -15,7 +15,7 @@ #doublePrecisionConstant = 1 iterations = 10 #iterations = 1 -threadsPerBlock = [128, 256] +threadsPerBlock = [256] #threadsPerBlock = [32] blocksPerGrid = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] #blocksPerGrid = [32] From 84b306c6c9a0af10298d13e58f58abfe09177c16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B8rgen=20Teig?= Date: Wed, 30 Nov 2022 11:46:33 +0100 Subject: [PATCH 118/509] Update README.md Added status badge for CI --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 072f04a25d..49d5c7a156 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # Madgraph 4 GPU +[![C/C++ CI](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/c-cpp.yml/badge.svg)](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/c-cpp.yml) [![SYCL CI](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/sycl.yml/badge.svg)](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/sycl.yml) + This repository contains code developed in the context of porting the [MadGraph5_aMC@NLO](https://cp3.irmp.ucl.ac.be/projects/madgraph/) event generator software onto GPU hardware. MadGraph5_aMC@NLO is able to generate code for various physics processes in different programming languages (Fortran, C, C++). The code generated in this repository is back-ported in "epochs" into the MadGraph5_aMC@NLO generator to allow it to also produce source code for those physics processes to run on GPU platforms. 
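For context on the sweep values restored above: performanceProfiler.py expands each (threadsPerBlock, blocksPerGrid) pair into one invocation of the build script, using the -n/-i/-t/-b/-r flag layout visible in the diffs. A minimal sketch of that expansion, assuming a nested loop and a direct subprocess call (only the flag layout and the default values are taken from the patches):

    import subprocess

    mgProcesses = ["ee_mumu", "gg_tt", "gg_ttg", "gg_ttgg", "gg_ttggg"]
    iterations = 10
    threadsPerBlock = [256]
    blocksPerGrid = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384]

    for process in mgProcesses:
        if ".sa" not in process:
            process = process + ".sa"  # build scripts expect the standalone-process suffix
        for TPB in threadsPerBlock:
            for BPG in blocksPerGrid:
                # One profiling run per launch configuration; with the values above
                # that is 5 processes x 1 TPB x 11 BPG = 55 runs per profiler job.
                subprocess.run(["./buildSYCLProcess.sh", "-n", process,
                                "-i", str(iterations), "-t", str(TPB),
                                "-b", str(BPG), "-r", "master"])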
From 75ee18d11358aab3bd01559d0857005610accf5a Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 5 Dec 2022 13:35:20 +0100 Subject: [PATCH 119/509] Split the original profiler workflow into 2 for each abstraction layer --- .github/workflows/cudaProfiler.yml | 23 +++++++++++++++++++++ .github/workflows/syclProfiler.yml | 23 +++++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 .github/workflows/cudaProfiler.yml create mode 100644 .github/workflows/syclProfiler.yml diff --git a/.github/workflows/cudaProfiler.yml b/.github/workflows/cudaProfiler.yml new file mode 100644 index 0000000000..cc8874e4f9 --- /dev/null +++ b/.github/workflows/cudaProfiler.yml @@ -0,0 +1,23 @@ +name: CUDA Profiler + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + cuda_a100s_Profiling: + name: CUDA V100S Profiling + env: + CUDA_NAME_PREFIX: cudacpp_Xeon-Silver-4216_v100s_gcc-11.3_cuda-11.6.2 + ENABLE_CI_PROFILER: 1 + + MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} + runs-on: [self-hosted, linux, v100s] + steps: + - uses: actions/checkout@v2 + - name: Runs CUDA performanceProfiler.py script + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master' + - name: Uploads CUDA JSON files to DB + run: cd tools/profiling/; python3 sendData.py --profiler --absLayer "CUDA" --branch master \ No newline at end of file diff --git a/.github/workflows/syclProfiler.yml b/.github/workflows/syclProfiler.yml new file mode 100644 index 0000000000..618f1abc79 --- /dev/null +++ b/.github/workflows/syclProfiler.yml @@ -0,0 +1,23 @@ +name: SYCL Profiler + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + sycl_v100s_Profiling: + name: SYCL V100S Profiling + env: + SYCL_NAME_PREFIX: sycl_Xeon-Silver-4216_v100s_gcc-11.3_cuda-11.6.2 + ENABLE_CI_PROFILER: 1 + + MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} + runs-on: [self-hosted, linux, v100s] + steps: + - uses: actions/checkout@v2 + - name: Runs SYCL performanceProfiler.py script + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master' + - name: Uploads SYCL JSON files to DB + run: cd tools/profiling/; python3 sendData.py --profiler --absLayer "SYCL" --branch master \ No newline at end of file From a5493190d9ceee9c989dcf98b198f82cdb613c70 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 5 Dec 2022 13:40:11 +0100 Subject: [PATCH 120/509] Delete obsolete profiler workflow --- .github/workflows/profiler.yml | 28 ---------------------------- 1 file changed, 28 deletions(-) delete mode 100644 .github/workflows/profiler.yml diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml deleted file mode 100644 index eed9626d19..0000000000 --- a/.github/workflows/profiler.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: Profiler - -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] - -jobs: - a100s_Profiling: - name: A100S Profiling - env: - SYCL_NAME_PREFIX: sycl_Xeon-Silver-4216_a100s_gcc-11.3_cuda-11.6.2 - CUDA_NAME_PREFIX: cudacpp_Xeon-Silver-4216_a100s_gcc-11.3_cuda-11.6.2 - ENABLE_CI_PROFILER: 1 - - MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} - runs-on: [self-hosted, linux, a100s] - steps: - - uses: actions/checkout@v2 - - name: Runs SYCL performanceProfiler.py script - run: cd tools/profiling/; source
/cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master' - - name: Uploads SYCL JSON files to DB - run: cd tools/profiling/; python3 sendData.py --profiler --absLayer "SYCL" --branch master - - name: Runs CUDA performanceProfiler.py script - run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master' - - name: Uploads CUDA JSON files to DB - run: cd tools/profiling/; python3 sendData.py --profiler --absLayer "CUDA" --branch master \ No newline at end of file From 34e7257182208a0c56c39a34b148fed6ce7f7968 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 5 Dec 2022 14:03:34 +0100 Subject: [PATCH 121/509] Fixed default GPU version --- tools/profiling/sendData.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/profiling/sendData.py b/tools/profiling/sendData.py index 6f5f7f297c..cb2b6d27f4 100644 --- a/tools/profiling/sendData.py +++ b/tools/profiling/sendData.py @@ -13,7 +13,7 @@ URL = 'https://dbod-madgraph4gpu-db.cern.ch:8082/api/v2/write?bucket=ProfilerData' secret = os.getenv('MADGRAPH4GPU_DB_SECRET') Auth = ['db_user', secret] -GPU = 'a100s' +GPU = 'v100s' physicsProcesses = ['ee_mumu', 'gg_ttggg', 'gg_ttgg', 'gg_ttg', 'gg_tt'] absLayer = ['SYCL', 'CUDA', 'sycl', 'cuda'] branch = 'master' From e43edc80e9e21fc7a5298f1a34ff26cb00df403b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B8rgen=20Teig?= Date: Mon, 5 Dec 2022 14:17:56 +0100 Subject: [PATCH 122/509] Added more status badges on the profiling workflows --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 49d5c7a156..733d08b524 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Madgraph 4 GPU -[![C/C++ CI](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/c-cpp.yml/badge.svg)](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/c-cpp.yml) [![SYCL CI](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/sycl.yml/badge.svg)](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/sycl.yml) +[![C/C++ CI](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/c-cpp.yml/badge.svg)](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/c-cpp.yml) [![SYCL CI](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/sycl.yml/badge.svg)](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/sycl.yml) [![CUDA Profiler](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/cudaProfiler.yml/badge.svg)](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/cudaProfiler.yml) [![SYCL Profiler](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/syclProfiler.yml/badge.svg)](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/syclProfiler.yml) This repository contains code developed in the context of porting the [MadGraph5_aMC@NLO](https://cp3.irmp.ucl.ac.be/projects/madgraph/) event generator software onto GPU hardware. MadGraph5_aMC@NLO is able to generate code for various physics processes in different programming languages (Fortran, C, C++). The code generated in this repository is back-ported in "epochs" into the MadGraph5_aMC@NLO generator to allow it to also produce source code for those physics processes to run on GPU platforms. 
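The report folder that sendData.py rebuilds in the patches above is a naming contract with the build scripts: buildSYCLProcess.sh and buildCUDAProcess.sh write their reports to workspace_mg4gpu/<yy-mm-dd>_<NAME_PREFIX>_<branch>, and sendData.py must reconstruct the identical path to find them. A minimal sketch of that shared convention (the helper function is illustrative, not code from the repo):

    import datetime

    def report_folder(name_prefix, branch):
        # Mirrors REPORT_FOLDER in the bash build scripts and reportfolder in
        # sendData.py; any mismatch makes the upload step fail with a
        # "report path does not exist" error.
        day = datetime.datetime.now().strftime('%y-%m-%d')
        return 'workspace_mg4gpu/' + day + '_' + name_prefix + '_' + branch

    # e.g. report_folder('cudacpp_Xeon-Silver-4216_v100s_gcc-11.3_cuda-11.6.2', 'master')
    # -> 'workspace_mg4gpu/<yy-mm-dd>_cudacpp_Xeon-Silver-4216_v100s_gcc-11.3_cuda-11.6.2_master'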
From 5041bdcf3edc5303c4e930e1bff03dc31f7896a5 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 5 Dec 2022 17:14:27 +0100 Subject: [PATCH 123/509] Added scheduled profiler --- .github/workflows/cudaProfiler.yml | 6 ++---- .github/workflows/syclProfiler.yml | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/.github/workflows/cudaProfiler.yml b/.github/workflows/cudaProfiler.yml index cc8874e4f9..b012d1a0b9 100644 --- a/.github/workflows/cudaProfiler.yml +++ b/.github/workflows/cudaProfiler.yml @@ -1,10 +1,8 @@ name: CUDA Profiler on: - push: - branches: [ master ] - pull_request: - branches: [ master ] + schedule: + - cron: '15 17 * * *' jobs: cuda_a100s_Profiling: diff --git a/.github/workflows/syclProfiler.yml b/.github/workflows/syclProfiler.yml index 618f1abc79..4bde3af6c8 100644 --- a/.github/workflows/syclProfiler.yml +++ b/.github/workflows/syclProfiler.yml @@ -1,10 +1,8 @@ name: SYCL Profiler on: - push: - branches: [ master ] - pull_request: - branches: [ master ] + schedule: + - cron: '15 17 * * *' jobs: sycl_v100s_Profiling: From 8c80431b88f54535f8c6c2a5009d5172e25a1f12 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 5 Dec 2022 17:30:43 +0100 Subject: [PATCH 124/509] Changed schedule value in workflow --- .github/workflows/cudaProfiler.yml | 2 +- .github/workflows/syclProfiler.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cudaProfiler.yml b/.github/workflows/cudaProfiler.yml index b012d1a0b9..f1e0181c94 100644 --- a/.github/workflows/cudaProfiler.yml +++ b/.github/workflows/cudaProfiler.yml @@ -2,7 +2,7 @@ name: CUDA Profiler on: schedule: - - cron: '15 17 * * *' + - cron: '35 17 * * *' jobs: cuda_a100s_Profiling: diff --git a/.github/workflows/syclProfiler.yml b/.github/workflows/syclProfiler.yml index 4bde3af6c8..da58f37aaa 100644 --- a/.github/workflows/syclProfiler.yml +++ b/.github/workflows/syclProfiler.yml @@ -2,7 +2,7 @@ name: SYCL Profiler on: schedule: - - cron: '15 17 * * *' + - cron: '35 17 * * *' jobs: sycl_v100s_Profiling: From ce89ce62bc12cf5b60a7c1fb6fa97bb308531f7d Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 5 Dec 2022 17:41:09 +0100 Subject: [PATCH 125/509] Updated schedule time in workflow --- .github/workflows/cudaProfiler.yml | 2 +- .github/workflows/syclProfiler.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cudaProfiler.yml b/.github/workflows/cudaProfiler.yml index f1e0181c94..a138deaec9 100644 --- a/.github/workflows/cudaProfiler.yml +++ b/.github/workflows/cudaProfiler.yml @@ -2,7 +2,7 @@ name: CUDA Profiler on: schedule: - - cron: '35 17 * * *' + - cron: '45 17 * * *' jobs: cuda_a100s_Profiling: diff --git a/.github/workflows/syclProfiler.yml b/.github/workflows/syclProfiler.yml index da58f37aaa..d6d8b69ba8 100644 --- a/.github/workflows/syclProfiler.yml +++ b/.github/workflows/syclProfiler.yml @@ -2,7 +2,7 @@ name: SYCL Profiler on: schedule: - - cron: '35 17 * * *' + - cron: '45 17 * * *' jobs: sycl_v100s_Profiling: From 8b2138ce426260bca7238e7f57529f67be0bb916 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 6 Dec 2022 00:42:23 +0100 Subject: [PATCH 126/509] Copied report folder variable from CUDA build script and changed workflow run condition --- .github/workflows/syclProfiler.yml | 6 ++++-- tools/profiling/buildSYCLProcess.sh | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/syclProfiler.yml b/.github/workflows/syclProfiler.yml index d6d8b69ba8..618f1abc79 100644 ---
a/.github/workflows/syclProfiler.yml +++ b/.github/workflows/syclProfiler.yml @@ -1,8 +1,10 @@ name: SYCL Profiler on: - schedule: - - cron: '45 17 * * *' + push: + branches: [ master ] + pull_request: + branches: [ master ] jobs: sycl_v100s_Profiling: diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index a019ab6bc1..f32917d8d8 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -52,7 +52,7 @@ export WORKSPACE=$prefix/workspace_mg4gpu #export NAME_PREFIX="sycl_Xeon-Silver-4216_a100s_cuda-11.6.2_gcc-11.3" # Branch should be an environment variable in the main script and then passed down; if none, it is not displayed in the prefix -export REPORT_FOLDER=$WORKSPACE/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX}_${branch} +REPORT_FOLDER="${WORKSPACE}/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX}_${branch}" # If unknown set at the run step after running LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 1024 128 10 export DEVICE_ID=0 From b0442a95bd145079a93724720360475f6e4999b7 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 6 Dec 2022 14:38:26 +0100 Subject: [PATCH 127/509] Made some changes to SYCL and CUDA CI --- .github/workflows/c-cpp.yml | 10 +----- .github/workflows/sycl.yml | 61 ++----------------------------------- 2 files changed, 4 insertions(+), 67 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 391c32ba23..13cb313c75 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -16,8 +16,6 @@ jobs: steps: - uses: actions/checkout@v2 - - name: make cleanall - run: make -C ${{ matrix.folder }} cleanall - name: make epochX run: make -C ${{ matrix.folder }} debug CPU: @@ -29,8 +27,6 @@ jobs: fail-fast: false steps: - uses: actions/checkout@v2 - - name: make cleanall - run: make -C ${{ matrix.folder }} cleanall - name: make info run: make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info - name: make @@ -48,8 +44,6 @@ jobs: fail-fast: false steps: - uses: actions/checkout@v2 - - name: make cleanall - run: make -C ${{ matrix.folder }} cleanall - name: make info run: make AVX=none OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info - name: make @@ -57,7 +51,7 @@ jobs: - name: make check run: make AVX=none OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check GPU: - runs-on: [self-hosted, linux, a100s] + runs-on: [self-hosted, linux, v100s] env: FC: gfortran REQUIRE_CUDA: 1 @@ -68,8 +62,6 @@ jobs: fail-fast: false steps: - uses: actions/checkout@v2 - - name: make cleanall - run: make -C ${{ matrix.folder }} cleanall - name: make info run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info - name: make diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml index aee54656dd..04eda896a0 100644 --- a/.github/workflows/sycl.yml +++ b/.github/workflows/sycl.yml @@ -7,75 +7,20 @@ on: branches: [ master ] jobs: - debug_builds: - runs-on: ubuntu-latest - env: - SYCLFLAGS: -fsycl - strategy: - matrix: - folder: [ epochX/sycl/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/sycl/gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/sycl/gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/sycl/gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/sycl/gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg ] - fail-fast: false - steps: - - - uses:
actions/checkout@v2 - - name: make cleanall - run: make -C ${{ matrix.folder }} cleanall - - name: make epochX - run: make -C ${{ matrix.folder }} debug - CPU: - runs-on: ubuntu-latest - env: - SYCLFLAGS: -fsycl - strategy: - matrix: - folder: [ epochX/sycl/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/sycl/gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/sycl/gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/sycl/gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/sycl/gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg ] - precision: [ d , f ] - fail-fast: false - steps: - - uses: actions/checkout@v2 - - name: make cleanall - run: make -C ${{ matrix.folder }} cleanall - - name: make info - run: make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info - - name: make - run: make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} - - name: make check - run: make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check - CPU_MAC: - runs-on: macos-latest - env: - FC: gfortran-11 - SYCLFLAGS: -fsycl - strategy: - matrix: - folder: [ epochX/sycl/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/sycl/gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/sycl/gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/sycl/gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/sycl/gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg ] - precision: [ d ] - fail-fast: false - steps: - - uses: actions/checkout@v2 - - name: make cleanall - run: make -C ${{ matrix.folder }} cleanall - - name: make info - run: make AVX=none OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info - - name: make - run: make AVX=none OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} - - name: make check - run: make AVX=none OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check GPU: - runs-on: [self-hosted, linux, a100s] + runs-on: [self-hosted, linux, v100s] env: FC: gfortran REQUIRE_CUDA: 1 SYCLFLAGS: -fsycl + ENABLE_CI_PROFILER: 1 strategy: matrix: - folder: [ epochX/sycl/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/sycl/gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/sycl/gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/sycl/gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/sycl/gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg ] + folder: [ epochX/sycl/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/sycl/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/sycl/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/sycl/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/sycl/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg ] precision: [ d , f ] fail-fast: false steps: - uses: actions/checkout@v2 - - name: make cleanall - run: make -C ${{ matrix.folder }} cleanall - name: make info run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info - name: make From 71a56264859536bf4321b16cf1f2af31779764c1 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 6 Dec 2022 14:38:41 +0100 Subject: [PATCH 128/509] Changed when profiler workflows are scheduled --- .github/workflows/cudaProfiler.yml | 2 +- .github/workflows/syclProfiler.yml | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cudaProfiler.yml b/.github/workflows/cudaProfiler.yml index a138deaec9..f0fea1e98e 100644 --- a/.github/workflows/cudaProfiler.yml +++ b/.github/workflows/cudaProfiler.yml @@ -2,7 +2,7 @@ name: CUDA Profiler on: schedule: - - cron: '45 17 * * *' + - 
cron: '00 00 * * *' jobs: cuda_a100s_Profiling: diff --git a/.github/workflows/syclProfiler.yml index 618f1abc79..15305fdfad 100644 --- a/.github/workflows/syclProfiler.yml +++ b/.github/workflows/syclProfiler.yml @@ -1,10 +1,8 @@ name: SYCL Profiler on: - push: - branches: [ master ] - pull_request: - branches: [ master ] + schedule: + - cron: '00 00 * * *' jobs: sycl_v100s_Profiling: From 495dbd31908c70c16d9ee9b01b2bf4bc1971f246 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 6 Dec 2022 14:47:08 +0100 Subject: [PATCH 129/509] Added correct CXX variable for execution --- .github/workflows/sycl.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml index 04eda896a0..37b65812d4 100644 --- a/.github/workflows/sycl.yml +++ b/.github/workflows/sycl.yml @@ -13,6 +13,7 @@ jobs: FC: gfortran REQUIRE_CUDA: 1 SYCLFLAGS: -fsycl + CXX: /afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++ ENABLE_CI_PROFILER: 1 strategy: matrix: From 19dbc21b325fd53a916bb9967f25f2f9fa1872ed Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 6 Dec 2022 14:54:52 +0100 Subject: [PATCH 130/509] Changed where CXX variable lies in workflow --- .github/workflows/sycl.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml index
7f6856c986..63debfdaf6 100644 --- a/.github/workflows/sycl.yml +++ b/.github/workflows/sycl.yml @@ -22,8 +22,20 @@ jobs: steps: - uses: actions/checkout@v2 - name: make info - run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; export CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info + run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; + CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++; + LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH + export CUDA_HOME=/usr/local/cuda-11.6/; + make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info - name: make - run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; export CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} + run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; + CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++; + LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH + export CUDA_HOME=/usr/local/cuda-11.6/; + make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} - name: make check - run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; export CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check \ No newline at end of file + run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; + CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++; + LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH + export CUDA_HOME=/usr/local/cuda-11.6/; + make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check \ No newline at end of file From 2e31ca8a57a12d04128256909740d07e4d2b6220 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 6 Dec 2022 15:31:45 +0100 Subject: [PATCH 132/509] Added missing semicolon in SYCL workflow --- .github/workflows/sycl.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml index 63debfdaf6..d586f27ff9 100644 --- a/.github/workflows/sycl.yml +++ b/.github/workflows/sycl.yml @@ -24,18 +24,18 @@ jobs: - name: make info run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++; - LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH + LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info - name: make run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++; - LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH + LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} - name: make check run: source 
From 5989c1f6848a413d6d296b46b988e9470137e840 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 6 Dec 2022 15:55:41 +0100
Subject: [PATCH 133/509] Added more to the LD_LIBRARY_PATH

---
 .github/workflows/sycl.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml
index d586f27ff9..719d3a02e2 100644
--- a/.github/workflows/sycl.yml
+++ b/.github/workflows/sycl.yml
@@ -25,17 +25,20 @@ jobs:
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
       - name: make
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
       - name: make check
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check
\ No newline at end of file

From 24c49d97f042070d3eec1fd09c94431aa443dfc5 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 6 Dec 2022 16:01:48 +0100
Subject: [PATCH 134/509] Added full path to new LD_LIBRARY_PATH

---
 .github/workflows/sycl.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml
index 719d3a02e2..4cb4f42b37 100644
--- a/.github/workflows/sycl.yml
+++ b/.github/workflows/sycl.yml
@@ -25,20 +25,20 @@ jobs:
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=${pwd}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
       - name: make
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=${pwd}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
       - name: make check
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=${pwd}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check
\ No newline at end of file

From 3a47fcd85807d0ce9c91a1e6dff8ea79a18cc92d Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 6 Dec 2022 16:20:53 +0100
Subject: [PATCH 135/509] Changed to correct braces

---
 .github/workflows/sycl.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml
index 4cb4f42b37..0f3fed8353 100644
--- a/.github/workflows/sycl.yml
+++ b/.github/workflows/sycl.yml
@@ -25,20 +25,20 @@ jobs:
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=${pwd}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=$(pwd)/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
       - name: make
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=${pwd}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=$(pwd)/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
       - name: make check
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=${pwd}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=$(pwd)/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check
\ No newline at end of file

From ff7b665a5d7e06ee6922b9b121df48e25a8fe073 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 6 Dec 2022 16:28:05 +0100
Subject: [PATCH 136/509] Testing command output

---
 .github/workflows/sycl.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml
index 0f3fed8353..d2cfa016a6 100644
--- a/.github/workflows/sycl.yml
+++ b/.github/workflows/sycl.yml
@@ -25,20 +25,20 @@ jobs:
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=$(pwd)/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=`pwd`/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
       - name: make
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=$(pwd)/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=`pwd`/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
       - name: make check
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=$(pwd)/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=`pwd`/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check
\ No newline at end of file

From 2c285a970c50b396b3d673678fc6b671ca5a929d Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 6 Dec 2022 16:33:08 +0100
Subject: [PATCH 137/509] Added correct workspace variable

---
 .github/workflows/sycl.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml
index d2cfa016a6..3c46c75460 100644
--- a/.github/workflows/sycl.yml
+++ b/.github/workflows/sycl.yml
@@ -25,20 +25,20 @@ jobs:
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=`pwd`/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=${GITHUB_WORKSPACE}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
       - name: make
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=`pwd`/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=${GITHUB_WORKSPACE}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
       - name: make check
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=`pwd`/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=${GITHUB_WORKSPACE}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check
\ No newline at end of file

From 32593aa65ce32f547d78410583b6f966b2954f3c Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 6 Dec 2022 16:37:41 +0100
Subject: [PATCH 138/509] Changed github workspace variable

---
 .github/workflows/sycl.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml
index 3c46c75460..ce53bb7e53 100644
--- a/.github/workflows/sycl.yml
+++ b/.github/workflows/sycl.yml
@@ -25,20 +25,20 @@ jobs:
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=${GITHUB_WORKSPACE}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=$GITHUB_WORKSPACE/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
       - name: make
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=${GITHUB_WORKSPACE}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=$GITHUB_WORKSPACE/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
       - name: make check
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=${GITHUB_WORKSPACE}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=$GITHUB_WORKSPACE/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check
\ No newline at end of file

From ed0cdf6f4ec28b2efc8b1598545c037995486286 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 6 Dec 2022 16:41:56 +0100
Subject: [PATCH 139/509] Changed github workspace parameter again to check if
 it works

---
 .github/workflows/sycl.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml
index ce53bb7e53..4a02b6f70c 100644
--- a/.github/workflows/sycl.yml
+++ b/.github/workflows/sycl.yml
@@ -25,20 +25,20 @@ jobs:
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=$GITHUB_WORKSPACE/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=${{ GITHUB_WORKSPACE }}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
       - name: make
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=$GITHUB_WORKSPACE/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=${{ GITHUB_WORKSPACE }}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
       - name: make check
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=$GITHUB_WORKSPACE/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=${{ GITHUB_WORKSPACE }}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check
\ No newline at end of file

From 6128c7945c40925eea78b36abe5a2fdab8509f08 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 6 Dec 2022 16:49:40 +0100
Subject: [PATCH 140/509] Changed github workspace variable for testing

---
 .github/workflows/sycl.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml
index 4a02b6f70c..8541cabe9e 100644
--- a/.github/workflows/sycl.yml
+++ b/.github/workflows/sycl.yml
@@ -25,20 +25,20 @@ jobs:
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=${{ GITHUB_WORKSPACE }}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
       - name: make
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=${{ GITHUB_WORKSPACE }}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
       - name: make check
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=${{ GITHUB_WORKSPACE }}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check
\ No newline at end of file
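A note on the variants tried in the preceding commits (general GitHub Actions and bash behaviour, not anything specific to this repository): `${{ ... }}` expressions are expanded by the workflow engine before the step runs, and `GITHUB_WORKSPACE` is not a valid context name there, whereas `github.workspace` is; `$GITHUB_WORKSPACE` is an ordinary environment variable that the shell expands at run time. Separately, bash performs no glob expansion on the right-hand side of an assignment, so the `build*/` component stays literal inside LD_LIBRARY_PATH — one reason to drop it, as the next patch does:

    # Illustrative only; the path tail is a hypothetical stand-in for the
    # matrix.folder value. The wildcard survives verbatim in the variable:
    LD_LIBRARY_PATH=$GITHUB_WORKSPACE/some/subprocess/dir/../../lib/build*/:$LD_LIBRARY_PATH
    echo "$LD_LIBRARY_PATH"   # still contains the literal 'build*/'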
From 29393994d0494c09cc0a5357f7734c8f7982e33a Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 6 Dec 2022 16:54:01 +0100
Subject: [PATCH 141/509] Removed unnecessary build folder path

---
 .github/workflows/sycl.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml
index 8541cabe9e..0786d45a58 100644
--- a/.github/workflows/sycl.yml
+++ b/.github/workflows/sycl.yml
@@ -25,20 +25,20 @@ jobs:
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
       - name: make
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
       - name: make check
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
-          LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib/build*/:$LD_LIBRARY_PATH;
+          LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib:$LD_LIBRARY_PATH;
           export CUDA_HOME=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check
\ No newline at end of file

From 257ad01c3ca7a0520c947d501fbf9277c86cc688 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 6 Dec 2022 17:13:38 +0100
Subject: [PATCH 142/509] Added test to ee_mumu.sa Makefile

---
 epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk b/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk
index 395c679825..77f2320c4f 100755
--- a/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk
+++ b/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk
@@ -343,8 +343,8 @@ check: cmpFcheck
 # Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events)
 cmpFcheck: all.$(TAG)
 	@echo
-	@echo "$(BUILDDIR)/check.exe -p 2 32 2"
-	@echo "$(BUILDDIR)/fcheck.exe 2 32 2"
+	@echo "$(BUILDDIR)/check.exe -p 2 32 2 --device_id 0"
+	@echo "$(BUILDDIR)/fcheck.exe 2 32 2 --device_id 0"
 	@me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi
 
 #-------------------------------------------------------------------------------

From 77d95d0cd98c01daa726a24f0780063d10db5d3f Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 6 Dec 2022 17:32:14 +0100
Subject: [PATCH 143/509] Changed variable name of CUDA path

---
 .github/workflows/sycl.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml
index 0786d45a58..c357e04ea8 100644
--- a/.github/workflows/sycl.yml
+++ b/.github/workflows/sycl.yml
@@ -26,19 +26,19 @@ jobs:
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
           LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib:$LD_LIBRARY_PATH;
-          export CUDA_HOME=/usr/local/cuda-11.6/;
+          export CUDA_PATH=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
       - name: make
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
           LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib:$LD_LIBRARY_PATH;
-          export CUDA_HOME=/usr/local/cuda-11.6/;
+          export CUDA_PATH=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
       - name: make check
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
           LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib:$LD_LIBRARY_PATH;
-          export CUDA_HOME=/usr/local/cuda-11.6/;
+          export CUDA_PATH=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check
\ No newline at end of file

From 3090d75f2a60a216425f84fcadefd2d6f83760f7 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 6 Dec 2022 17:33:01 +0100
Subject: [PATCH 144/509] Revert change to ee_mumu.sa Makefile

---
 epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk b/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk
index 77f2320c4f..395c679825 100755
--- a/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk
+++ b/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk
@@ -343,8 +343,8 @@ check: cmpFcheck
 # Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events)
 cmpFcheck: all.$(TAG)
 	@echo
-	@echo "$(BUILDDIR)/check.exe -p 2 32 2 --device_id 0"
-	@echo "$(BUILDDIR)/fcheck.exe 2 32 2 --device_id 0"
+	@echo "$(BUILDDIR)/check.exe -p 2 32 2"
+	@echo "$(BUILDDIR)/fcheck.exe 2 32 2"
 	@me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi
 
 #-------------------------------------------------------------------------------

From 0da2f4ac991fbb2e7647ed51dc82d23ece42fa53 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 6 Dec 2022 17:35:23 +0100
Subject: [PATCH 145/509] Updated SYCL flags

---
 .github/workflows/sycl.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml
index c357e04ea8..1eb1273c49 100644
--- a/.github/workflows/sycl.yml
+++ b/.github/workflows/sycl.yml
@@ -12,7 +12,7 @@ jobs:
     env:
       FC: gfortran
      REQUIRE_CUDA: 1
-      SYCLFLAGS: -fsycl
+      SYCLFLAGS: -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=sm_70' -fgpu-rdc --cuda-path=$CUDA_PATH
       ENABLE_CI_PROFILER: 1
     strategy:
       matrix:

From b5fd330f825453ba2a9219f0fc51308a89208bf1 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 6 Dec 2022 17:43:07 +0100
Subject: [PATCH 146/509] Updated real path to CUDA installation

---
 .github/workflows/sycl.yml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml
index 1eb1273c49..7b086b78bc 100644
--- a/.github/workflows/sycl.yml
+++ b/.github/workflows/sycl.yml
@@ -12,7 +12,7 @@ jobs:
     env:
       FC: gfortran
       REQUIRE_CUDA: 1
-      SYCLFLAGS: -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=sm_70' -fgpu-rdc --cuda-path=$CUDA_PATH
+      SYCLFLAGS: -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=sm_70' -fgpu-rdc --cuda-path=/usr/local/cuda-11.6/
       ENABLE_CI_PROFILER: 1
     strategy:
       matrix:
@@ -26,19 +26,16 @@ jobs:
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
           LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib:$LD_LIBRARY_PATH;
-          export CUDA_PATH=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
       - name: make
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
           LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib:$LD_LIBRARY_PATH;
-          export CUDA_PATH=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
       - name: make check
         run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
           LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib:$LD_LIBRARY_PATH;
-          export CUDA_PATH=/usr/local/cuda-11.6/;
           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check
\ No newline at end of file

From 6fd7075fdceb2e36dba8646483413f3c8098d6ae Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 8 Dec 2022 14:32:01 +0100
Subject: [PATCH 147/509] Added Makefile args argument to build script

---
 tools/profiling/buildCUDAProcess.sh | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index 63ba671f7d..f19a21c06d 100755
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -9,10 +9,11 @@ helpFunction()
    echo -e "\t-t Threads per block"
    echo -e "\t-i Iterations"
    echo -e "\t-r Branch"
+   echo -e "\t-m Makefile arguments"
    exit 1 # Exit script after printing help
 }
 
-while getopts "n:b:t:i:r:" opt
+while getopts "n:b:t:i:r:m:" opt
 do
    case "$opt" in
       n ) MG_PROC="$OPTARG" ;; #process to target
@@ -20,6 +21,7 @@ do
       t ) threadsPerBlock="$OPTARG" ;;
      i ) iterations="$OPTARG" ;;
       r ) branch="$OPTARG" ;;
+      m ) makeArgs="$OPTARG" ;;
       ? ) helpFunction ;; # Print helpFunction in case parameter is non-existent
    esac
 done
@@ -43,8 +45,8 @@ prefix=$(pwd)
 export USEBUILDDIR=1
 export NTPBMAX=1024
 export CXX=/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/bin/g++
-export MG_EXE="./gcheck.exe" #GPU
-#export MG_EXE="./check.exe" #CPU
+#export MG_EXE="./gcheck.exe" #GPU
+export MG_EXE="./check.exe" #CPU
 export CUDA_HOME=/usr/local/cuda-11.6/
 export FC=`which gfortran`
 export WORKSPACE=$prefix/workspace_mg4gpu
@@ -79,7 +81,7 @@ export MG5AMC_CARD_PATH=$MG_PROC_DIR/Cards
 
 # Build executable
 cd $MG_SP_DIR
-make
+make $makeArgs
 
 
 # Run executable
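For reference, a hypothetical invocation of the extended script (all values illustrative; the `-m` string is simply forwarded to make, and the exact format used in CI is not spelled out in this series):

    ./buildCUDAProcess.sh -n gg_ttgg -b 1024 -t 128 -i 10 -r master -m "FPTYPE=d"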
From bbd944b39b966e1da16b2a79a112d64f7fea7c79 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 8 Dec 2022 15:21:56 +0100
Subject: [PATCH 148/509] Quick fix for make script

---
 tools/profiling/buildCUDAProcess.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index f19a21c06d..af0bfeb926 100755
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -85,7 +85,7 @@ make $makeArgs
 
 # Run executable
 
-cd build*
+cd build.${makeArgs}_*
 mkdir -p perf/data/ 2>/dev/null; true
 $MG_EXE -j $blocksPerGrid $threadsPerBlock $iterations
 

From 8605c4f0502fc7bc0ed416e104cff1ac2e8641ec Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 8 Dec 2022 15:23:17 +0100
Subject: [PATCH 149/509] Quick fix for Make script again

---
 tools/profiling/buildCUDAProcess.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index af0bfeb926..027417fdba 100755
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -85,7 +85,7 @@ make $makeArgs
 
 # Run executable
 
-cd build.${makeArgs}_*
+cd build.${makeArgs:3}_*
 mkdir -p perf/data/ 2>/dev/null; true
 $MG_EXE -j $blocksPerGrid $threadsPerBlock $iterations
 

From ccc672b8f61cfa796fcd2c210558da61e00fe9e4 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 8 Dec 2022 16:29:02 +0100
Subject: [PATCH 150/509] Quick fix for build script

---
 tools/profiling/buildCUDAProcess.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index 027417fdba..e4ec7d3ef9 100755
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -85,7 +85,7 @@ make $makeArgs
 
 # Run executable
 
-cd build.${makeArgs:3}_*
+cd build.${makeArgs:3}*
 mkdir -p perf/data/ 2>/dev/null; true
 $MG_EXE -j $blocksPerGrid $threadsPerBlock $iterations
 
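`${makeArgs:3}` in the fixes above is bash substring expansion: it drops the first three characters of $makeArgs, and the trailing `*` then relies on pathname expansion when `cd` runs. A minimal sketch of the mechanism (the value is hypothetical; the real `-m` format is not recorded here):

    makeArgs="-j d"             # hypothetical: three leading characters, then a build tag
    echo "${makeArgs:3}"        # prints: d
    echo build.${makeArgs:3}*   # globs to e.g. build.d_inl0* when such a directory exists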
From 09ec44d0a34f9a0ecc6a6775d383e664804b97b3 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 8 Dec 2022 16:32:55 +0100
Subject: [PATCH 151/509] Added echo to see what's run

---
 tools/profiling/buildCUDAProcess.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index e4ec7d3ef9..04a74efc15 100755
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -89,5 +89,7 @@ cd build.${makeArgs:3}*
 mkdir -p perf/data/ 2>/dev/null; true
 $MG_EXE -j $blocksPerGrid $threadsPerBlock $iterations
 
+echo "${MG_EXE} -j ${blocksPerGrid} ${threadsPerBlock} ${iterations}"
+
 cd perf/data/
 mv 0-perf-test-run0.json ${REPORT_FOLDER}/test_${MG_PROC}_${CUDA_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json
\ No newline at end of file

From 146fd292b4106c6de6c7f9aff7bb97b4226f859d Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 9 Dec 2022 14:38:18 +0100
Subject: [PATCH 152/509] Set device_id to 1 to execute on CPU

---
 tools/profiling/buildSYCLProcess.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index f32917d8d8..9556829ba8 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -55,7 +55,10 @@ export WORKSPACE=$prefix/workspace_mg4gpu
 REPORT_FOLDER="${WORKSPACE}/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX}_${branch}"
 
 # If unknown set at the run step after running LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 1024 128 10
-export DEVICE_ID=0
+# GPU
+#export DEVICE_ID=0
+# CPU
+export DEVICE_ID=1
 
 # Finds correct subprocess
 case $MG_PROC in

From 47c552e769fcd38b60a67b4f96678341b98269e5 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 9 Dec 2022 16:58:55 +0100
Subject: [PATCH 153/509] Switched to GPU compilation

---
 tools/profiling/buildCUDAProcess.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index 04a74efc15..d62f5b7538 100755
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -45,8 +45,8 @@ prefix=$(pwd)
 export USEBUILDDIR=1
 export NTPBMAX=1024
 export CXX=/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/bin/g++
-#export MG_EXE="./gcheck.exe" #GPU
-export MG_EXE="./check.exe" #CPU
+export MG_EXE="./gcheck.exe" #GPU
+#export MG_EXE="./check.exe" #CPU
 export CUDA_HOME=/usr/local/cuda-11.6/
 export FC=`which gfortran`
 export WORKSPACE=$prefix/workspace_mg4gpu
@@ -81,7 +81,7 @@ export MG5AMC_CARD_PATH=$MG_PROC_DIR/Cards
 
 # Build executable
 cd $MG_SP_DIR
-make $makeArgs
+make -j $makeArgs
 
 
 # Run executable

From e3394812ea3445597bc1113abdd3f84eb8aa3423 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 9 Dec 2022 16:59:44 +0100
Subject: [PATCH 154/509] Switched to GPU execution

---
 tools/profiling/buildSYCLProcess.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index 9556829ba8..abd1566734 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -56,9 +56,9 @@ REPORT_FOLDER="${WORKSPACE}/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX}_${branch}"
 
 # If unknown set at the run step after running LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 1024 128 10
 # GPU
-#export DEVICE_ID=0
+export DEVICE_ID=0
 # CPU
-export DEVICE_ID=1
+#export DEVICE_ID=1
 
 # Finds correct subprocess
 case $MG_PROC in
@@ -93,7 +93,7 @@ export MG5AMC_CARD_PATH=$MG_PROC_DIR/Cards
 
 # Build executable
 cd $MG_SP_DIR
-make
+make -j
 
 mv ../../lib/build.d_inl0*/ $MG_LIBS_DIR #2>/dev/null; true
 mv build.d_inl0*/ $MG_EXE_DIR #2>/dev/null; true

From 3a54d06924d3d81560331837b4a2c7fda84bbff4 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 12 Jan 2023 17:05:22 +0100
Subject: [PATCH 155/509] Updated sendData.py script to use the data in the
 file names for tag field names

---
 tools/profiling/sendData.py | 64 ++++++++++++++++++++++---------------
 1 file changed, 39 insertions(+), 25 deletions(-)

diff --git a/tools/profiling/sendData.py b/tools/profiling/sendData.py
index cb2b6d27f4..29ece4f844 100644
--- a/tools/profiling/sendData.py
+++ b/tools/profiling/sendData.py
@@ -11,26 +11,23 @@
 # Parameter defaults
 URL = 'https://dbod-madgraph4gpu-db.cern.ch:8082/api/v2/write?bucket=ProfilerData'
-secret = os.getenv('MADGRAPH4GPU_DB_SECRET')
+secret = 'fV8dKViWTVdnA3Rw*qCeA@MYtZki@q'
 Auth = ['db_user', secret]
-GPU = 'v100s'
 physicsProcesses = ['ee_mumu', 'gg_ttggg', 'gg_ttgg', 'gg_ttg', 'gg_tt']
-absLayer = ['SYCL', 'CUDA', 'sycl', 'cuda']
+absLayers = ['SYCL', 'CUDA']
 branch = 'master'
-GCCVersion = '11.3'
-CUDAVersion = '11.6.2'
 fields = ['EvtsPerSec[MatrixElems] (3)', 'EvtsPerSec[MECalcOnly] (3)']
-reportPath = 'C:\\Users\\jteig\\cernbox\\Documents\\CERN\\reports\\CUDA_v100s_Profiling_16.09_GCC10.3_CUDA11.5_GOLDEN_EPOCHX4'
+reportPath = 'C:\\Users\\jteig\\cernbox\\Documents\\test\\22-12-07_cudacpp_Xeon-Silver-4216_v100s_gcc-11.3_cuda-11.6.2_master'
 
 # Argument parser
 parser = argparse.ArgumentParser(description='A script for sending data from profiler to InfluxDB.')
 
 parser.add_argument('-r', '--reportPath', help="Path for the reports that is being put into the database.", default=reportPath)
 parser.add_argument('-f', '--fields', help="Fields in the JSON to be put into the database.", default=fields)
-parser.add_argument('-g', '--gpu', help="GPU used when profiling.", default=GPU)
-parser.add_argument('--GCCVersion', help="GCC version used when profiling.", default=GCCVersion)
-parser.add_argument('--CUDAVersion', help="CUDA version used when profiling.", default=CUDAVersion)
-parser.add_argument('-a', '--absLayer', help="Abstraction layer used when profiling.", default=absLayer[0], choices=absLayer)
+#parser.add_argument('-g', '--gpu', help="GPU used when profiling.", default=GPU)
+#parser.add_argument('--GCCVersion', help="GCC version used when profiling.", default=GCCVersion)
+#parser.add_argument('--CUDAVersion', help="CUDA version used when profiling.", default=CUDAVersion)
+#parser.add_argument('-a', '--absLayer', help="Abstraction layer used when profiling.", default=absLayer[0], choices=absLayer)
 parser.add_argument('-b', '--branch', help="Branch the profiler data is in.", default=branch)
 parser.add_argument('-p', '--profiler', help="Enable CI profiling defaults.", action=argparse.BooleanOptionalAction)
@@ -51,7 +48,8 @@
         logging.error('Sycl name prefix has not been set!')
         sys.exit(1)
 
-    reportfolder= "workspace_mg4gpu/" + datetime.datetime.now().strftime('%y-%m-%d') + '_' + syclNamePrefix + '_' + branch
+    # Fix the branch detection from the file name here
+    reportfolder= "workspace_mg4gpu/" + datetime.datetime.now().strftime('%y-%m-%d') + '_' + syclNamePrefix + '_' + args.branch
 
     print(reportfolder)
 
     if not os.path.exists(reportfolder):
@@ -65,7 +63,7 @@
         logging.error('Cuda name prefix has not been set!')
         sys.exit(1)
 
-    reportfolder= "workspace_mg4gpu/" + datetime.datetime.now().strftime('%y-%m-%d') + '_' + cudaNamePrefix + '_' + args.branch
+    reportfolder= "workspace_mg4gpu/" + datetime.datetime.now().strftime('%y-%m-%d') + '_' + cudaNamePrefix + '_' + args.branch
 
     if not os.path.exists(reportfolder):
         logging.error('CUDA report path does not exist!')
@@ -86,34 +84,50 @@
 files = [p for sublist in filePath for p in sublist]
 
 for file in files:
-    f = open(file, 'r')
-    fileName = (os.path.basename(file))
+    with open(file, "r") as f:
 
-    for process in physicsProcesses:
-        if process in fileName:
-            physicsProcess = process
-            break
+        fileContents = f.read()
 
-    f = f.read()
+        if fileContents != '':
+            data = json.loads(fileContents)
 
-    if f != '':
-        data = json.loads(f)
+            fileName = (os.path.basename(file))
+
+            for process in physicsProcesses:
+                if process in fileName.lower():
+                    physicsProcess = process
+                    break
+
+            fileNameParts = fileName.split('_')
+
+            CPU = fileNameParts[4]
+
+            GPU = fileNameParts[5]
+
+            for word in absLayers:
+                if word.lower() in fileName.lower():
+                    absLayer = word
+                    break
+
+            GCCVersion = fileNameParts[6].split('-')[1]
+
+            CUDAVersion = fileNameParts[7].split('-')[1]
 
             gridsize = data[0]["NumThreadsPerBlock"] * data[0]["NumBlocksPerGrid"]
 
-            DBdata = f'{physicsProcess},GPU={args.gpu},AbstractionLayer={args.absLayer.upper()},GCCVersion={args.GCCVersion},CUDAVersion={args.CUDAVersion},NumThreadsPerBlock={data[0]["NumThreadsPerBlock"]},NumBlocksPerGrid={data[0]["NumBlocksPerGrid"]},NumIterations={data[0]["NumIterations"]} Gridsize={gridsize}'
+            DBdata = f'{physicsProcess},CPU={CPU},GPU={GPU},AbstractionLayer={absLayer},GCCVersion={GCCVersion},CUDAVersion={CUDAVersion},NumThreadsPerBlock={data[0]["NumThreadsPerBlock"]},NumBlocksPerGrid={data[0]["NumBlocksPerGrid"]},NumIterations={data[0]["NumIterations"]} Gridsize={gridsize}'
 
             for field in fields:
                 value = float(re.findall(r'[\d.]+',data[0][field])[0])
 
-                DBdata = DBdata + ',' + args.absLayer + "_" + field.replace(" ", "_") + '=' + str(value)
-
-            #DBdata = DBdata + ' 1668164400'
+                DBdata = DBdata + ',' + absLayer + "_" + field.replace(" ", "_") + '=' + str(value)
 
             requestInfo = ["curl", "-i", '-XPOST', "-i", URL, "--header", "Authorization: Token "+Auth[0]+":"+Auth[1], "--data-raw", DBdata]
 
             request = subprocess.run(requestInfo, stdout=subprocess.DEVNULL)
+
+    f.close()
 
     if request.returncode != 0:
         print(str(datetime.datetime.now().strftime("%H:%M:%S")) + " Request FAILED! Data: " + DBdata)
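How the tag fields fall out of a report file name (illustrative only; the name follows the test_${MG_PROC}_${CUDA_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json scheme used by the build scripts, and the fixed indices 4-7 assume a two-token process name such as ee_mumu or gg_ttgg):

    fn="test_gg_ttgg_cudacpp_Xeon-Silver-4216_v100s_gcc-11.3_cuda-11.6.2_1024_128_10.json"
    IFS='_' read -ra p <<< "$fn"
    echo "CPU=${p[4]} GPU=${p[5]} GCC=${p[6]#gcc-} CUDA=${p[7]#cuda-}"
    # -> CPU=Xeon-Silver-4216 GPU=v100s GCC=11.3 CUDA=11.6.2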
Data: " + DBdata) From 06cc5f1bdc067e8dc9cad1f36065b15f7e832f92 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Thu, 12 Jan 2023 17:06:09 +0100 Subject: [PATCH 156/509] Removed absLayer option since it gets it from file name now --- .github/workflows/cudaProfiler.yml | 2 +- .github/workflows/syclProfiler.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cudaProfiler.yml b/.github/workflows/cudaProfiler.yml index f0fea1e98e..3f90594a90 100644 --- a/.github/workflows/cudaProfiler.yml +++ b/.github/workflows/cudaProfiler.yml @@ -18,4 +18,4 @@ jobs: - name: Runs CUDA performanceProfiler.py script run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master' - name: Uploads CUDA JSON files to DB - run: cd tools/profiling/; python3 sendData.py --profiler --absLayer "CUDA" --branch master \ No newline at end of file + run: cd tools/profiling/; python3 sendData.py --profiler --branch master \ No newline at end of file diff --git a/.github/workflows/syclProfiler.yml b/.github/workflows/syclProfiler.yml index 15305fdfad..1a702c5eab 100644 --- a/.github/workflows/syclProfiler.yml +++ b/.github/workflows/syclProfiler.yml @@ -18,4 +18,4 @@ jobs: - name: Runs SYCL performanceProfiler.py script run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master' - name: Uploads SYCL JSON files to DB - run: cd tools/profiling/; python3 sendData.py --profiler --absLayer "SYCL" --branch master \ No newline at end of file + run: cd tools/profiling/; python3 sendData.py --profiler --branch master \ No newline at end of file From d87d69f31c9bbc21609174bce6c1439b8c38054e Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Fri, 13 Jan 2023 15:26:14 +0100 Subject: [PATCH 157/509] Reverted some changes in sendData.py --- tools/profiling/sendData.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tools/profiling/sendData.py b/tools/profiling/sendData.py index 29ece4f844..a2cdc2af5a 100644 --- a/tools/profiling/sendData.py +++ b/tools/profiling/sendData.py @@ -27,7 +27,7 @@ #parser.add_argument('-g', '--gpu', help="GPU used when profiling.", default=GPU) #parser.add_argument('--GCCVersion', help="GCC version used when profiling.", default=GCCVersion) #parser.add_argument('--CUDAVersion', help="CUDA version used when profiling.", default=CUDAVersion) -#parser.add_argument('-a', '--absLayer', help="Abstraction layer used when profiling.", default=absLayer[0], choices=absLayer) +parser.add_argument('-a', '--absLayer', help="Abstraction layer used when profiling.", default=absLayers[0]) parser.add_argument('-b', '--branch', help="Branch the profiler data is in.", default=branch) parser.add_argument('-p', '--profiler', help="Enable CI profiling defaults.", action=argparse.BooleanOptionalAction) @@ -105,23 +105,18 @@ GPU = fileNameParts[5] - for word in absLayers: - if word.lower() in fileName.lower(): - absLayer = word - break - GCCVersion = fileNameParts[6].split('-')[1] CUDAVersion = fileNameParts[7].split('-')[1] gridsize = data[0]["NumThreadsPerBlock"] * data[0]["NumBlocksPerGrid"] - DBdata = f'{physicsProcess},CPU={CPU},GPU={GPU},AbstractionLayer={absLayer},GCCVersion={GCCVersion},CUDAVersion={CUDAVersion},NumThreadsPerBlock={data[0]["NumThreadsPerBlock"]},NumBlocksPerGrid={data[0]["NumBlocksPerGrid"]},NumIterations={data[0]["NumIterations"]} Gridsize={gridsize}' + DBdata = 
f'{physicsProcess},CPU={CPU},GPU={GPU},AbstractionLayer={args.absLayer},GCCVersion={GCCVersion},CUDAVersion={CUDAVersion},NumThreadsPerBlock={data[0]["NumThreadsPerBlock"]},NumBlocksPerGrid={data[0]["NumBlocksPerGrid"]},NumIterations={data[0]["NumIterations"]} Gridsize={gridsize}' for field in fields: value = float(re.findall(r'[\d.]+',data[0][field])[0]) - DBdata = DBdata + ',' + absLayer + "_" + field.replace(" ", "_") + '=' + str(value) + DBdata = DBdata + ',' + args.absLayer + "_" + field.replace(" ", "_") + '=' + str(value) requestInfo = ["curl", "-i", '-XPOST', "-i", URL, "--header", "Authorization: Token "+Auth[0]+":"+Auth[1], "--data-raw", DBdata] From b42b813a1b25137a1446e8e8e7768627df1751a1 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Fri, 13 Jan 2023 15:59:42 +0100 Subject: [PATCH 158/509] Reintroduced absLayer option --- .github/workflows/cudaProfiler.yml | 2 +- .github/workflows/syclProfiler.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cudaProfiler.yml b/.github/workflows/cudaProfiler.yml index 3f90594a90..0e1462403f 100644 --- a/.github/workflows/cudaProfiler.yml +++ b/.github/workflows/cudaProfiler.yml @@ -18,4 +18,4 @@ jobs: - name: Runs CUDA performanceProfiler.py script run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master' - name: Uploads CUDA JSON files to DB - run: cd tools/profiling/; python3 sendData.py --profiler --branch master \ No newline at end of file + run: cd tools/profiling/; python3 sendData.py --absLayer CUDA --profiler --branch master \ No newline at end of file diff --git a/.github/workflows/syclProfiler.yml b/.github/workflows/syclProfiler.yml index 1a702c5eab..a6bc705c3a 100644 --- a/.github/workflows/syclProfiler.yml +++ b/.github/workflows/syclProfiler.yml @@ -18,4 +18,4 @@ jobs: - name: Runs SYCL performanceProfiler.py script run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master' - name: Uploads SYCL JSON files to DB - run: cd tools/profiling/; python3 sendData.py --profiler --branch master \ No newline at end of file + run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler --branch master \ No newline at end of file From fee635b2d2d15d466b0a20402a0be52a6b0df651 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 16 Jan 2023 10:57:41 +0100 Subject: [PATCH 159/509] Added workflows for profiling on A100 GPUs --- .github/workflows/cudaProfiler.yml | 4 ++-- .github/workflows/cuda_A100_Profiler | 21 +++++++++++++++++++++ .github/workflows/syclProfiler.yml | 2 +- .github/workflows/sycl_A100_Profiler | 21 +++++++++++++++++++++ 4 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/cuda_A100_Profiler create mode 100644 .github/workflows/sycl_A100_Profiler diff --git a/.github/workflows/cudaProfiler.yml b/.github/workflows/cudaProfiler.yml index 0e1462403f..720e9d7b08 100644 --- a/.github/workflows/cudaProfiler.yml +++ b/.github/workflows/cudaProfiler.yml @@ -1,11 +1,11 @@ -name: CUDA Profiler +name: CUDA V100s Profiler on: schedule: - cron: '00 00 * * *' jobs: - cuda_a100s_Profiling: + cuda_v100s_Profiling: name: CUDA V100S Profiling env: CUDA_NAME_PREFIX: cudacpp_Xeon-Silver-4216_v100s_gcc-11.3_cuda-11.6.2 diff --git a/.github/workflows/cuda_A100_Profiler b/.github/workflows/cuda_A100_Profiler new file mode 100644 index 0000000000..2e065829f1 --- /dev/null +++ 
b/.github/workflows/cuda_A100_Profiler @@ -0,0 +1,21 @@ +name: CUDA A100 Profiler + +on: + schedule: + - cron: '00 00 * * *' + +jobs: + cuda_a100_Profiling: + name: CUDA A100 Profiling + env: + CUDA_NAME_PREFIX: cudacpp_AMD-Epyc-7313_a100_gcc-11.3_cuda-11.6.2 + ENABLE_CI_PROFILER: 1 + + MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} + runs-on: [self-hosted, linux, a100] + steps: + - uses: actions/checkout@v2 + - name: Runs CUDA performanceProfiler.py script + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master' + - name: Uploads CUDA JSON files to DB + run: cd tools/profiling/; python3 sendData.py --absLayer CUDA --profiler --branch master \ No newline at end of file diff --git a/.github/workflows/syclProfiler.yml b/.github/workflows/syclProfiler.yml index a6bc705c3a..93eaedf689 100644 --- a/.github/workflows/syclProfiler.yml +++ b/.github/workflows/syclProfiler.yml @@ -1,4 +1,4 @@ -name: SYCL Profiler +name: SYCL V100S Profiler on: schedule: diff --git a/.github/workflows/sycl_A100_Profiler b/.github/workflows/sycl_A100_Profiler new file mode 100644 index 0000000000..b318874cf8 --- /dev/null +++ b/.github/workflows/sycl_A100_Profiler @@ -0,0 +1,21 @@ +name: SYCL Profiler + +on: + schedule: + - cron: '00 00 * * *' + +jobs: + sycl_A100_Profiling: + name: SYCL A100 Profiling + env: + SYCL_NAME_PREFIX: sycl_AMD-Epyc-7313_a100_gcc-11.3_cuda-11.6.2 + ENABLE_CI_PROFILER: 1 + + MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} + runs-on: [self-hosted, linux, a100] + steps: + - uses: actions/checkout@v2 + - name: Runs SYCL performanceProfiler.py script + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master' + - name: Uploads SYCL JSON files to DB + run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler --branch master \ No newline at end of file From c8e54224a385d571f2a91f65db18bbb6d1d05de3 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 16 Jan 2023 10:58:24 +0100 Subject: [PATCH 160/509] Added test to check if profiling on A100 works --- .github/workflows/cuda_A100_Profiler | 6 ++++-- .github/workflows/sycl_A100_Profiler | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cuda_A100_Profiler b/.github/workflows/cuda_A100_Profiler index 2e065829f1..e913b08238 100644 --- a/.github/workflows/cuda_A100_Profiler +++ b/.github/workflows/cuda_A100_Profiler @@ -1,8 +1,10 @@ name: CUDA A100 Profiler on: - schedule: - - cron: '00 00 * * *' + push: + branches: [ master ] + pull_request: + branches: [ master ] jobs: cuda_a100_Profiling: diff --git a/.github/workflows/sycl_A100_Profiler b/.github/workflows/sycl_A100_Profiler index b318874cf8..7936186639 100644 --- a/.github/workflows/sycl_A100_Profiler +++ b/.github/workflows/sycl_A100_Profiler @@ -1,8 +1,10 @@ name: SYCL Profiler on: - schedule: - - cron: '00 00 * * *' + push: + branches: [ master ] + pull_request: + branches: [ master ] jobs: sycl_A100_Profiling: From c96667a55df32d59aefa44d74f42d88cb26de381 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 16 Jan 2023 11:00:31 +0100 Subject: [PATCH 161/509] Corrected file types so Github recognizes it as a workflow --- .github/workflows/{cuda_A100_Profiler => cuda_A100_Profiler.yml} | 0 .github/workflows/{sycl_A100_Profiler => sycl_A100_Profiler.yml} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename 
.github/workflows/{cuda_A100_Profiler => cuda_A100_Profiler.yml} (100%) rename .github/workflows/{sycl_A100_Profiler => sycl_A100_Profiler.yml} (100%) diff --git a/.github/workflows/cuda_A100_Profiler b/.github/workflows/cuda_A100_Profiler.yml similarity index 100% rename from .github/workflows/cuda_A100_Profiler rename to .github/workflows/cuda_A100_Profiler.yml diff --git a/.github/workflows/sycl_A100_Profiler b/.github/workflows/sycl_A100_Profiler.yml similarity index 100% rename from .github/workflows/sycl_A100_Profiler rename to .github/workflows/sycl_A100_Profiler.yml From d1daf285b83d6cb6196b649827c7516257fbeaa2 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 16 Jan 2023 11:43:21 +0100 Subject: [PATCH 162/509] Updated the CUDA version from 11.6 to 11.8 --- .github/workflows/cudaProfiler.yml | 2 +- .github/workflows/cuda_A100_Profiler.yml | 2 +- .github/workflows/syclProfiler.yml | 2 +- .github/workflows/sycl_A100_Profiler.yml | 4 ++-- tools/profiling/buildCUDAProcess.sh | 2 +- tools/profiling/buildSYCLProcess.sh | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/cudaProfiler.yml b/.github/workflows/cudaProfiler.yml index 720e9d7b08..f0e5b581c6 100644 --- a/.github/workflows/cudaProfiler.yml +++ b/.github/workflows/cudaProfiler.yml @@ -8,7 +8,7 @@ jobs: cuda_v100s_Profiling: name: CUDA V100S Profiling env: - CUDA_NAME_PREFIX: cudacpp_Xeon-Silver-4216_v100s_gcc-11.3_cuda-11.6.2 + CUDA_NAME_PREFIX: cudacpp_Xeon-Silver-4216_v100s_gcc-11.3_cuda-11.8 ENABLE_CI_PROFILER: 1 MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} diff --git a/.github/workflows/cuda_A100_Profiler.yml b/.github/workflows/cuda_A100_Profiler.yml index e913b08238..526a8a8253 100644 --- a/.github/workflows/cuda_A100_Profiler.yml +++ b/.github/workflows/cuda_A100_Profiler.yml @@ -10,7 +10,7 @@ jobs: cuda_a100_Profiling: name: CUDA A100 Profiling env: - CUDA_NAME_PREFIX: cudacpp_AMD-Epyc-7313_a100_gcc-11.3_cuda-11.6.2 + CUDA_NAME_PREFIX: cudacpp_AMD-Epyc-7313_a100_gcc-11.3_cuda-11.8 ENABLE_CI_PROFILER: 1 MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} diff --git a/.github/workflows/syclProfiler.yml b/.github/workflows/syclProfiler.yml index 93eaedf689..4698dc4835 100644 --- a/.github/workflows/syclProfiler.yml +++ b/.github/workflows/syclProfiler.yml @@ -8,7 +8,7 @@ jobs: sycl_v100s_Profiling: name: SYCL V100S Profiling env: - SYCL_NAME_PREFIX: sycl_Xeon-Silver-4216_v100s_gcc-11.3_cuda-11.6.2 + SYCL_NAME_PREFIX: sycl_Xeon-Silver-4216_v100s_gcc-11.3_cuda-11.8 ENABLE_CI_PROFILER: 1 MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} diff --git a/.github/workflows/sycl_A100_Profiler.yml b/.github/workflows/sycl_A100_Profiler.yml index 7936186639..2d2b9310c3 100644 --- a/.github/workflows/sycl_A100_Profiler.yml +++ b/.github/workflows/sycl_A100_Profiler.yml @@ -1,4 +1,4 @@ -name: SYCL Profiler +name: SYCL A100 Profiler on: push: @@ -10,7 +10,7 @@ jobs: sycl_A100_Profiling: name: SYCL A100 Profiling env: - SYCL_NAME_PREFIX: sycl_AMD-Epyc-7313_a100_gcc-11.3_cuda-11.6.2 + SYCL_NAME_PREFIX: sycl_AMD-Epyc-7313_a100_gcc-11.3_cuda-11.9 ENABLE_CI_PROFILER: 1 MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh index d62f5b7538..d62e333fb4 100755 --- a/tools/profiling/buildCUDAProcess.sh +++ b/tools/profiling/buildCUDAProcess.sh @@ -47,7 +47,7 @@ export NTPBMAX=1024 export CXX=/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/bin/g++ export 
MG_EXE="./gcheck.exe" #GPU #export MG_EXE="./check.exe" #CPU -export CUDA_HOME=/usr/local/cuda-11.6/ +export CUDA_HOME=/usr/local/cuda-11.8/ export FC=`which gfortran` export WORKSPACE=$prefix/workspace_mg4gpu #export NAME_PREFIX="cudacpp_v100s_cuda_11.6.2_gcc_11.3" diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index abd1566734..f0e649c6a7 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -45,7 +45,7 @@ export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace export USEBUILDDIR=1 export NTPBMAX=1024 export CXX=$DPCPP_HOME/llvm/build/bin/clang++ -export CUDA_PATH=/usr/local/cuda-11.6 +export CUDA_PATH=/usr/local/cuda-11.8/ export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=sm_70' -fgpu-rdc --cuda-path=$CUDA_PATH" export WORKSPACE=$prefix/workspace_mg4gpu #export NAME_PREFIX="sycl_v100s_cuda_11.6.2_gcc_11.3" From 0d219033dd322c4f097bb8c0079b90a55adb06d2 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 16 Jan 2023 14:25:02 +0100 Subject: [PATCH 163/509] Added a way to automatically detect GPU and assign SM level --- tools/profiling/buildSYCLProcess.sh | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index f0e649c6a7..40ceb99aed 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -35,6 +35,24 @@ fi ################################################################## +# Assign correct SM level for NVIDIA GPUs + +# Check if nvidia-smi command exists +if command -v nvidia-smi > /dev/null 2>&1; then + + # Get the name of the GPU + GPU_NAME=$(lshw -C display | grep -i "product:" | awk -F'[][]' '{print $2}') +else + echo "nvidia-smi non existent on system, Nvidia GPU not present!" 
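An aside on the extraction pipeline introduced above (an illustration, not part of the patch): `awk -F'[][]'` sets the field separator to a character class matching either square bracket, so the bracketed marketing name of the GPU becomes field 2. Assuming lshw prints a line of the usual shape, the one-liner can be sanity-checked like this:

    # Hypothetical lshw output line; $2 is everything between the brackets
    echo '       product: GV100GL [Tesla V100S PCIe 32GB]' | awk -F'[][]' '{print $2}'
    # -> Tesla V100S PCIe 32GB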
From 22b32b45829f85930f63469a6d422662d5cbd2f2 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Mon, 16 Jan 2023 15:43:52 +0100
Subject: [PATCH 164/509] Added correct device IDs for each GPU

---
 tools/profiling/buildSYCLProcess.sh | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index 40ceb99aed..f7a6218ab9 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -47,8 +47,22 @@ else
     exit
 
 case $GPU_NAME in
-    Tesla V100S PCIe 32GB ) export SM_LEVEL="sm_70" ;;
-    A100 PCIe 40GB ) export SM_LEVEL="sm_80" ;;
+    Tesla V100S PCIe 32GB )
+        export SM_LEVEL="sm_70"
+
+        # GPU
+        export DEVICE_ID=0
+        # CPU
+        #export DEVICE_ID=1
+        ;;
+    A100 PCIe 40GB )
+        export SM_LEVEL="sm_80"
+
+        # GPU
+        export DEVICE_ID=2
+        # CPU
+        #export DEVICE_ID=1
+        ;;
 esac

From 6e3173105871e07786186dcdab5e5957d516b7a1 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Mon, 16 Jan 2023 15:54:05 +0100
Subject: [PATCH 165/509] Fixed bug in GPU_NAME switch case

---
 tools/profiling/buildSYCLProcess.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index f7a6218ab9..0ccdfa7b29 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -47,7 +47,7 @@ else
     exit
 
 case $GPU_NAME in
-    Tesla V100S PCIe 32GB )
+    "Tesla V100S PCIe 32GB" )
         export SM_LEVEL="sm_70"
 
         # GPU
@@ -55,7 +55,7 @@ case $GPU_NAME in
         # CPU
         #export DEVICE_ID=1
         ;;
-    A100 PCIe 40GB )
+    "A100 PCIe 40GB" )
         export SM_LEVEL="sm_80"
 
         # GPU

From a4776ff7904b56336b6dbe88c7d499a09706e49c Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Mon, 16 Jan 2023 16:00:10 +0100
Subject: [PATCH 166/509] Fix for another bug in GPU_NAME switch case

---
 tools/profiling/buildSYCLProcess.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index 0ccdfa7b29..da34152816 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -47,7 +47,7 @@ else
     exit
 
 case $GPU_NAME in
-    "Tesla V100S PCIe 32GB" )
+    "Tesla V100S PCIe 32GB")
         export SM_LEVEL="sm_70"
 
         # GPU
@@ -55,7 +55,7 @@ case $GPU_NAME in
         # CPU
         #export DEVICE_ID=1
         ;;
-    "A100 PCIe 40GB" )
+    "A100 PCIe 40GB")
         export SM_LEVEL="sm_80"
 
         # GPU

From 324f2bee75657e786b8f5ac417a1b3f85514b805 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Mon, 16 Jan 2023 16:11:18 +0100
Subject: [PATCH 167/509] Added missing fi at end of if statement

---
 tools/profiling/buildSYCLProcess.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index da34152816..283be2185f 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -45,9 +45,10 @@ if command -v nvidia-smi > /dev/null 2>&1; then
 else
     echo "nvidia-smi non existent on system, Nvidia GPU not present!"
     exit
+fi
 
 case $GPU_NAME in
-    "Tesla V100S PCIe 32GB")
+    "Tesla V100S PCIe 32GB" )
         export SM_LEVEL="sm_70"
 
         # GPU
@@ -55,7 +56,7 @@ case $GPU_NAME in
         # CPU
         #export DEVICE_ID=1
         ;;
-    "A100 PCIe 40GB")
+    "A100 PCIe 40GB" )
         export SM_LEVEL="sm_80"
 
         # GPU

From 9db43d16423ffb56f7649b0790e146a2135fa792 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Mon, 16 Jan 2023 16:41:26 +0100
Subject: [PATCH 168/509] Fixed GPU_NAME switch case to work

---
 tools/profiling/buildSYCLProcess.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index 283be2185f..2f726bf8a8 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -48,7 +48,7 @@ else
 fi
 
 case $GPU_NAME in
-    "Tesla V100S PCIe 32GB" )
+    *V100S* )
         export SM_LEVEL="sm_70"
 
         # GPU
@@ -56,7 +56,7 @@ case $GPU_NAME in
         # CPU
         #export DEVICE_ID=1
         ;;
-    "A100 PCIe 40GB" )
+    *A100* )
         export SM_LEVEL="sm_80"
 
         # GPU

From 9cc153e95870848d9fdea8b4abe16b884de07061 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Mon, 16 Jan 2023 16:48:25 +0100
Subject: [PATCH 169/509] Remove old DEVICE_ID variable

---
 tools/profiling/buildSYCLProcess.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index 2f726bf8a8..90030dc635 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -89,7 +89,7 @@ REPORT_FOLDER="${WORKSPACE}/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX}_${branch}"
 
 # If unknown set at the run step after running LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 1024 128 10
 # GPU
-export DEVICE_ID=0
+#export DEVICE_ID=0
 # CPU
 #export DEVICE_ID=1

From 426556935251e480400f9f96f31bfd5c2658 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Mon, 16 Jan 2023 17:14:57 +0100
Subject: [PATCH 170/509] Added the workflows as CRON jobs again

---
 .github/workflows/cuda_A100_Profiler.yml | 6 ++----
 .github/workflows/sycl_A100_Profiler.yml | 6 ++----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/cuda_A100_Profiler.yml b/.github/workflows/cuda_A100_Profiler.yml
index 526a8a8253..c7c0142b37 100644
--- a/.github/workflows/cuda_A100_Profiler.yml
+++ b/.github/workflows/cuda_A100_Profiler.yml
@@ -1,10 +1,8 @@
 name: CUDA A100 Profiler
 
 on:
-  push:
-    branches: [ master ]
-  pull_request:
-    branches: [ master ]
+  schedule:
+    - cron: '00 00 * * *'
 
 jobs:
   cuda_a100_Profiling:
diff --git a/.github/workflows/sycl_A100_Profiler.yml b/.github/workflows/sycl_A100_Profiler.yml
index 2d2b9310c3..cf9e5bc910 100644
--- a/.github/workflows/sycl_A100_Profiler.yml
+++ b/.github/workflows/sycl_A100_Profiler.yml
@@ -1,10 +1,8 @@
 name: SYCL A100 Profiler
 
 on:
-  push:
-    branches: [ master ]
-  pull_request:
-    branches: [ master ]
+  schedule:
+    - cron: '00 00 * * *'
 
 jobs:
   sycl_A100_Profiling:

From 30a4665478f997c304b4eb9dd6774e27aed40913 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 17 Jan 2023 10:51:57 +0100
Subject: [PATCH 171/509]
Added option for quickly seeing device info

---
 tools/profiling/buildSYCLProcess.sh | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index 90030dc635..0099c38006 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -9,6 +9,7 @@ helpFunction()
 echo -e "\t-t Threads per block"
 echo -e "\t-i Iterations"
 echo -e "\t-r Branch"
+ echo -e "\t-d Flag for seeing device info"
 exit 1 # Exit script after printing help
}
@@ -20,6 +21,7 @@ do
 t ) threadsPerBlock="$OPTARG" ;;
 i ) iterations="$OPTARG" ;;
 r ) branch="$OPTARG" ;;
+ d ) deviceInfoFlag="$OPTARG" ;;
 ? ) helpFunction ;; # Print helpFunction in case parameter is non-existent
 esac
done
@@ -133,11 +135,11 @@ mv build.d_inl0*/ $MG_EXE_DIR #2>/dev/null; true
# Run executable
cd $WORKSPACE
-# Display the devices
-#LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10
-
-# Add MG Libs to linker library path and run the executable
-LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file ${REPORT_FOLDER}/test_${MG_PROC}_${SYCL_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations
-
-# View output
-#nano $REPORT_FOLDER/test_${SYCL_NAME_PREFIX}_${MG_PROC}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json-+
\ No newline at end of file
+if [ -z "$deviceInfoFlag" ]
+then
+ # Add MG Libs to linker library path and run the executable
+ LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file ${REPORT_FOLDER}/test_${MG_PROC}_${SYCL_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations
+else
+ # Display the devices
+ LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10
+fi
\ No newline at end of file

From 7dc4d2253752356b1a2eb75a22cd8d1e9bf4d69f Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 17 Jan 2023 11:08:46 +0100
Subject: [PATCH 172/509] Fixed small bug in getopts function

---
 tools/profiling/buildSYCLProcess.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index 0099c38006..ddd1fc77f1 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -13,7 +13,7 @@ helpFunction()
 exit 1 # Exit script after printing help
}
-while getopts "n:b:t:i:r:" opt
+while getopts "n:b:t:i:r:d:" opt
do
 case "$opt" in
 n ) MG_PROC="$OPTARG" ;; #process to target

From 60668f4e34e9f9ed952bccf10ee59920140c0832 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 17 Jan 2023 11:44:40 +0100
Subject: [PATCH 173/509] Added option to select device ID and reordered code

---
 tools/profiling/buildSYCLProcess.sh | 81 ++++++++++++++---------------
 1 file changed, 40 insertions(+), 41 deletions(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index ddd1fc77f1..2784574076 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -1,5 +1,38 @@
#!/bin/bash

+# Assign correct SM level for NVIDIA GPUs
+
+# Check if nvidia-smi command exists
+if command -v nvidia-smi > /dev/null 2>&1; then
+
+ # Get the name of the GPU
+ GPU_NAME=$(lshw -C display
| grep -i "product:" | awk -F'[][]' '{print $2}') +else + echo "nvidia-smi non existent on system, Nvidia GPU not present!" + exit +fi + +case $GPU_NAME in + *V100S* ) + export SM_LEVEL="sm_70" + + # GPU + export DEVICE_ID=0 + # CPU + #export DEVICE_ID=1 + ;; + *A100* ) + export SM_LEVEL="sm_80" + + # GPU + export DEVICE_ID=2 + # CPU + #export DEVICE_ID=1 + ;; +esac + +################################################################## + helpFunction() { echo "" @@ -9,7 +42,7 @@ helpFunction() echo -e "\t-t Threads per block" echo -e "\t-i Iterations" echo -e "\t-r Branch" - echo -e "\t-d Flag for seeing device info" + echo -e "\t-d Flag for setting device id" exit 1 # Exit script after printing help } @@ -21,7 +54,7 @@ do t ) threadsPerBlock="$OPTARG" ;; i ) iterations="$OPTARG" ;; r ) branch="$OPTARG" ;; - d ) deviceInfoFlag="$OPTARG" ;; + d ) DEVICE_ID="$OPTARG" ;; ? ) helpFunction ;; # Print helpFunction in case parameter is non-existent esac done @@ -33,40 +66,7 @@ then helpFunction fi -# Begin script in case all parameters are correct - -################################################################## - -# Assign correct SM level for NVIDIA GPUs - -# Check if nvidia-smi command exists -if command -v nvidia-smi > /dev/null 2>&1; then - - # Get the name of the GPU - GPU_NAME=$(lshw -C display | grep -i "product:" | awk -F'[][]' '{print $2}') -else - echo "nvidia-smi non existent on system, Nvidia GPU not present!" - exit -fi - -case $GPU_NAME in - *V100S* ) - export SM_LEVEL="sm_70" - - # GPU - export DEVICE_ID=0 - # CPU - #export DEVICE_ID=1 - ;; - *A100* ) - export SM_LEVEL="sm_80" - - # GPU - export DEVICE_ID=2 - # CPU - #export DEVICE_ID=1 - ;; -esac +# Begin script in case all parameters and GPU specific settings are set ################################################################## @@ -135,11 +135,10 @@ mv build.d_inl0*/ $MG_EXE_DIR #2>/dev/null; true # Run executable cd $WORKSPACE -if [ -z "$deviceInfoFlag" ] -then - # Add MG Libs to linker library path and run the executable - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file ${REPORT_FOLDER}/test_${MG_PROC}_${SYCL_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations -else +if [ $DEVICE_ID -eq 0 ]; then # Display the devices LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10 +else + # Add MG Libs to linker library path and run the executable + LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file ${REPORT_FOLDER}/test_${MG_PROC}_${SYCL_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations fi \ No newline at end of file From 0ecea8a9d4e17604c538d6d894caa4eca909fb38 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 17 Jan 2023 16:08:35 +0100 Subject: [PATCH 174/509] Changed the variable needed to display devices on machine --- tools/profiling/buildSYCLProcess.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 2784574076..0f81674c64 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -135,7 +135,7 @@ mv build.d_inl0*/ $MG_EXE_DIR #2>/dev/null; true # Run executable cd $WORKSPACE -if [ $DEVICE_ID -eq 0 ]; then +if [ $DEVICE_ID -eq "info" ]; then 
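# A note on the comparison in the line just rewritten above: inside [ ], -eq
# performs integer comparison and errors out with "integer expression expected"
# once DEVICE_ID holds a string such as "info"; string equality needs = (or the
# bash-only ==), which is what the follow-up fix below switches to. A quick
# standalone illustration:
DEVICE_ID="info"
if [ "$DEVICE_ID" = "info" ]; then    # string comparison: matches
    echo "device info mode"
fi
# [ "$DEVICE_ID" -eq 0 ] here would fail with an error instead of evaluating false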
# Display the devices LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10 else From dde56c75e80d68808a07efa304f0b0153eee369e Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 17 Jan 2023 16:12:01 +0100 Subject: [PATCH 175/509] Changed syntax of device display check --- tools/profiling/buildSYCLProcess.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 0f81674c64..4c517ad7e2 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -135,7 +135,7 @@ mv build.d_inl0*/ $MG_EXE_DIR #2>/dev/null; true # Run executable cd $WORKSPACE -if [ $DEVICE_ID -eq "info" ]; then +if [ $DEVICE_ID == "info" ]; then # Display the devices LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10 else From 888c705ec4b7e8450d32ebbdc4826bfcd6a2a30f Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Wed, 18 Jan 2023 10:44:44 +0100 Subject: [PATCH 176/509] Retrieves GPU name from nvidia-smi instead of lshw --- tools/profiling/buildSYCLProcess.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 4c517ad7e2..8ac6531c1a 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -6,9 +6,9 @@ if command -v nvidia-smi > /dev/null 2>&1; then # Get the name of the GPU - GPU_NAME=$(lshw -C display | grep -i "product:" | awk -F'[][]' '{print $2}') + GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader) else - echo "nvidia-smi non existent on system, Nvidia GPU not present!" + echo "nvidia-smi non existent on system, Nvidia GPU possibly not present!" 
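# The nvidia-smi query this patch switches to, shown standalone: with
# --query-gpu=name and --format=csv,noheader it prints one bare GPU name per
# line. On multi-GPU machines that is several lines, so taking the first one
# is a safe refinement (an assumption here, not something this patch does):
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1)
echo "Detected GPU: $GPU_NAME"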
 exit
fi

From 436d402ca5aba1a8eadd10636572043c267d1c78 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Wed, 18 Jan 2023 10:51:50 +0100
Subject: [PATCH 177/509] Test to see what devices are available on runner

---
 tools/profiling/buildSYCLProcess.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index 8ac6531c1a..a34fb81409 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -135,10 +135,10 @@ mv build.d_inl0*/ $MG_EXE_DIR #2>/dev/null; true
# Run executable
cd $WORKSPACE
-if [ $DEVICE_ID == "info" ]; then
+#if [ $DEVICE_ID == "info" ]; then
 # Display the devices
- LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10
-else
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10
+#else
 # Add MG Libs to linker library path and run the executable
- LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file ${REPORT_FOLDER}/test_${MG_PROC}_${SYCL_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations
-fi
\ No newline at end of file
+# LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file ${REPORT_FOLDER}/test_${MG_PROC}_${SYCL_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations
+#fi
\ No newline at end of file

From c3eb172baccf33f5d488c837e246f5d8cfe9b0f9 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Wed, 18 Jan 2023 10:53:28 +0100
Subject: [PATCH 178/509] Revert changes in last commit

---
 tools/profiling/buildSYCLProcess.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index a34fb81409..8ac6531c1a 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -135,10 +135,10 @@ mv build.d_inl0*/ $MG_EXE_DIR #2>/dev/null; true
# Run executable
cd $WORKSPACE
-#if [ $DEVICE_ID == "info" ]; then
+if [ $DEVICE_ID == "info" ]; then
 # Display the devices
-LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10
-#else
+ LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10
+else
 # Add MG Libs to linker library path and run the executable
-# LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file ${REPORT_FOLDER}/test_${MG_PROC}_${SYCL_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations
-#fi
\ No newline at end of file
+ LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file ${REPORT_FOLDER}/test_${MG_PROC}_${SYCL_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations
+fi
\ No newline at end of file

From bba21487e73aff6dc11c1f149f4157ba8bec4554 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Wed, 18 Jan 2023 10:54:15 +0100
Subject: [PATCH 179/509] Testing workflow

---
 .github/workflows/sycl_A100_Profiler.yml | 6 ++++--
 1 file changed, 4
insertions(+), 2 deletions(-)

diff --git a/.github/workflows/sycl_A100_Profiler.yml b/.github/workflows/sycl_A100_Profiler.yml
index cf9e5bc910..2d2b9310c3 100644
--- a/.github/workflows/sycl_A100_Profiler.yml
+++ b/.github/workflows/sycl_A100_Profiler.yml
@@ -1,8 +1,10 @@
name: SYCL A100 Profiler

on:
- schedule:
- - cron: '00 00 * * *'
+ push:
+ branches: [ master ]
+ pull_request:
+ branches: [ master ]

jobs:
 sycl_A100_Profiling:

From 209337807f01fd6c0801924e1ced8dd0f05f1943 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Wed, 18 Jan 2023 10:56:49 +0100
Subject: [PATCH 180/509] Added tests to see what's happening on runner

---
 tools/profiling/buildSYCLProcess.sh | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index 8ac6531c1a..8b8da46c03 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -26,6 +26,8 @@ case $GPU_NAME in
 # GPU
 export DEVICE_ID=2
+ echo $SM_LEVEL
+ echo $DEVICE_ID
 # CPU
 #export DEVICE_ID=1
 ;;
@@ -70,6 +72,8 @@ fi

##################################################################

+echo $DEVICE_ID
+
# Set variables for later use

# Assumes that this is run from profiling directory in the repo
@@ -135,10 +139,10 @@ mv build.d_inl0*/ $MG_EXE_DIR #2>/dev/null; true
# Run executable
cd $WORKSPACE
-if [ $DEVICE_ID == "info" ]; then
+#if [ $DEVICE_ID == "info" ]; then
 # Display the devices
- LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10
-else
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10
+#else
 # Add MG Libs to linker library path and run the executable
- LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file ${REPORT_FOLDER}/test_${MG_PROC}_${SYCL_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations
-fi
\ No newline at end of file
+# LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file ${REPORT_FOLDER}/test_${MG_PROC}_${SYCL_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations
+#fi
\ No newline at end of file

From 4a785383832d923ae547d012b4dac5ec5211635d Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Wed, 18 Jan 2023 11:12:41 +0100
Subject: [PATCH 181/509] Removing testing info

---
 tools/profiling/buildSYCLProcess.sh | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index 8b8da46c03..c9d450ef9c 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -139,10 +139,11 @@ mv build.d_inl0*/ $MG_EXE_DIR #2>/dev/null; true
# Run executable
cd $WORKSPACE
-#if [ $DEVICE_ID == "info" ]; then
+if [ $DEVICE_ID == "info" ]; then
 # Display the devices
-LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10
-#else
+ LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10
+
+else
 # Add MG Libs to linker library path and run the executable
-# LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file
${REPORT_FOLDER}/test_${MG_PROC}_${SYCL_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations -#fi \ No newline at end of file + LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file ${REPORT_FOLDER}/test_${MG_PROC}_${SYCL_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations +fi \ No newline at end of file From 539a2bb563362e26b151bf5dea1c909d92bc34d7 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Wed, 18 Jan 2023 11:14:28 +0100 Subject: [PATCH 182/509] Added sleep for testing purposes --- tools/profiling/buildSYCLProcess.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index c9d450ef9c..8b6693694c 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -141,6 +141,8 @@ cd $WORKSPACE if [ $DEVICE_ID == "info" ]; then # Display the devices + echo "sleeping" + sleep 5 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10 else From 01ee2d63ad30c6b67fe974f2fcb0c5a294a45d19 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Thu, 19 Jan 2023 15:16:10 +0100 Subject: [PATCH 183/509] Reworked code to work with new llvm installation --- tools/profiling/buildSYCLProcess.sh | 36 +++++++---------------------- 1 file changed, 8 insertions(+), 28 deletions(-) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 8b6693694c..ec2b16da6b 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -7,30 +7,19 @@ if command -v nvidia-smi > /dev/null 2>&1; then # Get the name of the GPU GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader) + + # GPU + export DEVICE_ID=0 + # CPU + #export DEVICE_ID=1 else echo "nvidia-smi non existent on system, Nvidia GPU possibly not present!" 
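# The compact case mapping restored just below relies on shell glob patterns,
# so any product string containing the substring matches; the exact quoted
# full-name patterns used earlier in this series only matched when the reported
# name was identical, which is why they kept needing fixes. A standalone check
# (the GPU names here are illustrative):
for name in "Tesla V100S PCIe 32GB" "NVIDIA A100-PCIE-40GB"; do
    case "$name" in
        *V100S* ) echo "$name -> sm_70" ;;
        *A100*  ) echo "$name -> sm_80" ;;
        *       ) echo "$name -> unknown" ;;
    esac
done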
exit fi case $GPU_NAME in - *V100S* ) - export SM_LEVEL="sm_70" - - # GPU - export DEVICE_ID=0 - # CPU - #export DEVICE_ID=1 - ;; - *A100* ) - export SM_LEVEL="sm_80" - - # GPU - export DEVICE_ID=2 - echo $SM_LEVEL - echo $DEVICE_ID - # CPU - #export DEVICE_ID=1 - ;; + *V100S* ) export SM_LEVEL="sm_70" ;; + *A100* ) export SM_LEVEL="sm_80" ;; esac ################################################################## @@ -83,22 +72,15 @@ prefix=$(pwd) export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace export USEBUILDDIR=1 export NTPBMAX=1024 +#CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++ export CXX=$DPCPP_HOME/llvm/build/bin/clang++ export CUDA_PATH=/usr/local/cuda-11.8/ export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=$SM_LEVEL' -fgpu-rdc --cuda-path=$CUDA_PATH" export WORKSPACE=$prefix/workspace_mg4gpu -#export NAME_PREFIX="sycl_v100s_cuda_11.6.2_gcc_11.3" -#export NAME_PREFIX="sycl_Xeon-Silver-4216_a100s_cuda-11.6.2_gcc-11.3" # Branch should be enviroment variable in main script and then passed down if none then it is not displayed in prefix REPORT_FOLDER="${WORKSPACE}/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX}_${branch}" -# If unknown set at the run step after running LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 1024 128 10 -# GPU -#export DEVICE_ID=0 -# CPU -#export DEVICE_ID=1 - # Finds correct subprocess case $MG_PROC in ee_mumu ) export MG_SUBPROC="P1_Sigma_sm_epem_mupmum" ;; @@ -141,8 +123,6 @@ cd $WORKSPACE if [ $DEVICE_ID == "info" ]; then # Display the devices - echo "sleeping" - sleep 5 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10 else From 7db883aee5012b0a70271c2afc18756d8fa4f3d7 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Fri, 20 Jan 2023 09:41:21 +0100 Subject: [PATCH 184/509] Remove random echo in build script --- tools/profiling/buildSYCLProcess.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index ec2b16da6b..16dce92dd6 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -61,8 +61,6 @@ fi ################################################################## -echo $DEVICE_ID - # Set variables for later use # Assumes that this is run from profiling directory in the repo From 19df1dec90db1073a85a7d846708c010595c2b78 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 23 Jan 2023 00:08:14 +0100 Subject: [PATCH 185/509] Added fix for BooleanOptionalAction not existing prior to Python 3.9 --- .github/workflows/cudaProfiler.yml | 2 +- .github/workflows/cuda_A100_Profiler.yml | 2 +- .github/workflows/syclProfiler.yml | 2 +- .github/workflows/sycl_A100_Profiler.yml | 8 +++----- tools/profiling/sendData.py | 7 +++++-- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/.github/workflows/cudaProfiler.yml b/.github/workflows/cudaProfiler.yml index f0e5b581c6..1bb8581772 100644 --- a/.github/workflows/cudaProfiler.yml +++ b/.github/workflows/cudaProfiler.yml @@ -18,4 +18,4 @@ jobs: - name: Runs CUDA performanceProfiler.py script run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master' - name: Uploads CUDA JSON files to DB - run: cd tools/profiling/; python3 sendData.py --absLayer CUDA --profiler --branch master \ No 
newline at end of file + run: cd tools/profiling/; python3 sendData.py --absLayer CUDA --profiler 1 --branch master \ No newline at end of file diff --git a/.github/workflows/cuda_A100_Profiler.yml b/.github/workflows/cuda_A100_Profiler.yml index c7c0142b37..dbf8be78b8 100644 --- a/.github/workflows/cuda_A100_Profiler.yml +++ b/.github/workflows/cuda_A100_Profiler.yml @@ -18,4 +18,4 @@ jobs: - name: Runs CUDA performanceProfiler.py script run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master' - name: Uploads CUDA JSON files to DB - run: cd tools/profiling/; python3 sendData.py --absLayer CUDA --profiler --branch master \ No newline at end of file + run: cd tools/profiling/; python3 sendData.py --absLayer CUDA --profiler 1 --branch master \ No newline at end of file diff --git a/.github/workflows/syclProfiler.yml b/.github/workflows/syclProfiler.yml index 4698dc4835..eb82669f72 100644 --- a/.github/workflows/syclProfiler.yml +++ b/.github/workflows/syclProfiler.yml @@ -18,4 +18,4 @@ jobs: - name: Runs SYCL performanceProfiler.py script run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master' - name: Uploads SYCL JSON files to DB - run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler --branch master \ No newline at end of file + run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master \ No newline at end of file diff --git a/.github/workflows/sycl_A100_Profiler.yml b/.github/workflows/sycl_A100_Profiler.yml index 2d2b9310c3..07a56d3dab 100644 --- a/.github/workflows/sycl_A100_Profiler.yml +++ b/.github/workflows/sycl_A100_Profiler.yml @@ -1,10 +1,8 @@ name: SYCL A100 Profiler on: - push: - branches: [ master ] - pull_request: - branches: [ master ] + schedule: + - cron: '00 00 * * *' jobs: sycl_A100_Profiling: @@ -20,4 +18,4 @@ jobs: - name: Runs SYCL performanceProfiler.py script run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master' - name: Uploads SYCL JSON files to DB - run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler --branch master \ No newline at end of file + run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master \ No newline at end of file diff --git a/tools/profiling/sendData.py b/tools/profiling/sendData.py index a2cdc2af5a..8377c2a068 100644 --- a/tools/profiling/sendData.py +++ b/tools/profiling/sendData.py @@ -29,7 +29,9 @@ #parser.add_argument('--CUDAVersion', help="CUDA version used when profiling.", default=CUDAVersion) parser.add_argument('-a', '--absLayer', help="Abstraction layer used when profiling.", default=absLayers[0]) parser.add_argument('-b', '--branch', help="Branch the profiler data is in.", default=branch) -parser.add_argument('-p', '--profiler', help="Enable CI profiling defaults.", action=argparse.BooleanOptionalAction) + +# Fix this +parser.add_argument('-p', '--profiler', help="Enable CI profiling defaults.", default='0') args = parser.parse_args() @@ -38,7 +40,8 @@ # if __name__=='__main__': - if args.profiler == True: + # Fix this + if args.profiler == '1': if args.absLayer.upper() == "SYCL": From cb610ba51783d2027a8c6485beed98d20cc6afe1 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Wed, 1 Feb 2023 10:41:37 +0100 Subject: [PATCH 186/509] Fixed a bug in the 
SYCLFLAGS --- tools/profiling/buildSYCLProcess.sh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 16dce92dd6..67ce8f61d0 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -66,14 +66,14 @@ fi # Assumes that this is run from profiling directory in the repo prefix=$(pwd) -#export DPCPP_HOME=/p/project/prpb109/sycl_workspace -export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace +#export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace export USEBUILDDIR=1 export NTPBMAX=1024 -#CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++ -export CXX=$DPCPP_HOME/llvm/build/bin/clang++ +export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++ +#export CXX=$DPCPP_HOME/llvm/build/bin/clang++ export CUDA_PATH=/usr/local/cuda-11.8/ -export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=$SM_LEVEL' -fgpu-rdc --cuda-path=$CUDA_PATH" +#export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=$SM_LEVEL' -fgpu-rdc --cuda-path=$CUDA_PATH" +export SYCLFLAGS="-fsycl -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=$SM_LEVEL" export WORKSPACE=$prefix/workspace_mg4gpu # Branch should be enviroment variable in main script and then passed down if none then it is not displayed in prefix @@ -104,7 +104,8 @@ export MG_PROC_DIR=$prefix/../../epochX/sycl/$MG_PROC export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/$MG_SUBPROC export MG_LIBS_DIR="${MG4GPU_LIB}/build_${MG_PROC}_${SYCL_NAME_PREFIX}" -export MG_LIBS="$DPCPP_HOME/llvm/build/lib:$MG_LIBS_DIR" +#export MG_LIBS="$DPCPP_HOME/llvm/build/lib:$MG_LIBS_DIR" +export MG_LIBS="$MG_LIBS_DIR" export MG_EXE_DIR="${MG4GPU_BIN}/build_${MG_PROC}_${SYCL_NAME_PREFIX}" export MG_EXE="$MG_EXE_DIR/check.exe" From 1ed71499f727eff1f29caa7f3c5ab315195a5c87 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 6 Feb 2023 11:49:47 +0100 Subject: [PATCH 187/509] Updated the structure of the buildSYCLProcess script --- tools/profiling/buildSYCLProcess.sh | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 67ce8f61d0..8655f41883 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -66,15 +66,19 @@ fi # Assumes that this is run from profiling directory in the repo prefix=$(pwd) -#export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace export USEBUILDDIR=1 export NTPBMAX=1024 +export CUDA_PATH=/usr/local/cuda-11.8/ +export WORKSPACE=$prefix/workspace_mg4gpu + +# Compilation using OneAPI Toolkit through CVMFS export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++ +export SYCLFLAGS="-fsycl -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=$SM_LEVEL" + +# Compilation for OneAPI LLVM compiler +#export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace #export CXX=$DPCPP_HOME/llvm/build/bin/clang++ -export CUDA_PATH=/usr/local/cuda-11.8/ #export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=$SM_LEVEL' -fgpu-rdc --cuda-path=$CUDA_PATH" -export SYCLFLAGS="-fsycl -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=$SM_LEVEL" -export WORKSPACE=$prefix/workspace_mg4gpu # Branch should be 
enviroment variable in main script and then passed down if none then it is not displayed in prefix REPORT_FOLDER="${WORKSPACE}/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX}_${branch}" @@ -104,8 +108,12 @@ export MG_PROC_DIR=$prefix/../../epochX/sycl/$MG_PROC export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/$MG_SUBPROC export MG_LIBS_DIR="${MG4GPU_LIB}/build_${MG_PROC}_${SYCL_NAME_PREFIX}" -#export MG_LIBS="$DPCPP_HOME/llvm/build/lib:$MG_LIBS_DIR" -export MG_LIBS="$MG_LIBS_DIR" + +if [[ -z "${DPCPP_HOME}" ]]; then + export MG_LIBS="$MG_LIBS_DIR" +else + export MG_LIBS="$DPCPP_HOME/llvm/build/lib:$MG_LIBS_DIR" +fi export MG_EXE_DIR="${MG4GPU_BIN}/build_${MG_PROC}_${SYCL_NAME_PREFIX}" export MG_EXE="$MG_EXE_DIR/check.exe" From 7594a579aabc1cd9fe417c5af51c412be1cfebeb Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 6 Feb 2023 15:31:27 +0100 Subject: [PATCH 188/509] Fixed typo in SYCL profiling workflow --- .github/workflows/sycl_A100_Profiler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/sycl_A100_Profiler.yml b/.github/workflows/sycl_A100_Profiler.yml index 07a56d3dab..6b391af810 100644 --- a/.github/workflows/sycl_A100_Profiler.yml +++ b/.github/workflows/sycl_A100_Profiler.yml @@ -8,7 +8,7 @@ jobs: sycl_A100_Profiling: name: SYCL A100 Profiling env: - SYCL_NAME_PREFIX: sycl_AMD-Epyc-7313_a100_gcc-11.3_cuda-11.9 + SYCL_NAME_PREFIX: sycl_AMD-Epyc-7313_a100_gcc-11.3_cuda-11.8 ENABLE_CI_PROFILER: 1 MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} From ddaa261186ce1eca02817572ff6607a7d9c725ea Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 20 Feb 2023 21:30:05 +0100 Subject: [PATCH 189/509] Changed compiler --- tools/profiling/buildSYCLProcess.sh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 8655f41883..1a1f6fcfb6 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -72,14 +72,16 @@ export CUDA_PATH=/usr/local/cuda-11.8/ export WORKSPACE=$prefix/workspace_mg4gpu # Compilation using OneAPI Toolkit through CVMFS -export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++ -export SYCLFLAGS="-fsycl -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=$SM_LEVEL" +#export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++ -# Compilation for OneAPI LLVM compiler -#export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace -#export CXX=$DPCPP_HOME/llvm/build/bin/clang++ +#export SYCLFLAGS="-fsycl -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=$SM_LEVEL" #export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=$SM_LEVEL' -fgpu-rdc --cuda-path=$CUDA_PATH" +# Compilation for OneAPI LLVM compiler +export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace +export CXX=$DPCPP_HOME/llvm/build/bin/clang++ +export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=$SM_LEVEL' -fgpu-rdc --cuda-path=$CUDA_PATH" + # Branch should be enviroment variable in main script and then passed down if none then it is not displayed in prefix REPORT_FOLDER="${WORKSPACE}/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX}_${branch}" From 485900b7419754fe3e742aa5a4bc0325f6a79767 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 28 Feb 2023 18:28:17 +0100 Subject: [PATCH 190/509] Changed compiler to oneAPI Toolkit --- 
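A quick sanity check for the toolkit switch this commit makes: confirm that the CVMFS oneAPI clang++ actually resolves on the build host before pointing CXX at it. The path is the one used in the diff below; this is a sketch, not part of the repo's scripts:

    CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++
    if [ -x "$CXX" ]; then
        "$CXX" --version    # should report Intel's clang-based oneAPI compiler
    else
        echo "oneAPI clang++ not found at $CXX (is CVMFS mounted?)" >&2
    fi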
.github/workflows/syclProfiler.yml | 2 +- .github/workflows/sycl_A100_Profiler.yml | 2 +- tools/profiling/buildSYCLProcess.sh | 12 +++++------- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/.github/workflows/syclProfiler.yml b/.github/workflows/syclProfiler.yml index eb82669f72..38a6b1846e 100644 --- a/.github/workflows/syclProfiler.yml +++ b/.github/workflows/syclProfiler.yml @@ -16,6 +16,6 @@ jobs: steps: - uses: actions/checkout@v2 - name: Runs SYCL performanceProfiler.py script - run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master' + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/23-all-setup.sh --include-intel-llvm; python3 performanceProfiler.py -l 'SYCL' -b 'master' - name: Uploads SYCL JSON files to DB run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master \ No newline at end of file diff --git a/.github/workflows/sycl_A100_Profiler.yml b/.github/workflows/sycl_A100_Profiler.yml index 6b391af810..e94dbcc6cb 100644 --- a/.github/workflows/sycl_A100_Profiler.yml +++ b/.github/workflows/sycl_A100_Profiler.yml @@ -16,6 +16,6 @@ jobs: steps: - uses: actions/checkout@v2 - name: Runs SYCL performanceProfiler.py script - run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master' + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/23-all-setup.sh --include-intel-llvm; python3 performanceProfiler.py -l 'SYCL' -b 'master' - name: Uploads SYCL JSON files to DB run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master \ No newline at end of file diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 1a1f6fcfb6..8655f41883 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -72,15 +72,13 @@ export CUDA_PATH=/usr/local/cuda-11.8/ export WORKSPACE=$prefix/workspace_mg4gpu # Compilation using OneAPI Toolkit through CVMFS -#export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++ - -#export SYCLFLAGS="-fsycl -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=$SM_LEVEL" -#export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=$SM_LEVEL' -fgpu-rdc --cuda-path=$CUDA_PATH" +export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++ +export SYCLFLAGS="-fsycl -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=$SM_LEVEL" # Compilation for OneAPI LLVM compiler -export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace -export CXX=$DPCPP_HOME/llvm/build/bin/clang++ -export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=$SM_LEVEL' -fgpu-rdc --cuda-path=$CUDA_PATH" +#export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace +#export CXX=$DPCPP_HOME/llvm/build/bin/clang++ +#export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=$SM_LEVEL' -fgpu-rdc --cuda-path=$CUDA_PATH" # Branch should be enviroment variable in main script and then passed down if none then it is not displayed in prefix 
REPORT_FOLDER="${WORKSPACE}/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX}_${branch}" From 166269e185a533173bdbd98d34b868830a6bf20d Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Wed, 1 Mar 2023 10:02:09 +0100 Subject: [PATCH 191/509] Chenged to LLVM compiler again --- .github/workflows/syclProfiler.yml | 2 +- .github/workflows/sycl_A100_Profiler.yml | 2 +- tools/profiling/buildSYCLProcess.sh | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/syclProfiler.yml b/.github/workflows/syclProfiler.yml index 38a6b1846e..eb82669f72 100644 --- a/.github/workflows/syclProfiler.yml +++ b/.github/workflows/syclProfiler.yml @@ -16,6 +16,6 @@ jobs: steps: - uses: actions/checkout@v2 - name: Runs SYCL performanceProfiler.py script - run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/23-all-setup.sh --include-intel-llvm; python3 performanceProfiler.py -l 'SYCL' -b 'master' + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master' - name: Uploads SYCL JSON files to DB run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master \ No newline at end of file diff --git a/.github/workflows/sycl_A100_Profiler.yml b/.github/workflows/sycl_A100_Profiler.yml index e94dbcc6cb..6b391af810 100644 --- a/.github/workflows/sycl_A100_Profiler.yml +++ b/.github/workflows/sycl_A100_Profiler.yml @@ -16,6 +16,6 @@ jobs: steps: - uses: actions/checkout@v2 - name: Runs SYCL performanceProfiler.py script - run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/23-all-setup.sh --include-intel-llvm; python3 performanceProfiler.py -l 'SYCL' -b 'master' + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master' - name: Uploads SYCL JSON files to DB run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master \ No newline at end of file diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 8655f41883..961d5b9c15 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -72,13 +72,13 @@ export CUDA_PATH=/usr/local/cuda-11.8/ export WORKSPACE=$prefix/workspace_mg4gpu # Compilation using OneAPI Toolkit through CVMFS -export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++ -export SYCLFLAGS="-fsycl -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=$SM_LEVEL" +#export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++ +#export SYCLFLAGS="-fsycl -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=$SM_LEVEL" # Compilation for OneAPI LLVM compiler -#export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace -#export CXX=$DPCPP_HOME/llvm/build/bin/clang++ -#export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=$SM_LEVEL' -fgpu-rdc --cuda-path=$CUDA_PATH" +export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace +export CXX=$DPCPP_HOME/llvm/build/bin/clang++ +export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=$SM_LEVEL' -fgpu-rdc --cuda-path=$CUDA_PATH" # Branch should be 
enviroment variable in main script and then passed down if none then it is not displayed in prefix REPORT_FOLDER="${WORKSPACE}/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX}_${branch}" From 81f7259b86b1dce1799a83cde9336535b39348a6 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Thu, 2 Mar 2023 10:23:32 +0100 Subject: [PATCH 192/509] GCC source script is the real deal now --- .github/workflows/syclProfiler.yml | 2 +- .github/workflows/sycl_A100_Profiler.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/syclProfiler.yml b/.github/workflows/syclProfiler.yml index eb82669f72..e8c5c272c7 100644 --- a/.github/workflows/syclProfiler.yml +++ b/.github/workflows/syclProfiler.yml @@ -16,6 +16,6 @@ jobs: steps: - uses: actions/checkout@v2 - name: Runs SYCL performanceProfiler.py script - run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master' + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master' - name: Uploads SYCL JSON files to DB run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master \ No newline at end of file diff --git a/.github/workflows/sycl_A100_Profiler.yml b/.github/workflows/sycl_A100_Profiler.yml index 6b391af810..6f6e603083 100644 --- a/.github/workflows/sycl_A100_Profiler.yml +++ b/.github/workflows/sycl_A100_Profiler.yml @@ -16,6 +16,6 @@ jobs: steps: - uses: actions/checkout@v2 - name: Runs SYCL performanceProfiler.py script - run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master' + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master' - name: Uploads SYCL JSON files to DB run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master \ No newline at end of file From 7557dc3a8bc7ad176c417b40232fae1028e06455 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Thu, 2 Mar 2023 11:18:53 +0100 Subject: [PATCH 193/509] Added testing on push --- .github/workflows/sycl_A100_Profiler.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/sycl_A100_Profiler.yml b/.github/workflows/sycl_A100_Profiler.yml index 6f6e603083..bda61c18c5 100644 --- a/.github/workflows/sycl_A100_Profiler.yml +++ b/.github/workflows/sycl_A100_Profiler.yml @@ -1,8 +1,10 @@ name: SYCL A100 Profiler on: - schedule: - - cron: '00 00 * * *' + push: + branches: [ master ] + pull_request: + branches: [ master ] jobs: sycl_A100_Profiling: From 90fc1a54cda5d8bfb5e7cbe9184a8e106637b3cb Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Thu, 2 Mar 2023 11:28:57 +0100 Subject: [PATCH 194/509] Updated path for the GCC sft script --- .github/workflows/sycl_A100_Profiler.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/sycl_A100_Profiler.yml b/.github/workflows/sycl_A100_Profiler.yml index bda61c18c5..6f6e603083 100644 --- a/.github/workflows/sycl_A100_Profiler.yml +++ b/.github/workflows/sycl_A100_Profiler.yml @@ -1,10 +1,8 @@ name: SYCL A100 Profiler on: - push: - branches: [ master ] - pull_request: - branches: [ master ] + schedule: + - cron: '00 00 * * *' jobs: sycl_A100_Profiling: From eb42e5b488104694e6475f967880c7f558a5788c Mon Sep 17 00:00:00 2001 From: 
Jooorgen
Date: Thu, 2 Mar 2023 11:30:09 +0100
Subject: [PATCH 195/509] Updated path to GCC script

---
 .github/workflows/sycl_A100_Profiler.yml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/sycl_A100_Profiler.yml b/.github/workflows/sycl_A100_Profiler.yml
index 6f6e603083..24200f15b0 100644
--- a/.github/workflows/sycl_A100_Profiler.yml
+++ b/.github/workflows/sycl_A100_Profiler.yml
@@ -1,8 +1,10 @@
name: SYCL A100 Profiler

on:
- schedule:
- - cron: '00 00 * * *'
+ push:
+ branches: [ master ]
+ pull_request:
+ branches: [ master ]

jobs:
 sycl_A100_Profiling:
@@ -16,6 +18,6 @@ jobs:
 steps:
 - uses: actions/checkout@v2
 - name: Runs SYCL performanceProfiler.py script
- run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master'
+ run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master'
 - name: Uploads SYCL JSON files to DB
 run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master
\ No newline at end of file

From 8a8c00b18986882c6c96b318b8f75220a651546f Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 2 Mar 2023 15:16:53 +0100
Subject: [PATCH 196/509] Changed compiler to CVMFS

---
 .github/workflows/sycl_A100_Profiler.yml | 2 +-
 tools/profiling/buildSYCLProcess.sh | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/sycl_A100_Profiler.yml b/.github/workflows/sycl_A100_Profiler.yml
index 24200f15b0..2508590933 100644
--- a/.github/workflows/sycl_A100_Profiler.yml
+++ b/.github/workflows/sycl_A100_Profiler.yml
@@ -18,6 +18,6 @@ jobs:
 steps:
 - uses: actions/checkout@v2
 - name: Runs SYCL performanceProfiler.py script
- run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master'
+ run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/23-all-setup.sh --include-intel-llvm; python3 performanceProfiler.py -l 'SYCL' -b 'master'
 - name: Uploads SYCL JSON files to DB
 run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master
\ No newline at end of file
diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index 961d5b9c15..8655f41883 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -72,13 +72,13 @@ export CUDA_PATH=/usr/local/cuda-11.8/
 export WORKSPACE=$prefix/workspace_mg4gpu

 # Compilation using OneAPI Toolkit through CVMFS
-#export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++
-#export SYCLFLAGS="-fsycl -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=$SM_LEVEL"
+export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++
+export SYCLFLAGS="-fsycl -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=$SM_LEVEL"

 # Compilation for OneAPI LLVM compiler
-export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace
-export CXX=$DPCPP_HOME/llvm/build/bin/clang++
-export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=$SM_LEVEL' -fgpu-rdc --cuda-path=$CUDA_PATH"
+#export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace
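# The repeated flips between the CVMFS oneAPI toolkit and the local intel/llvm
# build in these commits could be folded into one switch keyed on DPCPP_HOME,
# mirroring the [[ -z "${DPCPP_HOME}" ]] test the script already uses for
# MG_LIBS. A hypothetical consolidation (defaults below are illustrative only):
: "${SM_LEVEL:=sm_80}"
: "${CUDA_PATH:=/usr/local/cuda-11.8/}"
if [ -n "${DPCPP_HOME:-}" ]; then
    # local intel/llvm build
    export CXX="$DPCPP_HOME/llvm/build/bin/clang++"
    export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=$SM_LEVEL' -fgpu-rdc --cuda-path=$CUDA_PATH"
else
    # CVMFS oneAPI toolkit
    export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++
    export SYCLFLAGS="-fsycl -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=$SM_LEVEL"
fi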
+#export CXX=$DPCPP_HOME/llvm/build/bin/clang++ +#export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=$SM_LEVEL' -fgpu-rdc --cuda-path=$CUDA_PATH" # Branch should be enviroment variable in main script and then passed down if none then it is not displayed in prefix REPORT_FOLDER="${WORKSPACE}/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX}_${branch}" From 3c566f943f418e51d6d27fb9e52af1f5e426f861 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Thu, 2 Mar 2023 15:20:36 +0100 Subject: [PATCH 197/509] Changed path to actual file being sourced --- .github/workflows/sycl_A100_Profiler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/sycl_A100_Profiler.yml b/.github/workflows/sycl_A100_Profiler.yml index 2508590933..ee500c4650 100644 --- a/.github/workflows/sycl_A100_Profiler.yml +++ b/.github/workflows/sycl_A100_Profiler.yml @@ -18,6 +18,6 @@ jobs: steps: - uses: actions/checkout@v2 - name: Runs SYCL performanceProfiler.py script - run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/23-all-setup.sh --include-intel-llvm; python3 performanceProfiler.py -l 'SYCL' -b 'master' + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm; python3 performanceProfiler.py -l 'SYCL' -b 'master' - name: Uploads SYCL JSON files to DB run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master \ No newline at end of file From 05d1d47ef5437678461b0c8c5f571d941f957357 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Fri, 3 Mar 2023 13:10:48 +0100 Subject: [PATCH 198/509] Added option to set GCC toolchain to sycl_src.mk --- epochX/sycl/ee_mumu.sa/src/sycl_src.mk | 6 ++++++ epochX/sycl/gg_tt.sa/src/sycl_src.mk | 6 ++++++ epochX/sycl/gg_ttg.sa/src/sycl_src.mk | 6 ++++++ epochX/sycl/gg_ttgg.sa/src/sycl_src.mk | 6 ++++++ epochX/sycl/gg_ttggg.sa/src/sycl_src.mk | 6 ++++++ 5 files changed, 30 insertions(+) diff --git a/epochX/sycl/ee_mumu.sa/src/sycl_src.mk b/epochX/sycl/ee_mumu.sa/src/sycl_src.mk index 504c2d4dd8..1e9c1bebca 100644 --- a/epochX/sycl/ee_mumu.sa/src/sycl_src.mk +++ b/epochX/sycl/ee_mumu.sa/src/sycl_src.mk @@ -26,6 +26,12 @@ endif #=== Set the SYCL compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD +# Add option to enable CI profiler use +$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) +ifeq ($(ENABLE_CI_PROFILER),1) + CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE diff --git a/epochX/sycl/gg_tt.sa/src/sycl_src.mk b/epochX/sycl/gg_tt.sa/src/sycl_src.mk index 504c2d4dd8..1e9c1bebca 100644 --- a/epochX/sycl/gg_tt.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_tt.sa/src/sycl_src.mk @@ -26,6 +26,12 @@ endif #=== Set the SYCL compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD +# Add option to enable CI profiler use +$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) +ifeq ($(ENABLE_CI_PROFILER),1) + CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") ifeq ($(FPTYPE),d) CXXFLAGS += 
-DMGONGPU_FPTYPE_DOUBLE diff --git a/epochX/sycl/gg_ttg.sa/src/sycl_src.mk b/epochX/sycl/gg_ttg.sa/src/sycl_src.mk index 504c2d4dd8..1e9c1bebca 100644 --- a/epochX/sycl/gg_ttg.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_ttg.sa/src/sycl_src.mk @@ -26,6 +26,12 @@ endif #=== Set the SYCL compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD +# Add option to enable CI profiler use +$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) +ifeq ($(ENABLE_CI_PROFILER),1) + CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE diff --git a/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk b/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk index 504c2d4dd8..1e9c1bebca 100644 --- a/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk @@ -26,6 +26,12 @@ endif #=== Set the SYCL compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD +# Add option to enable CI profiler use +$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) +ifeq ($(ENABLE_CI_PROFILER),1) + CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE diff --git a/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk b/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk index 504c2d4dd8..1e9c1bebca 100644 --- a/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk @@ -26,6 +26,12 @@ endif #=== Set the SYCL compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD +# Add option to enable CI profiler use +$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) +ifeq ($(ENABLE_CI_PROFILER),1) + CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE From b87235f0a4c1a71148f7eb2d19df9a875ce0432e Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 7 Mar 2023 14:54:48 +0100 Subject: [PATCH 199/509] Updated GCC path in CUDA A100 workflow --- .github/workflows/cuda_A100_Profiler.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cuda_A100_Profiler.yml b/.github/workflows/cuda_A100_Profiler.yml index dbf8be78b8..fb73508669 100644 --- a/.github/workflows/cuda_A100_Profiler.yml +++ b/.github/workflows/cuda_A100_Profiler.yml @@ -1,8 +1,10 @@ name: CUDA A100 Profiler on: - schedule: - - cron: '00 00 * * *' + push: + branches: [ master ] + pull_request: + branches: [ master ] jobs: cuda_a100_Profiling: @@ -16,6 +18,6 @@ jobs: steps: - uses: actions/checkout@v2 - name: Runs CUDA performanceProfiler.py script - run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master' + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master' - name: Uploads CUDA JSON files to DB run: cd tools/profiling/; python3 sendData.py --absLayer CUDA --profiler 1 --branch master \ No newline at end of file From 864d3ea3f9d0590de73121d4413afd38308dc72e Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 7 Mar 2023 14:57:00 +0100 Subject: [PATCH 
From 864d3ea3f9d0590de73121d4413afd38308dc72e Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 7 Mar 2023 14:57:00 +0100
Subject: [PATCH 200/509] Updated CUDA path for container test

---
 tools/profiling/buildCUDAProcess.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index d62e333fb4..28dd44d5e4 100755
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -47,7 +47,7 @@ export NTPBMAX=1024
 export CXX=/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/bin/g++
 export MG_EXE="./gcheck.exe" #GPU
 #export MG_EXE="./check.exe" #CPU
-export CUDA_HOME=/usr/local/cuda-11.8/
+export CUDA_HOME=/usr/local/cuda/
 export FC=`which gfortran`
 export WORKSPACE=$prefix/workspace_mg4gpu
 #export NAME_PREFIX="cudacpp_v100s_cuda_11.6.2_gcc_11.3"

From b295e18ec561d7ecaf35897a6ef52566688eacad Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Mon, 20 Mar 2023 11:36:40 +0100
Subject: [PATCH 201/509] Changed name of Makefiles to fix issue

---
 epochX/sycl/ee_mumu.sa/src/{sycl_src.mk => Makefile}  | 0
 epochX/sycl/gg_tt.sa/src/{sycl_src.mk => Makefile}    | 0
 epochX/sycl/gg_ttg.sa/src/{sycl_src.mk => Makefile}   | 0
 epochX/sycl/gg_ttgg.sa/src/{sycl_src.mk => Makefile}  | 0
 epochX/sycl/gg_ttggg.sa/src/{sycl_src.mk => Makefile} | 0
 5 files changed, 0 insertions(+), 0 deletions(-)
 rename epochX/sycl/ee_mumu.sa/src/{sycl_src.mk => Makefile} (100%)
 rename epochX/sycl/gg_tt.sa/src/{sycl_src.mk => Makefile} (100%)
 rename epochX/sycl/gg_ttg.sa/src/{sycl_src.mk => Makefile} (100%)
 rename epochX/sycl/gg_ttgg.sa/src/{sycl_src.mk => Makefile} (100%)
 rename epochX/sycl/gg_ttggg.sa/src/{sycl_src.mk => Makefile} (100%)

diff --git a/epochX/sycl/ee_mumu.sa/src/sycl_src.mk b/epochX/sycl/ee_mumu.sa/src/Makefile
similarity index 100%
rename from epochX/sycl/ee_mumu.sa/src/sycl_src.mk
rename to epochX/sycl/ee_mumu.sa/src/Makefile
diff --git a/epochX/sycl/gg_tt.sa/src/sycl_src.mk b/epochX/sycl/gg_tt.sa/src/Makefile
similarity index 100%
rename from epochX/sycl/gg_tt.sa/src/sycl_src.mk
rename to epochX/sycl/gg_tt.sa/src/Makefile
diff --git a/epochX/sycl/gg_ttg.sa/src/sycl_src.mk b/epochX/sycl/gg_ttg.sa/src/Makefile
similarity index 100%
rename from epochX/sycl/gg_ttg.sa/src/sycl_src.mk
rename to epochX/sycl/gg_ttg.sa/src/Makefile
diff --git a/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk b/epochX/sycl/gg_ttgg.sa/src/Makefile
similarity index 100%
rename from epochX/sycl/gg_ttgg.sa/src/sycl_src.mk
rename to epochX/sycl/gg_ttgg.sa/src/Makefile
diff --git a/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk b/epochX/sycl/gg_ttggg.sa/src/Makefile
similarity index 100%
rename from epochX/sycl/gg_ttggg.sa/src/sycl_src.mk
rename to epochX/sycl/gg_ttggg.sa/src/Makefile

From a58e6bb94808dbe03a05900c6a7f63fa06fa7126 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Mon, 20 Mar 2023 13:56:34 +0100
Subject: [PATCH 202/509] Reverted change to makefiles

---
 epochX/sycl/ee_mumu.sa/src/{Makefile => sycl_src.mk}  | 0
 epochX/sycl/gg_tt.sa/src/{Makefile => sycl_src.mk}    | 0
 epochX/sycl/gg_ttg.sa/src/{Makefile => sycl_src.mk}   | 0
 epochX/sycl/gg_ttgg.sa/src/{Makefile => sycl_src.mk}  | 0
 epochX/sycl/gg_ttggg.sa/src/{Makefile => sycl_src.mk} | 0
 5 files changed, 0 insertions(+), 0 deletions(-)
 rename epochX/sycl/ee_mumu.sa/src/{Makefile => sycl_src.mk} (100%)
 rename epochX/sycl/gg_tt.sa/src/{Makefile => sycl_src.mk} (100%)
 rename epochX/sycl/gg_ttg.sa/src/{Makefile => sycl_src.mk} (100%)
 rename epochX/sycl/gg_ttgg.sa/src/{Makefile => sycl_src.mk} (100%)
 rename epochX/sycl/gg_ttggg.sa/src/{Makefile => sycl_src.mk} (100%)

diff --git a/epochX/sycl/ee_mumu.sa/src/Makefile
b/epochX/sycl/ee_mumu.sa/src/sycl_src.mk similarity index 100% rename from epochX/sycl/ee_mumu.sa/src/Makefile rename to epochX/sycl/ee_mumu.sa/src/sycl_src.mk diff --git a/epochX/sycl/gg_tt.sa/src/Makefile b/epochX/sycl/gg_tt.sa/src/sycl_src.mk similarity index 100% rename from epochX/sycl/gg_tt.sa/src/Makefile rename to epochX/sycl/gg_tt.sa/src/sycl_src.mk diff --git a/epochX/sycl/gg_ttg.sa/src/Makefile b/epochX/sycl/gg_ttg.sa/src/sycl_src.mk similarity index 100% rename from epochX/sycl/gg_ttg.sa/src/Makefile rename to epochX/sycl/gg_ttg.sa/src/sycl_src.mk diff --git a/epochX/sycl/gg_ttgg.sa/src/Makefile b/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk similarity index 100% rename from epochX/sycl/gg_ttgg.sa/src/Makefile rename to epochX/sycl/gg_ttgg.sa/src/sycl_src.mk diff --git a/epochX/sycl/gg_ttggg.sa/src/Makefile b/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk similarity index 100% rename from epochX/sycl/gg_ttggg.sa/src/Makefile rename to epochX/sycl/gg_ttggg.sa/src/sycl_src.mk From c90c63cc49021fc3d2dcf5d6f341c26afc829f1d Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 20 Mar 2023 14:11:36 +0100 Subject: [PATCH 203/509] Added which file to make clean in make script --- epochX/sycl/ee_mumu.sa/src/sycl_src.mk | 2 +- epochX/sycl/gg_tt.sa/src/sycl_src.mk | 2 +- epochX/sycl/gg_ttg.sa/src/sycl_src.mk | 2 +- epochX/sycl/gg_ttgg.sa/src/sycl_src.mk | 2 +- epochX/sycl/gg_ttggg.sa/src/sycl_src.mk | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/epochX/sycl/ee_mumu.sa/src/sycl_src.mk b/epochX/sycl/ee_mumu.sa/src/sycl_src.mk index 1e9c1bebca..f7ed33af40 100644 --- a/epochX/sycl/ee_mumu.sa/src/sycl_src.mk +++ b/epochX/sycl/ee_mumu.sa/src/sycl_src.mk @@ -156,7 +156,7 @@ endif cleanall: @echo - make clean + make -f sycl_src.mk clean @echo rm -rf build.* diff --git a/epochX/sycl/gg_tt.sa/src/sycl_src.mk b/epochX/sycl/gg_tt.sa/src/sycl_src.mk index 1e9c1bebca..f7ed33af40 100644 --- a/epochX/sycl/gg_tt.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_tt.sa/src/sycl_src.mk @@ -156,7 +156,7 @@ endif cleanall: @echo - make clean + make -f sycl_src.mk clean @echo rm -rf build.* diff --git a/epochX/sycl/gg_ttg.sa/src/sycl_src.mk b/epochX/sycl/gg_ttg.sa/src/sycl_src.mk index 1e9c1bebca..f7ed33af40 100644 --- a/epochX/sycl/gg_ttg.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_ttg.sa/src/sycl_src.mk @@ -156,7 +156,7 @@ endif cleanall: @echo - make clean + make -f sycl_src.mk clean @echo rm -rf build.* diff --git a/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk b/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk index 1e9c1bebca..f7ed33af40 100644 --- a/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk @@ -156,7 +156,7 @@ endif cleanall: @echo - make clean + make -f sycl_src.mk clean @echo rm -rf build.* diff --git a/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk b/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk index 1e9c1bebca..f7ed33af40 100644 --- a/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk @@ -156,7 +156,7 @@ endif cleanall: @echo - make clean + make -f sycl_src.mk clean @echo rm -rf build.* From fc89495083d9268163fc000786c1355b324644cc Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 20 Mar 2023 17:03:56 +0100 Subject: [PATCH 204/509] Updated DEVICE_ID to work with oneAPI Toolkit --- tools/profiling/buildSYCLProcess.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 8655f41883..7a6df118dc 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ 
b/tools/profiling/buildSYCLProcess.sh
@@ -8,8 +8,8 @@
 if command -v nvidia-smi > /dev/null 2>&1; then
     # Get the name of the GPU
     GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader)
 
-    # GPU
-    export DEVICE_ID=0
+    # GPU (DEVICE_ID=2 for oneAPI toolkit runs on GPUs, else DEVICE_ID=0)
+    export DEVICE_ID=2
     # CPU
     #export DEVICE_ID=1
 else
@@ -75,6 +75,10 @@ export WORKSPACE=$prefix/workspace_mg4gpu
 export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++
 export SYCLFLAGS="-fsycl -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=$SM_LEVEL"
 
+# Gets no errors with this:
+export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend \"--cuda-gpu-arch=$SM_LEVEL\""
+
+
 # Compilation for OneAPI LLVM compiler
 #export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace
 #export CXX=$DPCPP_HOME/llvm/build/bin/clang++

From 0a4ffa606b710bd204ef6e11a866dd2689d2ec36 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 21 Mar 2023 09:46:29 +0100
Subject: [PATCH 205/509] Removed unnecessary SYCLFLAGS line

---
 tools/profiling/buildSYCLProcess.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index 7a6df118dc..103e781435 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -73,7 +73,7 @@ export WORKSPACE=$prefix/workspace_mg4gpu
 
 # Compilation using OneAPI Toolkit through CVMFS
 export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++
-export SYCLFLAGS="-fsycl -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=$SM_LEVEL"
+#export SYCLFLAGS="-fsycl -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=$SM_LEVEL"
 
 # Gets no errors with this:
 export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend \"--cuda-gpu-arch=$SM_LEVEL\""

From f83b185d2e4809812638af10111a5048654d3d6c Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 21 Mar 2023 12:03:09 +0100
Subject: [PATCH 206/509] Updated CUDA version to 12

---
 .github/workflows/cudaProfiler.yml       | 2 +-
 .github/workflows/cuda_A100_Profiler.yml | 2 +-
 .github/workflows/syclProfiler.yml       | 2 +-
 .github/workflows/sycl_A100_Profiler.yml | 2 +-
 tools/profiling/buildSYCLProcess.sh      | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/cudaProfiler.yml b/.github/workflows/cudaProfiler.yml
index 1bb8581772..339594b1d2 100644
--- a/.github/workflows/cudaProfiler.yml
+++ b/.github/workflows/cudaProfiler.yml
@@ -8,7 +8,7 @@ jobs:
   cuda_v100s_Profiling:
     name: CUDA V100S Profiling
     env:
-      CUDA_NAME_PREFIX: cudacpp_Xeon-Silver-4216_v100s_gcc-11.3_cuda-11.8
+      CUDA_NAME_PREFIX: cudacpp_Xeon-Silver-4216_v100s_gcc-11.3_cuda-12.0.1
       ENABLE_CI_PROFILER: 1
 
       MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }}
diff --git a/.github/workflows/cuda_A100_Profiler.yml b/.github/workflows/cuda_A100_Profiler.yml
index fb73508669..ad83047f17 100644
--- a/.github/workflows/cuda_A100_Profiler.yml
+++ b/.github/workflows/cuda_A100_Profiler.yml
@@ -10,7 +10,7 @@ jobs:
   cuda_a100_Profiling:
     name: CUDA A100 Profiling
     env:
-      CUDA_NAME_PREFIX: cudacpp_AMD-Epyc-7313_a100_gcc-11.3_cuda-11.8
+      CUDA_NAME_PREFIX: cudacpp_AMD-Epyc-7313_a100_gcc-11.3_cuda-12.0.1
       ENABLE_CI_PROFILER: 1
 
       MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }}
diff --git a/.github/workflows/syclProfiler.yml b/.github/workflows/syclProfiler.yml
index e8c5c272c7..482003776e 100644
--- a/.github/workflows/syclProfiler.yml
+++ b/.github/workflows/syclProfiler.yml
@@ -8,7 +8,7 @@ jobs:
   sycl_v100s_Profiling:
     name: SYCL V100S Profiling
     env:
-      SYCL_NAME_PREFIX: sycl_Xeon-Silver-4216_v100s_gcc-11.3_cuda-11.8
+      SYCL_NAME_PREFIX: sycl_Xeon-Silver-4216_v100s_gcc-11.3_cuda-12.0.1
       ENABLE_CI_PROFILER: 1
 
       MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }}
diff --git a/.github/workflows/sycl_A100_Profiler.yml b/.github/workflows/sycl_A100_Profiler.yml
index ee500c4650..2df1de4080 100644
--- a/.github/workflows/sycl_A100_Profiler.yml
+++ b/.github/workflows/sycl_A100_Profiler.yml
@@ -10,7 +10,7 @@ jobs:
   sycl_A100_Profiling:
     name: SYCL A100 Profiling
     env:
-      SYCL_NAME_PREFIX: sycl_AMD-Epyc-7313_a100_gcc-11.3_cuda-11.8
+      SYCL_NAME_PREFIX: sycl_AMD-Epyc-7313_a100_gcc-11.3_cuda-12.0.1
       ENABLE_CI_PROFILER: 1
 
       MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }}
diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index 103e781435..ba2ebc75d8 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -68,7 +68,7 @@ prefix=$(pwd)
 export USEBUILDDIR=1
 export NTPBMAX=1024
 
-export CUDA_PATH=/usr/local/cuda-11.8/
+export CUDA_PATH=/usr/local/cuda-12.0/
 export WORKSPACE=$prefix/workspace_mg4gpu
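The DEVICE_ID switch flipped to 2 a few patches back reflects how the oneAPI runtime enumerates devices on these nodes: the toolkit exposes more platforms than the plain intel/llvm build did, so the CUDA GPU no longer sits at index 0. A quick way to verify the index on a given machine, as a sketch only; sycl-ls ships with the oneAPI toolkit, and the GPU's position in its listing is machine-dependent:

    source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm
    sycl-ls               # lists every platform/device the SYCL runtime can see
    export DEVICE_ID=2    # position of the CUDA GPU in that list on these runners;
                          # 0 when building with the plain intel/llvm compiler instead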
From 73afbf5bd4f412a64ceb18720b543ec5271e550 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 21 Mar 2023 13:43:16 +0100
Subject: [PATCH 207/509] Cleaned the SYCLFLAGS

---
 tools/profiling/buildSYCLProcess.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index ba2ebc75d8..28d8bb3123 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -76,8 +76,7 @@ export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/202
 #export SYCLFLAGS="-fsycl -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=$SM_LEVEL"
 
 # Gets no errors with this:
-export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend \"--cuda-gpu-arch=$SM_LEVEL\""
-
+export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=$SM_LEVEL"
 
 # Compilation for OneAPI LLVM compiler
 #export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace

From 1499601734ccb2bc1e2a054997e59fe55abab8a0 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 21 Mar 2023 17:03:29 +0100
Subject: [PATCH 208/509] Added fix for the nvvm-reflect-ftz error

---
 tools/profiling/buildSYCLProcess.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index 28d8bb3123..333e339637 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -76,7 +76,7 @@ export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/202
 #export SYCLFLAGS="-fsycl -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=$SM_LEVEL"
 
 # Gets no errors with this:
-export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=$SM_LEVEL"
+export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=$SM_LEVEL -Xclang -fdenormal-fp-math=ieee"
 
 # Compilation for OneAPI LLVM compiler
 #export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace

From 103cc8fac0adcbe7f2a043769165d1ee52345d9d Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 21 Mar 2023 17:06:53 +0100
Subject:
[PATCH 209/509] Refactored some code in the performanceProfiler.py script and removed output from build scripts --- tools/profiling/performanceProfiler.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py index 96eb008caa..eece9f7f37 100644 --- a/tools/profiling/performanceProfiler.py +++ b/tools/profiling/performanceProfiler.py @@ -26,7 +26,7 @@ parser.add_argument("-l", help="Choose which abstraction layer you want to use (CUDA/SYCL).", default=absLayer) parser.add_argument("-b", help="Choose which branch the madgraph4gpu repo is in.", default=branch) -#Add profiler option in python and build scripts so that correct gcc toolchain can be set through makefile and still not disturb the compilation on Github machines +# Add profiler option in python and build scripts so that correct gcc toolchain can be set through makefile and still not disturb the compilation on Github machines pyArgs = parser.parse_args() @@ -36,7 +36,7 @@ for process in mgProcesses: for TPB in threadsPerBlock: for BPG in blocksPerGrid: - if (TPB * BPG > doublePrecisionConstant): + if TPB * BPG > doublePrecisionConstant: if pyArgs.l.upper() == 'SYCL': @@ -51,7 +51,7 @@ #if pyArgs.b != 'br_golden_epochX4': if ".sa" not in process: process = process + ".sa" - + bashArgs = ["./buildCUDAProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG), "-r", str(pyArgs.b).lower()] #if len(pyArgs.b) > 0: @@ -61,12 +61,14 @@ else: sys.exit("No abstraction layer matching the supplied string!") print(str(datetime.datetime.now().strftime("%H:%M:%S")) + " Started " + process + " with TPB("+ str(TPB) +") * BPG("+ str(BPG) +"): " + str(TPB * BPG) + "!") - - build = subprocess.run(bashArgs)#, stdout=subprocess.DEVNULL) + + time = str(datetime.datetime.now().strftime("%H:%M:%S")) + + build = subprocess.run(bashArgs, check=True, stdout=subprocess.DEVNULL) if build.returncode != 0: - print(str(datetime.datetime.now().strftime("%H:%M:%S")) + " " + process + " FAILED!, threadsPerBlock: " + str(TPB) + ", blocksPerGrid: " + str(BPG) + ", Product: " + str(TPB * BPG)) + print(time + " " + process + " FAILED!, threadsPerBlock: " + str(TPB) + ", blocksPerGrid: " + str(BPG) + ", Product: " + str(TPB * BPG)) else: - print(str(datetime.datetime.now().strftime("%H:%M:%S")) + " " + process + " COMPLETED!, threadsPerBlock: " + str(TPB) + ", blocksPerGrid: " + str(BPG) + ", Product: " + str(TPB * BPG)) + print(time + " " + process + " COMPLETED!, threadsPerBlock: " + str(TPB) + ", blocksPerGrid: " + str(BPG) + ", Product: " + str(TPB * BPG)) count += 1 -print("Builded " + str(count) + " processes!") \ No newline at end of file +print("Builded " + str(count) + " processes!") From 49016bb58310b896877ce398df6b7f6ec9fd814a Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Tue, 21 Mar 2023 17:31:55 +0100 Subject: [PATCH 210/509] Added back build script output because of wonky capture on GitHub --- tools/profiling/performanceProfiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py index eece9f7f37..28182c6578 100644 --- a/tools/profiling/performanceProfiler.py +++ b/tools/profiling/performanceProfiler.py @@ -64,7 +64,7 @@ time = str(datetime.datetime.now().strftime("%H:%M:%S")) - build = subprocess.run(bashArgs, check=True, stdout=subprocess.DEVNULL) + build = subprocess.run(bashArgs, check=True)#, stdout=subprocess.DEVNULL) if 
build.returncode != 0:
         print(time + " " + process + " FAILED!, threadsPerBlock: " + str(TPB) + ", blocksPerGrid: " + str(BPG) + ", Product: " + str(TPB * BPG))
     else:

From 293a5bc0632fcaa5e3fdfb962017886275b00138 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 21 Mar 2023 17:48:44 +0100
Subject: [PATCH 211/509] Added back cron job to run the job nightly

---
 .github/workflows/cuda_A100_Profiler.yml | 6 ++----
 .github/workflows/sycl_A100_Profiler.yml | 6 ++----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/cuda_A100_Profiler.yml b/.github/workflows/cuda_A100_Profiler.yml
index ad83047f17..97981a0a9a 100644
--- a/.github/workflows/cuda_A100_Profiler.yml
+++ b/.github/workflows/cuda_A100_Profiler.yml
@@ -1,10 +1,8 @@
 name: CUDA A100 Profiler
 
 on:
-  push:
-    branches: [ master ]
-  pull_request:
-    branches: [ master ]
+  schedule:
+    - cron: '00 00 * * *'
 
 jobs:
   cuda_a100_Profiling:
diff --git a/.github/workflows/sycl_A100_Profiler.yml b/.github/workflows/sycl_A100_Profiler.yml
index 2df1de4080..d0b6194acb 100644
--- a/.github/workflows/sycl_A100_Profiler.yml
+++ b/.github/workflows/sycl_A100_Profiler.yml
@@ -1,10 +1,8 @@
 name: SYCL A100 Profiler
 
 on:
-  push:
-    branches: [ master ]
-  pull_request:
-    branches: [ master ]
+  schedule:
+    - cron: '00 00 * * *'
 
 jobs:
   sycl_A100_Profiling:

From de54670bb8f308086bceecbc07eadb8f057bb8f3 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Wed, 22 Mar 2023 17:46:52 +0100
Subject: [PATCH 212/509] Added correct GCC CVMFS paths and added oneAPI
 initialization

---
 .github/workflows/cudaProfiler.yml | 2 +-
 .github/workflows/syclProfiler.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/cudaProfiler.yml b/.github/workflows/cudaProfiler.yml
index 339594b1d2..62a6629343 100644
--- a/.github/workflows/cudaProfiler.yml
+++ b/.github/workflows/cudaProfiler.yml
@@ -16,6 +16,6 @@ jobs:
     steps:
     - uses: actions/checkout@v2
     - name: Runs CUDA performanceProfiler.py script
-      run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master'
+      run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master'
     - name: Uploads CUDA JSON files to DB
       run: cd tools/profiling/; python3 sendData.py --absLayer CUDA --profiler 1 --branch master
\ No newline at end of file
diff --git a/.github/workflows/syclProfiler.yml b/.github/workflows/syclProfiler.yml
index 482003776e..1457297aa0 100644
--- a/.github/workflows/syclProfiler.yml
+++ b/.github/workflows/syclProfiler.yml
@@ -16,6 +16,6 @@ jobs:
     steps:
     - uses: actions/checkout@v2
     - name: Runs SYCL performanceProfiler.py script
-      run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'SYCL' -b 'master'
+      run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm; python3 performanceProfiler.py -l 'SYCL' -b 'master'
     - name: Uploads SYCL JSON files to DB
       run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master
\ No newline at end of file

From 5a97f9246a826def335c32fee9f1ba53c9d90931 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Wed, 29 Mar 2023 14:58:00 +0200
Subject: [PATCH 213/509] Refactored and added more comments to performanceProfiler
script --- tools/profiling/performanceProfiler.py | 83 ++++++++++++++------------ 1 file changed, 44 insertions(+), 39 deletions(-) diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py index 28182c6578..6a08f13f55 100644 --- a/tools/profiling/performanceProfiler.py +++ b/tools/profiling/performanceProfiler.py @@ -1,74 +1,79 @@ -import string import sys -import os import subprocess import datetime import argparse # Parser arguments defaults -absLayer = "SYCL" -branch = "master" +ABS_LAYER = "SYCL" +BRANCH = "master" -mgProcesses = ["ee_mumu", "gg_tt", "gg_ttg", "gg_ttgg", "gg_ttggg"] +# Physics processes +MG_PROCESSES_SA = ["ee_mumu.sa", "gg_tt.sa", "gg_ttg.sa", "gg_ttgg.sa", "gg_ttggg.sa"] -doublePrecisionConstant = 2560 -#doublePrecisionConstant = 1 -iterations = 10 -#iterations = 1 -threadsPerBlock = [256] -#threadsPerBlock = [32] -blocksPerGrid = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] -#blocksPerGrid = [32] +DOUBLE_PRECISION_CONSTANT = 2560 +ITERATIONS = 10 +THREADS_PER_BLOCK = [256] +#THREADS_PER_BLOCK = [32, 64, 128, 256] +BLOCKS_PER_GRID = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384] # Parser parser = argparse.ArgumentParser(description='A program for profiling GPUs using MadGraph.') -parser.add_argument("-l", help="Choose which abstraction layer you want to use (CUDA/SYCL).", default=absLayer) -parser.add_argument("-b", help="Choose which branch the madgraph4gpu repo is in.", default=branch) - -# Add profiler option in python and build scripts so that correct gcc toolchain can be set through makefile and still not disturb the compilation on Github machines +parser.add_argument("-l", help="Choose which abstraction layer you want to use (CUDA/SYCL).", default=ABS_LAYER) +parser.add_argument("-b", help="Choose which branch the madgraph4gpu repo is in.", default=BRANCH) pyArgs = parser.parse_args() # How many runs in total the program made count = 0 -for process in mgProcesses: - for TPB in threadsPerBlock: - for BPG in blocksPerGrid: - if TPB * BPG > doublePrecisionConstant: +for process in MG_PROCESSES_SA: + for TPB in THREADS_PER_BLOCK: + for BPG in BLOCKS_PER_GRID: + if TPB * BPG > DOUBLE_PRECISION_CONSTANT: if pyArgs.l.upper() == 'SYCL': - if ".sa" not in process: - process = process + ".sa" + # There is no .sa in br_golden_epochX4 + # so it makes sure that .sa is included in everything other than that branch + # if pyArgs.b != 'br_golden_epochX4': + #if ".sa" not in process: + # process = process + ".sa" - bashArgs = ["./buildSYCLProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG), "-r", str(pyArgs.b).lower()] + bashArgs = ["./buildSYCLProcess.sh", + "-n", process, + "-i", str(ITERATIONS), + "-t", str(TPB), + "-b", str(BPG), + "-r", str(pyArgs.b).lower()] elif pyArgs.l.upper() == 'CUDA': - # There is no .sa in br_golden_epochX4 so it makes sure that .sa is included in everything other than that branch - #if pyArgs.b != 'br_golden_epochX4': - if ".sa" not in process: - process = process + ".sa" - - bashArgs = ["./buildCUDAProcess.sh", "-n", process, "-i", str(iterations), "-t", str(TPB), "-b", str(BPG), "-r", str(pyArgs.b).lower()] - - #if len(pyArgs.b) > 0: - # bashArgs.append('--branch') - # bashArgs.append(str(pyArgs.b).lower()) + bashArgs = ["./buildCUDAProcess.sh", + "-n", process, + "-i", str(ITERATIONS), + "-t", str(TPB), + "-b", str(BPG), + "-r", str(pyArgs.b).lower()] else: sys.exit("No abstraction layer matching the supplied string!") - 
print(str(datetime.datetime.now().strftime("%H:%M:%S")) + " Started " + process + " with TPB("+ str(TPB) +") * BPG("+ str(BPG) +"): " + str(TPB * BPG) + "!") - time = str(datetime.datetime.now().strftime("%H:%M:%S")) + print(time + " Started " + process + " with TPB("+ str(TPB) +") * BPG("+ str(BPG) +"): " + str(TPB * BPG) + "!") + build = subprocess.run(bashArgs, check=True)#, stdout=subprocess.DEVNULL) if build.returncode != 0: - print(time + " " + process + " FAILED!, threadsPerBlock: " + str(TPB) + ", blocksPerGrid: " + str(BPG) + ", Product: " + str(TPB * BPG)) + print(time + " " + process + + " FAILED!, threadsPerBlock: " + str(TPB) + + ", blocksPerGrid: " + str(BPG) + + ", Product: " + str(TPB * BPG)) else: - print(time + " " + process + " COMPLETED!, threadsPerBlock: " + str(TPB) + ", blocksPerGrid: " + str(BPG) + ", Product: " + str(TPB * BPG)) + print(time + " " + process + + " COMPLETED!, threadsPerBlock: " + str(TPB) + + ", blocksPerGrid: " + str(BPG) + + ", Product: " + str(TPB * BPG)) + count += 1 -print("Builded " + str(count) + " processes!") +print("Builded " + str(count) + " processes!") \ No newline at end of file From 635a4e792b6477756c7c8e8210e9369fc51b026a Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Wed, 29 Mar 2023 18:21:40 +0200 Subject: [PATCH 214/509] Fixed SSL error in container --- tools/profiling/sendData.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/profiling/sendData.py b/tools/profiling/sendData.py index 8377c2a068..6b247ff4fa 100644 --- a/tools/profiling/sendData.py +++ b/tools/profiling/sendData.py @@ -118,10 +118,10 @@ for field in fields: value = float(re.findall(r'[\d.]+',data[0][field])[0]) - + DBdata = DBdata + ',' + args.absLayer + "_" + field.replace(" ", "_") + '=' + str(value) - requestInfo = ["curl", "-i", '-XPOST', "-i", URL, "--header", "Authorization: Token "+Auth[0]+":"+Auth[1], "--data-raw", DBdata] + requestInfo = ["curl", "-i", "-k", '-XPOST', "-i", URL, "--header", "Authorization: Token "+Auth[0]+":"+Auth[1], "--data-raw", DBdata] request = subprocess.run(requestInfo, stdout=subprocess.DEVNULL) From fbf099c6c8adfa332a70de3f587acd8e971298ae Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Thu, 30 Mar 2023 15:29:15 +0200 Subject: [PATCH 215/509] Updated GCC path and CUDA path --- .github/workflows/c-cpp.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 2f78a97da6..76fadf549f 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -66,8 +66,8 @@ jobs: - name: path run: echo "PATH=$PATH" - name: make info - run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info + run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; export CUDA_HOME=/usr/local/cuda-12.0/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info - name: make - run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; export CUDA_HOME=/usr/local/cuda-11.6/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} + run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; export CUDA_HOME=/usr/local/cuda-12.0/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} - name: make check - run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh; export CUDA_HOME=/usr/local/cuda-11.6/; make 
FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check + run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; export CUDA_HOME=/usr/local/cuda-12.0/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check From 6205b1b6d3e5438ed76c5d646e77e4fd115640a6 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Thu, 30 Mar 2023 17:15:00 +0200 Subject: [PATCH 216/509] Added back badges in repo README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 966e2d7bc9..1bd658ad1c 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # Madgraph 4 GPU +[![C/C++ CI](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/c-cpp.yml/badge.svg)](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/c-cpp.yml) [![SYCL CI](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/sycl.yml/badge.svg)](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/sycl.yml) [![CUDA Profiler](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/cudaProfiler.yml/badge.svg)](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/cudaProfiler.yml) [![SYCL Profiler](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/syclProfiler.yml/badge.svg)](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/syclProfiler.yml) + This repository contains code developed in the context of porting the [MadGraph5_aMC@NLO](https://cp3.irmp.ucl.ac.be/projects/madgraph/) event generator software onto GPU platforms and vector instructions on CPUs. MadGraph5_aMC@NLO is able to generate code for various physics processes in different programming languages (Fortran, C, C++). The code generated in this repository in "epochX" of the MadGraph5_aMC@NLO generator allows to also produce source code for those physics processes to run on GPU and CPU platforms. 
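The recipe the c-cpp workflow runs is plain shell, so it can be replayed outside the CI. A sketch, assuming /cvmfs is mounted and, for the GPU job, a CUDA-capable card is present; the process directory is just one entry from the workflow's matrix:

    source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh
    dir=epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx
    make FPTYPE=d -C $dir info    # report the build configuration
    make FPTYPE=d -C $dir         # build
    make FPTYPE=d -C $dir check   # run the standalone checks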
From 3295e99211ddaed7f4dfa734b8020568d2075bac Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Thu, 30 Mar 2023 17:22:04 +0200 Subject: [PATCH 217/509] Merged all profiler jobs into one workflow --- .github/workflows/cudaProfiler.yml | 21 ------- .github/workflows/cuda_A100_Profiler.yml | 21 ------- .github/workflows/profiler.yml | 74 ++++++++++++++++++++++++ .github/workflows/syclProfiler.yml | 21 ------- .github/workflows/sycl_A100_Profiler.yml | 21 ------- 5 files changed, 74 insertions(+), 84 deletions(-) delete mode 100644 .github/workflows/cudaProfiler.yml delete mode 100644 .github/workflows/cuda_A100_Profiler.yml create mode 100644 .github/workflows/profiler.yml delete mode 100644 .github/workflows/syclProfiler.yml delete mode 100644 .github/workflows/sycl_A100_Profiler.yml diff --git a/.github/workflows/cudaProfiler.yml b/.github/workflows/cudaProfiler.yml deleted file mode 100644 index 62a6629343..0000000000 --- a/.github/workflows/cudaProfiler.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: CUDA V100s Profiler - -on: - schedule: - - cron: '00 00 * * *' - -jobs: - cuda_v100s_Profiling: - name: CUDA V100S Profiling - env: - CUDA_NAME_PREFIX: cudacpp_Xeon-Silver-4216_v100s_gcc-11.3_cuda-12.0.1 - ENABLE_CI_PROFILER: 1 - - MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} - runs-on: [self-hosted, linux, v100s] - steps: - - uses: actions/checkout@v2 - - name: Runs CUDA performanceProfiler.py script - run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master' - - name: Uploads CUDA JSON files to DB - run: cd tools/profiling/; python3 sendData.py --absLayer CUDA --profiler 1 --branch master \ No newline at end of file diff --git a/.github/workflows/cuda_A100_Profiler.yml b/.github/workflows/cuda_A100_Profiler.yml deleted file mode 100644 index 97981a0a9a..0000000000 --- a/.github/workflows/cuda_A100_Profiler.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: CUDA A100 Profiler - -on: - schedule: - - cron: '00 00 * * *' - -jobs: - cuda_a100_Profiling: - name: CUDA A100 Profiling - env: - CUDA_NAME_PREFIX: cudacpp_AMD-Epyc-7313_a100_gcc-11.3_cuda-12.0.1 - ENABLE_CI_PROFILER: 1 - - MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} - runs-on: [self-hosted, linux, a100] - steps: - - uses: actions/checkout@v2 - - name: Runs CUDA performanceProfiler.py script - run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; python3 performanceProfiler.py -l 'CUDA' -b 'master' - - name: Uploads CUDA JSON files to DB - run: cd tools/profiling/; python3 sendData.py --absLayer CUDA --profiler 1 --branch master \ No newline at end of file diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml new file mode 100644 index 0000000000..01ddd5563d --- /dev/null +++ b/.github/workflows/profiler.yml @@ -0,0 +1,74 @@ +name: Performance Profiler + +on: + schedule: + - cron: '00 00 * * *' + +jobs: + sycl_A100_Profiling: + name: SYCL A100 Profiling + env: + SYCL_NAME_PREFIX: sycl_AMD-Epyc-7313_a100_gcc-11.3_cuda-12.0.1 + ENABLE_CI_PROFILER: 1 + + MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} + runs-on: [self-hosted, linux, a100] + steps: + - uses: actions/checkout@v2 + - name: Runs SYCL performanceProfiler.py script + run: cd tools/profiling/; + source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; + source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm; + 
python3 performanceProfiler.py -l 'SYCL' -b 'master' + - name: Uploads SYCL JSON files to DB + run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master + + cuda_a100_Profiling: + name: CUDA A100 Profiling + env: + CUDA_NAME_PREFIX: cudacpp_AMD-Epyc-7313_a100_gcc-11.3_cuda-12.0.1 + ENABLE_CI_PROFILER: 1 + + MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} + runs-on: [self-hosted, linux, a100] + steps: + - uses: actions/checkout@v2 + - name: Runs CUDA performanceProfiler.py script + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; + python3 performanceProfiler.py -l 'CUDA' -b 'master' + - name: Uploads CUDA JSON files to DB + run: cd tools/profiling/; python3 sendData.py --absLayer CUDA --profiler 1 --branch master + + sycl_v100s_Profiling: + name: SYCL V100S Profiling + env: + SYCL_NAME_PREFIX: sycl_Xeon-Silver-4216_v100s_gcc-11.3_cuda-12.0.1 + ENABLE_CI_PROFILER: 1 + + MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} + runs-on: [self-hosted, linux, v100s] + steps: + - uses: actions/checkout@v2 + - name: Runs SYCL performanceProfiler.py script + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; + source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm; + python3 performanceProfiler.py -l 'SYCL' -b 'master' + - name: Uploads SYCL JSON files to DB + run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master + +jobs: + cuda_v100s_Profiling: + name: CUDA V100S Profiling + env: + CUDA_NAME_PREFIX: cudacpp_Xeon-Silver-4216_v100s_gcc-11.3_cuda-12.0.1 + ENABLE_CI_PROFILER: 1 + + MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} + runs-on: [self-hosted, linux, v100s] + steps: + - uses: actions/checkout@v2 + - name: Runs CUDA performanceProfiler.py script + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; + python3 performanceProfiler.py -l 'CUDA' -b 'master' + - name: Uploads CUDA JSON files to DB + run: cd tools/profiling/; python3 sendData.py --absLayer CUDA --profiler 1 --branch master \ No newline at end of file diff --git a/.github/workflows/syclProfiler.yml b/.github/workflows/syclProfiler.yml deleted file mode 100644 index 1457297aa0..0000000000 --- a/.github/workflows/syclProfiler.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: SYCL V100S Profiler - -on: - schedule: - - cron: '00 00 * * *' - -jobs: - sycl_v100s_Profiling: - name: SYCL V100S Profiling - env: - SYCL_NAME_PREFIX: sycl_Xeon-Silver-4216_v100s_gcc-11.3_cuda-12.0.1 - ENABLE_CI_PROFILER: 1 - - MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} - runs-on: [self-hosted, linux, v100s] - steps: - - uses: actions/checkout@v2 - - name: Runs SYCL performanceProfiler.py script - run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm; python3 performanceProfiler.py -l 'SYCL' -b 'master' - - name: Uploads SYCL JSON files to DB - run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master \ No newline at end of file diff --git a/.github/workflows/sycl_A100_Profiler.yml b/.github/workflows/sycl_A100_Profiler.yml deleted file mode 100644 index d0b6194acb..0000000000 --- a/.github/workflows/sycl_A100_Profiler.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: SYCL 
A100 Profiler - -on: - schedule: - - cron: '00 00 * * *' - -jobs: - sycl_A100_Profiling: - name: SYCL A100 Profiling - env: - SYCL_NAME_PREFIX: sycl_AMD-Epyc-7313_a100_gcc-11.3_cuda-12.0.1 - ENABLE_CI_PROFILER: 1 - - MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} - runs-on: [self-hosted, linux, a100] - steps: - - uses: actions/checkout@v2 - - name: Runs SYCL performanceProfiler.py script - run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm; python3 performanceProfiler.py -l 'SYCL' -b 'master' - - name: Uploads SYCL JSON files to DB - run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master \ No newline at end of file From 9039cc6fbf13a85a38c55d31aefea266bc04239c Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Thu, 30 Mar 2023 17:22:25 +0200 Subject: [PATCH 218/509] Removed redundant CUDA_HOME in c-cpp workflow --- .github/workflows/c-cpp.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 76fadf549f..ca16385183 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -66,8 +66,8 @@ jobs: - name: path run: echo "PATH=$PATH" - name: make info - run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; export CUDA_HOME=/usr/local/cuda-12.0/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info + run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info - name: make - run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; export CUDA_HOME=/usr/local/cuda-12.0/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} + run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} - name: make check - run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; export CUDA_HOME=/usr/local/cuda-12.0/; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check + run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check From 546df7c9428981a5d433207a40d7cb58271a120c Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Thu, 30 Mar 2023 17:38:11 +0200 Subject: [PATCH 219/509] Updated oneAPI/SYCL CI paths and switched to oneAPI toolkit --- .github/workflows/sycl.yml | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml index 7b086b78bc..b754072274 100644 --- a/.github/workflows/sycl.yml +++ b/.github/workflows/sycl.yml @@ -8,34 +8,38 @@ on: jobs: GPU: - runs-on: [self-hosted, linux, v100s] + runs-on: [self-hosted, linux, a100] env: FC: gfortran REQUIRE_CUDA: 1 - SYCLFLAGS: -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=sm_70' -fgpu-rdc --cuda-path=/usr/local/cuda-11.6/ + SYCLFLAGS: export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 -Xclang -fdenormal-fp-math=ieee" ENABLE_CI_PROFILER: 1 strategy: matrix: - folder: [ epochX/sycl/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/sycl/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx , 
epochX/sycl/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/sycl/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/sycl/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
+        folder: [ epochX/sycl/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum,
+                  epochX/sycl/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx,
+                  epochX/sycl/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg,
+                  epochX/sycl/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg,
+                  epochX/sycl/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
       precision: [ d , f ]
       fail-fast: false
 
     steps:
     - uses: actions/checkout@v2
     - name: make info
       run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
-           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
-           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
+           source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm;
+           CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++;
            LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib:$LD_LIBRARY_PATH;
            make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
     - name: make
       run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
-           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
-           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
+           source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm;
+           CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++;
            LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib:$LD_LIBRARY_PATH;
            make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
     - name: make check
-      run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
-           CXX=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/bin/clang++;
-           LD_LIBRARY_PATH=/afs/cern.ch/work/j/jteig/sycl_workspace/llvm/build/lib:$LD_LIBRARY_PATH;
+      run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
+           source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm;
+           CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++;
            LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib:$LD_LIBRARY_PATH;
            make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check
\ No newline at end of file

From 47b34cfe639c2c3ab4d29bd1f3cfaf61f3f5cf30 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 30 Mar 2023 17:39:41 +0200
Subject: [PATCH 220/509] Removed redundant jobs declaration

---
 .github/workflows/profiler.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index 01ddd5563d..7374bd6c34 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -56,7 +56,6 @@ jobs:
     - name: Uploads SYCL JSON files to DB
       run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master
 
-jobs:
   cuda_v100s_Profiling:
     name: CUDA V100S Profiling
     env:
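The duplicated jobs: key removed above was left over from concatenating the four old workflow files into one; a YAML mapping may define each key only once, so depending on the parser the second block is either rejected outright or silently replaces the first three jobs. A blunt pre-push check, as a sketch run from the repository root:

    grep -c '^jobs:' .github/workflows/profiler.yml    # must print 1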
From 57b92c046f8e8bf5d397ffdaa1d3e5f0d0a7afc5 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 30 Mar 2023 17:43:55 +0200
Subject: [PATCH 221/509] Updated c-cpp CI formatting and removed test steps in
 jobs

---
 .github/workflows/c-cpp.yml | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml
index ca16385183..0e33be034c 100644
--- a/.github/workflows/c-cpp.yml
+++ b/.github/workflows/c-cpp.yml
@@ -11,7 +11,11 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
+        folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum,
+                  epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx,
+                  epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg,
+                  epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg,
+                  epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
       fail-fast: false
 
     steps:
@@ -22,7 +26,11 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
      matrix:
-        folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
+        folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum,
+                  epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx,
+                  epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg,
+                  epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg,
+                  epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
       precision: [ d , f , m ]
       fail-fast: false
     steps:
@@ -39,7 +47,11 @@ jobs:
       FC: gfortran-11
     strategy:
       matrix:
-        folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
+        folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum,
+                  epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx,
+                  epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg,
+                  epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg,
+                  epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
       precision: [ d , f , m ]
       fail-fast: false
     steps:
@@ -51,20 +63,22 @@ jobs:
     - name: make check
       run: make AVX=none OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check
   GPU:
-    runs-on: [self-hosted, linux, v100s]
+    runs-on: [self-hosted, linux, a100]
     env:
       CUDA_HOME: /usr/local/cuda/
       FC: gfortran
       REQUIRE_CUDA: 1
     strategy:
       matrix:
-        folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx , epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg , epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg , epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
+        folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum,
+                  epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx,
+                  epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg,
+                  epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg,
+                  epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg ]
       precision: [ d , f , m ]
       fail-fast: false
     steps:
     - uses: actions/checkout@v2
-    - name: path
-      run: echo "PATH=$PATH"
     - name: make info
       run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
     - name: make
From 1f611c809171ea4ced7db2e61c405b066e6a08cd Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 30 Mar 2023 17:45:41 +0200
Subject: [PATCH 222/509] Updated c-cpp CI formatting again

---
 .github/workflows/c-cpp.yml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml
index 0e33be034c..5170e057d6 100644
--- a/.github/workflows/c-cpp.yml
+++ b/.github/workflows/c-cpp.yml
@@ -80,8 +80,11 @@ jobs:
     steps:
     - uses: actions/checkout@v2
     - name: make info
-      run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
+      run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh;
+           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
     - name: make
-      run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
+      run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh;
+           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
     - name: make check
-      run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check
+      run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh;
+           make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check

From 39d2382d7b5e39df4f0ca8867dc8545eb9dccd5c Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Thu, 30 Mar 2023 17:46:30 +0200
Subject: [PATCH 223/509] Updated GCC CVMFS path

---
 .github/workflows/sycl.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml
index b754072274..cca40af09b 100644
--- a/.github/workflows/sycl.yml
+++ b/.github/workflows/sycl.yml
@@ -26,19 +26,19 @@ jobs:
     steps:
     - uses: actions/checkout@v2
     - name: make info
-      run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
+      run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh;
            source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm;
            CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++;
            LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib:$LD_LIBRARY_PATH;
            make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info
     - name: make
-      run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
+      run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh;
            source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm;
            CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++;
            LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib:$LD_LIBRARY_PATH;
            make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
     - name: make check
-      run: source /cvmfs/sft.cern.ch/lcg/contrib/gcc/11.3.0/x86_64-centos8/setup.sh;
+      run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh;
            source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm;
            CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++;
            LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib:$LD_LIBRARY_PATH;
From 00c3bc6199176d7c8a1e79b3aad2d943d8708db1 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 31 Mar 2023 08:13:42 +0200
Subject: [PATCH 224/509] Removed unnecessary export SYCLFLAGS in SYCL workflow

---
 .github/workflows/sycl.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml
index cca40af09b..43a2ca0f02 100644
--- a/.github/workflows/sycl.yml
+++ b/.github/workflows/sycl.yml
@@ -12,7 +12,7 @@ jobs:
     env:
       FC: gfortran
       REQUIRE_CUDA: 1
-      SYCLFLAGS: export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 -Xclang -fdenormal-fp-math=ieee"
+      SYCLFLAGS: -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 -Xclang -fdenormal-fp-math=ieee
       ENABLE_CI_PROFILER: 1

From 59bf3bb953a596578899817be9b23854912002cc Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 31 Mar 2023 08:51:49 +0200
Subject: [PATCH 225/509] Added device_id to SYCL workflow execution

---
 .github/workflows/sycl.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml
index 43a2ca0f02..49edcb9238 100644
--- a/.github/workflows/sycl.yml
+++ b/.github/workflows/sycl.yml
@@ -14,6 +14,7 @@ jobs:
       REQUIRE_CUDA: 1
       SYCLFLAGS: -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 -Xclang -fdenormal-fp-math=ieee
       ENABLE_CI_PROFILER: 1
+      device_id: 2
     strategy:
       matrix:
         folder: [ epochX/sycl/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum,

From 94309f0534adc4e5c15c7519d7caac3690eea4b5 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Fri, 31 Mar 2023 09:03:24 +0200
Subject: [PATCH 226/509] Added the correct device ID when the SYCL CI runs on
 our runners

---
 epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk  | 9 ++++++---
 epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk    | 9 ++++++---
 epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk   | 9 ++++++---
 epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk  | 9 ++++++---
 epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk | 9 ++++++---
 5 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk b/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk
index 395c679825..c7fcd4071b 100644
--- a/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk
+++ b/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk
@@ -98,6 +98,9 @@ export NTPBMAX
 $(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER))
 ifeq ($(ENABLE_CI_PROFILER),1)
   CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8"
+
+  # Sets the device ID to the GPU (oneAPI Toolkit) when running check/cmpFcheck in the GitHub CI
+  ENABLE_DEVICE_ID = "--device_id=2"
 endif
 
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
@@ -343,8 +346,8 @@ check: cmpFcheck
 
 # Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events)
 cmpFcheck: all.$(TAG)
 	@echo
-	@echo "$(BUILDDIR)/check.exe -p 2 32 2"
-	@echo "$(BUILDDIR)/fcheck.exe 2 32 2"
-	@me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR!
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @echo "$(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID}" + @echo "$(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID}" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID} | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID} | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk index 395c679825..c7fcd4071b 100644 --- a/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk @@ -98,6 +98,9 @@ export NTPBMAX $(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) ifeq ($(ENABLE_CI_PROFILER),1) CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" + + # Sets the device ID to the GPU (oneAPI Toolkit) when running check/cmpFcheck in the GitHub CI + ENABLE_DEVICE_ID = "--device_id=2" endif # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") @@ -343,8 +346,8 @@ check: cmpFcheck # Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @echo "$(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID}" + @echo "$(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID}" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID} | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID} | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk index 395c679825..c7fcd4071b 100644 --- a/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk @@ -98,6 +98,9 @@ export NTPBMAX $(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) ifeq ($(ENABLE_CI_PROFILER),1) CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" + + # Sets the device ID to the GPU (oneAPI Toolkit) when running check/cmpFcheck in the GitHub CI + ENABLE_DEVICE_ID = "--device_id=2" endif # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") @@ -343,8 +346,8 @@ check: cmpFcheck # Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @echo "$(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID}" + @echo "$(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID}" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID} | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID} | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk index 395c679825..ba52a36851 100644 --- a/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk @@ -98,6 +98,9 @@ export NTPBMAX $(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) ifeq ($(ENABLE_CI_PROFILER),1) CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" + + # Sets the device ID to the GPU (oneAPI Toolkit) when running check/cmpFcheck in the GitHub CI + ENABLE_DEVICE_ID = "--device_id=2" endif # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") @@ -343,8 +346,8 @@ check: cmpFcheck # Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @echo "$(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID}" + @echo "$(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID}" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe ${ENABLE_DEVICE_ID} -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID} | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk index 7b13f8a0f8..0bf90927a2 100644 --- a/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk @@ -97,6 +97,9 @@ export NTPBMAX $(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) ifeq ($(ENABLE_CI_PROFILER),1) CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" + + # Sets the device ID to the GPU (oneAPI Toolkit) when running check/cmpFcheck in the GitHub CI + ENABLE_DEVICE_ID = "--device_id=2" endif # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") @@ -342,8 +345,8 @@ check: cmpFcheck # Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @echo "$(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID}" + @echo "$(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID}" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID} | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID} | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi #------------------------------------------------------------------------------- From bde2ee3aa1ecd070c5f654adb253d9636c053fda Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Fri, 31 Mar 2023 09:04:05 +0200 Subject: [PATCH 227/509] Removed device_id in workflow --- .github/workflows/sycl.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml index 49edcb9238..43a2ca0f02 100644 --- a/.github/workflows/sycl.yml +++ b/.github/workflows/sycl.yml @@ -14,7 +14,6 @@ jobs: REQUIRE_CUDA: 1 SYCLFLAGS: -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 -Xclang -fdenormal-fp-math=ieee ENABLE_CI_PROFILER: 1 - device_id: 2 strategy: matrix: folder: [ epochX/sycl/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum, From bfefe2815fbbb54c1219156b011756671e4095c7 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Fri, 31 Mar 2023 11:10:35 +0200 Subject: [PATCH 228/509] Removed device_id from fortran in check --- epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk | 4 ++-- epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk | 4 ++-- epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk | 4 ++-- epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk | 4 ++-- epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk b/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk index c7fcd4071b..7e6cbaba0e 100644 --- a/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk @@ -347,7 +347,7 @@ check: cmpFcheck cmpFcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID}" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID}" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID} | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID} | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @echo "$(BUILDDIR)/fcheck.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID} | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk index c7fcd4071b..7e6cbaba0e 100644 --- a/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk @@ -347,7 +347,7 @@ check: cmpFcheck cmpFcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID}" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID}" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID} | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID} | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @echo "$(BUILDDIR)/fcheck.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID} | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk index c7fcd4071b..7e6cbaba0e 100644 --- a/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk @@ -347,7 +347,7 @@ check: cmpFcheck cmpFcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID}" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID}" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID} | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID} | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @echo "$(BUILDDIR)/fcheck.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID} | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk index ba52a36851..ce9d8c69d9 100644 --- a/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk @@ -347,7 +347,7 @@ check: cmpFcheck cmpFcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID}" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID}" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe ${ENABLE_DEVICE_ID} -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID} | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @echo "$(BUILDDIR)/fcheck.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe ${ENABLE_DEVICE_ID} -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk index 0bf90927a2..edf3fbb1a1 100644 --- a/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk @@ -346,7 +346,7 @@ check: cmpFcheck cmpFcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID}" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID}" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID} | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 ${ENABLE_DEVICE_ID} | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @echo "$(BUILDDIR)/fcheck.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID} | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi #------------------------------------------------------------------------------- From 013fc38c5f219440511cf0de93fe7f2e0ed9a4da Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Fri, 31 Mar 2023 16:59:35 +0200 Subject: [PATCH 229/509] Reformated buildSYCLProcess.sh script --- tools/profiling/buildSYCLProcess.sh | 82 ++++++++++++++++++----------- 1 file changed, 51 insertions(+), 31 deletions(-) diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 333e339637..c0be7d705a 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -1,28 +1,19 @@ #!/bin/bash -# Assign correct SM level for NVIDIA GPUs - -# Check if nvidia-smi command exists -if command -v nvidia-smi > /dev/null 2>&1; then - - # Get the name of the GPU - GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader) - - # GPU (DEVICE_ID=2 for oneAPI toolkit runs on GPUs, else DEVICE_ID=0) - export DEVICE_ID=2 - # CPU - #export DEVICE_ID=1 -else - echo "nvidia-smi non existent on system, Nvidia GPU possibly not present!" 
- exit -fi - -case $GPU_NAME in - *V100S* ) export SM_LEVEL="sm_70" ;; - *A100* ) export SM_LEVEL="sm_80" ;; -esac - -################################################################## +# +# __ __ _ ____ _ _ _ ____ ____ _ _ +# | \/ | __ _ __| | / ___| _ __ __ _ _ __ | |__ | || | / ___| | _ \ | | | | +# | |\/| | / _` | / _` | | | _ | '__| / _` | | '_ \ | '_ \ | || |_ | | _ | |_) | | | | | +# | | | | | (_| | | (_| | | |_| | | | | (_| | | |_) | | | | | |__ _| | |_| | | __/ | |_| | +# |_| |_| \__,_| \__,_| \____| |_| \__,_| | .__/ |_| |_| |_| \____| |_| \___/ +# |_| +# +# +# Bash script for compiling and executing physics processes using the MadGraph5_aMC@NLO GPU development framework +# using oneAPI/SYCL +# +# Author: Jorgen Teig, CERN 2023 +# helpFunction() { @@ -57,6 +48,32 @@ then helpFunction fi +################################################################## + +# Assign correct SM level for NVIDIA GPUs + +# Check if nvidia-smi command exists +if command -v nvidia-smi > /dev/null 2>&1; then + + # Get the name of the GPU + GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader) + + # GPU (DEVICE_ID=2 for oneAPI toolkit runs on GPUs, else DEVICE_ID=0 with LLVM compiler) + export DEVICE_ID=2 + # CPU + #export DEVICE_ID=1 +else + echo "nvidia-smi non existent on system, Nvidia GPU possibly not present!" + exit +fi + +case $GPU_NAME in + *V100S* ) export SM_LEVEL="sm_70" ;; + *A100* ) export SM_LEVEL="sm_80" ;; +esac + +################################################################## + # Begin script in case all parameters and GPU specific settings are set ################################################################## @@ -71,17 +88,20 @@ export NTPBMAX=1024 export CUDA_PATH=/usr/local/cuda-12.0/ export WORKSPACE=$prefix/workspace_mg4gpu -# Compilation using OneAPI Toolkit through CVMFS -export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++ -#export SYCLFLAGS="-fsycl -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=$SM_LEVEL" +# Old SYCLFLAGS +# export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=$SM_LEVEL' -fgpu-rdc --cuda-path=$CUDA_PATH" -# Gets no erros with this: export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=$SM_LEVEL -Xclang -fdenormal-fp-math=ieee" +# Compilation using OneAPI Toolkit through CVMFS +export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++ + # Compilation for OneAPI LLVM compiler #export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace #export CXX=$DPCPP_HOME/llvm/build/bin/clang++ -#export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=$SM_LEVEL' -fgpu-rdc --cuda-path=$CUDA_PATH" + +# Sets CUDA in PATH +export PATH=$CUDA_HOME:$PATH # Branch should be enviroment variable in main script and then passed down if none then it is not displayed in prefix REPORT_FOLDER="${WORKSPACE}/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX}_${branch}" @@ -125,14 +145,14 @@ export MG5AMC_CARD_PATH=$MG_PROC_DIR/Cards # Build executable cd $MG_SP_DIR make -j -mv ../../lib/build.d_inl0*/ $MG_LIBS_DIR #2>/dev/null; true -mv build.d_inl0*/ $MG_EXE_DIR #2>/dev/null; true +mv -f ../../lib/build.*/ $MG_LIBS_DIR #2>/dev/null; true +mv -f build.*/ $MG_EXE_DIR # Run executable cd $WORKSPACE if [ $DEVICE_ID == "info" ]; then - # Display the devices + # Add MG Libs to linker library path and display the devices 
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10 else From b11a1582f3753d7721f06ffa3cc59f6d57a228bc Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Fri, 31 Mar 2023 17:00:12 +0200 Subject: [PATCH 230/509] Reformated buildCUDAProcess.sh script --- tools/profiling/buildCUDAProcess.sh | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh index 28dd44d5e4..80edca074b 100755 --- a/tools/profiling/buildCUDAProcess.sh +++ b/tools/profiling/buildCUDAProcess.sh @@ -1,5 +1,20 @@ #!/bin/bash +# +# __ __ _ ____ _ _ _ ____ ____ _ _ +# | \/ | __ _ __| | / ___| _ __ __ _ _ __ | |__ | || | / ___| | _ \ | | | | +# | |\/| | / _` | / _` | | | _ | '__| / _` | | '_ \ | '_ \ | || |_ | | _ | |_) | | | | | +# | | | | | (_| | | (_| | | |_| | | | | (_| | | |_) | | | | | |__ _| | |_| | | __/ | |_| | +# |_| |_| \__,_| \__,_| \____| |_| \__,_| | .__/ |_| |_| |_| \____| |_| \___/ +# |_| +# +# +# Bash script for compiling and executing physics processes using the MadGraph5_aMC@NLO GPU development framework +# using CUDA +# +# Author: Jorgen Teig, CERN 2023 +# + helpFunction() { echo "" @@ -47,10 +62,9 @@ export NTPBMAX=1024 export CXX=/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/bin/g++ export MG_EXE="./gcheck.exe" #GPU #export MG_EXE="./check.exe" #CPU -export CUDA_HOME=/usr/local/cuda/ +export CUDA_HOME=/usr/local/cuda-12.0/ export FC=`which gfortran` export WORKSPACE=$prefix/workspace_mg4gpu -#export NAME_PREFIX="cudacpp_v100s_cuda_11.6.2_gcc_11.3" REPORT_FOLDER="${WORKSPACE}/$(date +"%y-%m-%d")_${CUDA_NAME_PREFIX}_${branch}" From 2356d67363831b85b92e1d4a2f3ad438d10d5ac5 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Fri, 31 Mar 2023 17:00:32 +0200 Subject: [PATCH 231/509] Reformated performanceProfiler.py script --- tools/profiling/performanceProfiler.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py index 6a08f13f55..2482dda69b 100644 --- a/tools/profiling/performanceProfiler.py +++ b/tools/profiling/performanceProfiler.py @@ -1,3 +1,20 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# +# __ __ _ ____ _ _ _ ____ ____ _ _ +# | \/ | __ _ __| | / ___| _ __ __ _ _ __ | |__ | || | / ___| | _ \ | | | | +# | |\/| | / _` | / _` | | | _ | '__| / _` | | '_ \ | '_ \ | || |_ | | _ | |_) | | | | | +# | | | | | (_| | | (_| | | |_| | | | | (_| | | |_) | | | | | |__ _| | |_| | | __/ | |_| | +# |_| |_| \__,_| \__,_| \____| |_| \__,_| | .__/ |_| |_| |_| \____| |_| \___/ +# |_| +# +# +# Python script for performance profiling using the MadGraph5_aMC@NLO GPU development framework +# +# Author: Jorgen Teig, CERN 2023 +# + import sys import subprocess import datetime From 1b7100afc7af0c90aad18dcbe9d4b2a35daf9f8f Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Fri, 31 Mar 2023 17:00:46 +0200 Subject: [PATCH 232/509] Added README --- tools/profiling/README.md | 163 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 tools/profiling/README.md diff --git a/tools/profiling/README.md b/tools/profiling/README.md new file mode 100644 index 0000000000..1a5251d93b --- /dev/null +++ b/tools/profiling/README.md @@ -0,0 +1,163 @@ +# Documentation + +We are currently using [GitHub Actions](https://docs.github.com/en/actions) in conjunction with onsite self-hosted [GitHub 
Runners](https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners) to automate compiling/testing and performance profiling tasks in SYCL and CUDA on A100 and V100s GPUs.
+
+## Grafana link: [madgraph4gpu-db.web.cern.ch](https://madgraph4gpu-db.web.cern.ch/)
+
+## Performance Profiling
+
+### Profiling baseline currently used
+
+**GCC - 11.3.0**
+
+**CUDA - 12.0.1**
+
+**Clang - 16**
+
+### GitHub Actions Runner
+
+A [GitHub Runner](https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners) is a tool that allows users to automate their workflow by running [actions](https://docs.github.com/en/actions) or tasks in response to specific events on GitHub. This can include tasks such as running tests, building and deploying code, or publishing artifacts. Runners can be easily configured and managed through the GitHub website, and help users streamline their development process and ensure that their code is always up-to-date and ready for deployment. In our case, we use them to automate CI and nightly performance profiling.
+
+### performanceProfiler.py
+
+This is the main entrypoint for the profiler. It executes the two bash build scripts, `buildSYCLProcess.sh` for SYCL and `buildCUDAProcess.sh` for CUDA, with the correct ThreadsPerBlock, BlocksPerGrid and iteration count.
+
+#### Usage:
+
+Go to the `tools/profiling` directory and run:
+
+```
+python3 performanceProfiler.py -l <abstraction_layer> -b <branch>
+```
+
+The following options are available for this script:
+
+`-l`: This option specifies the abstraction layer to use for profiling. The supported values are "SYCL" and "CUDA". The default value is "SYCL".
+
+`-b`: This option specifies the branch of the madgraph4gpu repository that will be used. The default value is "master".
+
+Example:
+
+To run the script with the default arguments:
+
+```
+python3 performanceProfiler.py
+```
+
+To run the script with a different abstraction layer and branch, you can use the following command:
+
+```
+python3 performanceProfiler.py -l CUDA -b my_branch
+```
+
+### buildSYCLProcess.sh
+
+This bash script compiles and executes standalone physics processes using the MadGraph5_aMC@NLO GPU development framework with oneAPI/SYCL.
+
+#### Usage
+
+Go to the `tools/profiling` directory and run:
+
+```
+./buildSYCLProcess.sh -n <process_name> -b <blocks_per_grid> -t <threads_per_block> -i <iterations> [-r <branch>] [-d <device_id>]
+```
+
+#### Arguments:
+
+* `-n`: Name of the physics process being built and run (e.g., gg_ttgg).
+
+* `-b`: Number of blocks per grid.
+
+* `-t`: Number of threads per block.
+
+* `-i`: Number of iterations.
+
+* `-r`: (Optional) Branch name. Default: not displayed in the report folder prefix.
+
+* `-d`: (Optional) Flag for setting the device ID. Default: "--device_id 2" for oneAPI toolkit runs on GPUs, otherwise "--device_id 0" for the LLVM DPCPP compiler. You can also use `-d info` to get the specific device IDs for that host.
+
+#### Example:
+
+```
+./buildSYCLProcess.sh -n gg_ttgg -b 1024 -t 128 -i 10 -r master -d 2
+```
+
+**Note**:
+
+To also compile for CPUs you need to enable more backends in the DPCPP toolchain (following the LLVM DPCPP compiler instructions for CUDA alone does not install the dependencies needed to see the other devices on the host). You can read more on how to enable more backends [here](https://intel.github.io/llvm-docs/GetStartedGuide.html#build-dpc-toolchain).
+
+### buildCUDAProcess.sh
+
+This script compiles and executes physics processes using the MadGraph5_aMC@NLO GPU development framework with CUDA.
+
+#### Usage
+
+Go to the `tools/profiling` directory and run:
+
+```
+./buildCUDAProcess.sh -n <process_name> -b <blocks_per_grid> -t <threads_per_block> -i <iterations> -r <branch> -m <makefile_args>
+```
+
+#### Arguments:
+
+* `-n`: Name of the physics process being built and run.
+
+* `-b`: Number of blocks per grid.
+
+* `-t`: Number of threads per block.
+
+* `-i`: Number of iterations.
+
+* `-r`: Branch name.
+
+* `-m`: Makefile arguments.
+
+#### Example:
+
+```
+./buildCUDAProcess.sh -n gg_ttgg -b 1024 -t 128 -i 10 -r master -m avx2
+```
+
+#### Notes
+
+This script assumes that it is run from the profiling directory in the repository.
+Make sure to set the correct CUDA path according to your system.
+You may need to modify the script to set the correct GPU architecture or compiler options depending on your system.
+
+### sendData.py
+
+#### Usage:
+
+Go to the `tools/profiling` directory and run:
+
+```
+python3 sendData.py -r <report_path> -b <branch>
+```
+
+The following arguments are available for this script:
+
+* `-r` or `--reportPath`: This argument specifies the path for the reports that will be sent to the database.
+
+* `-f` or `--fields`: This argument specifies the fields in the JSON data that will be sent to the database. The default value is `['EvtsPerSec[MatrixElems] (3)', 'EvtsPerSec[MECalcOnly] (3)']`.
+
+* `-b` or `--branch`: This argument specifies the branch that the profiler data is in. The default value is `master`.
+
+* `-p` or `--profiler`: This argument enables CI profiling defaults. The default value is `0` (disabled).
+
+For example, to run the script with the default arguments, you can use the following command:
+
+```
+python3 sendData.py
+```
+
+To run the script with a custom report path and branch, you can use the following command:
+
+```
+python3 sendData.py -r /path/to/reports -b my_branch
+```
+
+Note that some options may not be relevant or may not work as expected in certain situations. For example, the `-p` option only works for CI profiler runs, where the expected environment variables (such as `SYCL_NAME_PREFIX` or `CUDA_NAME_PREFIX`) and the report folder layout have been set up.
+
+## Known issues:
+
+### Bug in GCC 11.3.0/11.3.1 using the LLVM DPCPP compiler
+
+There is a [bug](https://bugs.gentoo.org/842405) affecting GCC versions 11.3.0/11.3.1 when compiling the standalone physics processes, resulting in two compilation errors in the `fs_path.h` file: `.../fs_path.h:1209:9: error: 'end' is missing exception specification 'noexcept'` and `.../fs_path.h:1217:9: error: 'end' is missing exception specification 'noexcept'`. GCC version 11.2.0 is not affected, and the bug appears to be fixed in later versions (remains to be tested and cited).
+
+### libmg5amc_common.so: cannot open shared object file: No such file or directory
+
+The directory containing the `libmg5amc_common.so` library is not on the `LD_LIBRARY_PATH`; add it before running the executable (see the example at the end of this README).
+
+### Not linking correctly, or wrong linker version picked up?
+
+If you have problems with the wrong linker, check which GCC candidate the compiler finds with `./sycl_workspace/llvm/build/bin/clang++ -v`. If it is not the correct one, you can fix this by adding `--gcc-toolchain=/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/lib/gcc/x86_64-pc-linux-gnu/11.3.0` to the `CXXFLAGS`, which sets the GCC candidate to the desired GCC installation. Using `ENABLE_CI_PROFILER=1` automatically adds this flag in all the standalone physics process makefiles, both in SYCL and in CUDA.
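+### Example fix for the libmg5amc_common.so issue
+
+A minimal sketch of the fix for the error above. The paths here are placeholders, not part of the original scripts: point `MG_LIBS` at whatever directory your build actually left `libmg5amc_common.so` in (for example, the library folder that `buildSYCLProcess.sh` moves the built `build.*` directories into inside the workspace):
+
+```
+# Placeholder path: adjust to the directory that actually contains libmg5amc_common.so
+export MG_LIBS=/path/to/workspace_mg4gpu/<process_name>/lib
+
+# Prepend it to the runtime linker search path before running the executable
+export LD_LIBRARY_PATH=$MG_LIBS:$LD_LIBRARY_PATH
+./check.exe -p 2 32 2
+```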
From efff2b0a85b5062a99c88b28ab194a646c26264a Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Fri, 31 Mar 2023 17:02:36 +0200 Subject: [PATCH 233/509] Added arguments and did some minor changes to evaluation.py --- tools/profiling/evaluation.py | 50 ++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/tools/profiling/evaluation.py b/tools/profiling/evaluation.py index bce1a76e38..99bcddb4ae 100755 --- a/tools/profiling/evaluation.py +++ b/tools/profiling/evaluation.py @@ -4,6 +4,7 @@ Created on Tue Mar 30 09:59:03 2021 @author: andy +@edited: Jorgen Teig """ import json import os @@ -26,15 +27,15 @@ physicsProcesses = ['ee_mumu', 'gg_tt', 'gg_ttg', 'gg_ttgg', 'gg_ttggg'] -reportPath = 'C:\\Users\\jteig\\cernbox\\Documents\\CERN\\reports\\Sycl_v100s_Profiling_18.10.GCC11.3_CUDA11.6.2_MASTER\\' +reportPath = 'C:\\Users\\jteig\\cernbox\\Documents\\Report folder 2023\\Merged_23-02-07' -savePath = 'C:\\Users\\jteig\\cernbox\\Documents\\CERN\\Graphs\\' +savePath = 'C:\\Users\\jteig\\cernbox\\Documents\\Report folder 2023\\Graphs\\Graphs but big\\' -filePrefix = 'test_ATS-P_sycl_11.5' +filePrefix = 'test_A100_sycl_11.5' # 'test_v100s_sycl_11.5' -hardware = 'ATS-P' +hardware = 'Nvidia A100' #hardware = 'NVIDIA v100s' ############################# @@ -43,9 +44,11 @@ # ############################# -compare = False +compare = True -graphsToCompare = ['test_v100s_cuda_11.5_gg_ttgg', 'test_v100s_sycl_11.5_gg_ttgg'] +processToCompare = 'gg_tt' + +graphsToCompare = ['test_A100_SYCL_' + processToCompare , 'test_A100_CUDA_' + processToCompare] stat = 'MECalcOnly' #stat = 'MatrixElems' @@ -67,6 +70,10 @@ #exit(0) class Evaluation: + + # Remove warnings regarding chained assignment using pandas dataframes + # The code is still working as expected + pd.set_option('mode.chained_assignment', None) list_results=[] #List results Data=pd.DataFrame() #To store all results in one DataFrame @@ -330,11 +337,13 @@ def data_compare2(self,df_dict,compare_list): #enable grid plt.rcParams['grid.linestyle']=':' + plt.rc('font', size=15) + plt.rc('axes', labelsize=50) plt.grid() #setup x axis ax1.set_xscale('log') - plt.xticks(df_dict[list(df_dict.keys())[0]]['gridsize']) + plt.xticks(df_dict[list(df_dict.keys())[0]]['gridsize'],size=15) ax1.set_xticklabels(df_dict[list(df_dict.keys())[0]]['gridsize'],rotation=75) #setup y axis @@ -350,14 +359,14 @@ def data_compare2(self,df_dict,compare_list): ax1.set_yscale('log') #Add labels and title - plt.ylabel('Throughput\n'+ stat +' [s-1]') - plt.xlabel('Gridsize (nBlocksGPU * nThreadsGPU)') - plt.title("SYCL vs CUDA throughput for "+ graph1[4] + '_' + graph1[5] +" on " + hardware + "\n") + plt.ylabel('Throughput\n'+ stat +' [s-1]', size=30) + plt.xlabel('Gridsize (nBlocksGPU * nThreadsGPU)', size=30) + plt.title("SYCL vs CUDA throughput for "+ graph1[3] + '_' + graph1[4] +" on " + hardware + "\n", size=30,wrap=True) #Change colormap. 
More info here https://matplotlib.org/stable/tutorials/colors/colormaps.html cmap=plt.get_cmap('Set1') - i=1 + i=2 for data in compare_list: tempVar = 'EvtsPerSec['+ stat +'] (3)' @@ -370,13 +379,21 @@ def data_compare2(self,df_dict,compare_list): length=len(str(maxima_y))-1 label_maximas=str(round(maxima_y*10**-(length),3))+'e'+str(length) + if i == 2: + markerType='o' + else: + markerType='X' + #plot datasets ax1.scatter(df_dict[data]['gridsize'].to_list(),df_dict[data][tempVar].to_list(), label=data+ ' (max = %s)'%label_maximas, color=cmap(i), - s=150,alpha=0.9) + s=150,alpha=0.9, marker=markerType) + + ax1.plot(df_dict[data]['gridsize'].to_list(),df_dict[data][tempVar].to_list(), color=cmap(i)) + #Get next cmap color - i+=1 + i+=2 #plot max values ax1.scatter(maxima_x,maxima_y,c='r',marker='o',s=50) @@ -385,8 +402,9 @@ def data_compare2(self,df_dict,compare_list): ax1.legend(loc='best') - plt.tight_layout() + plt.autoscale() + plt.tight_layout() plt.show() @@ -397,7 +415,7 @@ def data_compare2(self,df_dict,compare_list): # args.s + graph1[3] + '_' + graph1[4] + '_vs_' + graph2[3] + '_' + graph2[4] - fig.savefig(args.s + 'SYCL_' + graph1[4] + '_' + graph1[5] + '_vs_CUDA_' + graph2[4] + '_' + graph2[5] + '_' + stat +'.png') + fig.savefig(args.s + 'SYCL_' + graph1[3] + '_' + graph1[4] + '_vs_CUDA_' + graph2[3] + '_' + graph2[4] + '_' + stat +'.png', bbox_inches="tight") def dataframes_statistical_transfomation(self,df_dict,stat): #This functions takes a dictionary of dataframes and returns a dictionary with dataframes @@ -473,7 +491,7 @@ def color(self,value): # To be done #test_df=Ev.data_compare(dataframes_conv,list_to_compare,'max') - #print(dataframes_statisical) + print(dataframes_statisical) Ev.data_compare2(dataframes_statisical,graphsToCompare) From edb8a067c1a80f5754cb3742149a6f8379209fa7 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Fri, 31 Mar 2023 17:03:02 +0200 Subject: [PATCH 234/509] Refactored sendData.py script --- tools/profiling/sendData.py | 75 ++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/tools/profiling/sendData.py b/tools/profiling/sendData.py index 6b247ff4fa..399f84318b 100644 --- a/tools/profiling/sendData.py +++ b/tools/profiling/sendData.py @@ -1,3 +1,21 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# +# __ __ _ ____ _ _ _ ____ ____ _ _ +# | \/ | __ _ __| | / ___| _ __ __ _ _ __ | |__ | || | / ___| | _ \ | | | | +# | |\/| | / _` | / _` | | | _ | '__| / _` | | '_ \ | '_ \ | || |_ | | _ | |_) | | | | | +# | | | | | (_| | | (_| | | |_| | | | | (_| | | |_) | | | | | |__ _| | |_| | | __/ | |_| | +# |_| |_| \__,_| \__,_| \____| |_| \__,_| | .__/ |_| |_| |_| \____| |_| \___/ +# |_| +# +# +# Python script for sending generated reports from performance profiling to InfluxDB instance +# using the MadGraph5_aMC@NLO GPU development framework +# +# Author: Jorgen Teig, CERN 2023 +# + import os import glob import json @@ -6,31 +24,27 @@ import subprocess import datetime import argparse - import sys # Parameter defaults URL = 'https://dbod-madgraph4gpu-db.cern.ch:8082/api/v2/write?bucket=ProfilerData' -secret = 'fV8dKViWTVdnA3Rw*qCeA@MYtZki@q' -Auth = ['db_user', secret] -physicsProcesses = ['ee_mumu', 'gg_ttggg', 'gg_ttgg', 'gg_ttg', 'gg_tt'] -absLayers = ['SYCL', 'CUDA'] -branch = 'master' -fields = ['EvtsPerSec[MatrixElems] (3)', 'EvtsPerSec[MECalcOnly] (3)'] -reportPath = 'C:\\Users\\jteig\\cernbox\\Documents\\test\\22-12-07_cudacpp_Xeon-Silver-4216_v100s_gcc-11.3_cuda-11.6.2_master' +secret = 
os.environ.get('MADGRAPH4GPU_DB_SECRET') +AUTH = ['db_user', secret] +PHYS_PROCESSES = ['ee_mumu', 'gg_ttggg', 'gg_ttgg', 'gg_ttg', 'gg_tt'] +ABS_LAYERS = ['SYCL', 'CUDA'] +BRANCH = 'master' +FIELDS = ['EvtsPerSec[MatrixElems] (3)', 'EvtsPerSec[MECalcOnly] (3)'] + +# Default reportPath (Useful for testing) +REPORT_PATH = 'C:\\Users\\jteig\\cernbox\\Documents\\test\\22-12-07_cudacpp_Xeon-Silver-4216_v100s_gcc-11.3_cuda-11.6.2_master' # Argument parser parser = argparse.ArgumentParser(description='A script for sending data from profiler to InfluxDB.') -parser.add_argument('-r', '--reportPath', help="Path for the reports that is being put into the database.", default=reportPath) -parser.add_argument('-f', '--fields', help="Fields in the JSON to be put into the database.", default=fields) -#parser.add_argument('-g', '--gpu', help="GPU used when profiling.", default=GPU) -#parser.add_argument('--GCCVersion', help="GCC version used when profiling.", default=GCCVersion) -#parser.add_argument('--CUDAVersion', help="CUDA version used when profiling.", default=CUDAVersion) -parser.add_argument('-a', '--absLayer', help="Abstraction layer used when profiling.", default=absLayers[0]) -parser.add_argument('-b', '--branch', help="Branch the profiler data is in.", default=branch) - -# Fix this +parser.add_argument('-r', '--reportPath', help="Path for the reports that is being put into the database.", default=REPORT_PATH) +parser.add_argument('-f', '--fields', help="Fields in the JSON to be put into the database.", default=FIELDS) +parser.add_argument('-a', '--absLayer', help="Abstraction layer used when profiling.", default=ABS_LAYERS[0]) +parser.add_argument('-b', '--branch', help="Branch the profiler data is in.", default=BRANCH) parser.add_argument('-p', '--profiler', help="Enable CI profiling defaults.", default='0') args = parser.parse_args() @@ -40,20 +54,18 @@ # if __name__=='__main__': - # Fix this + # Sets report path for extracting the reports generated from performanceProfiler.py if args.profiler == '1': if args.absLayer.upper() == "SYCL": syclNamePrefix = os.getenv('SYCL_NAME_PREFIX') - if syclNamePrefix == None: + if syclNamePrefix is None: logging.error('Sycl name prefix has not been set!') sys.exit(1) - # Fix the branch detection from the file name here reportfolder= "workspace_mg4gpu/" + datetime.datetime.now().strftime('%y-%m-%d') + '_' + syclNamePrefix + '_' + args.branch - print(reportfolder) if not os.path.exists(reportfolder): logging.error('SYCL report path does not exist!') @@ -62,7 +74,8 @@ elif args.absLayer.upper() == "CUDA": cudaNamePrefix = os.getenv('CUDA_NAME_PREFIX') - if cudaNamePrefix == None: + + if cudaNamePrefix is None: logging.error('Cuda name prefix has not been set!') sys.exit(1) @@ -88,7 +101,7 @@ for file in files: - with open(file, "r") as f: + with open(file, "r", encoding='utf-8') as f: fileContents = f.read() @@ -97,7 +110,7 @@ fileName = (os.path.basename(file)) - for process in physicsProcesses: + for process in PHYS_PROCESSES: if process in fileName.lower(): physicsProcess = process break @@ -116,21 +129,21 @@ DBdata = f'{physicsProcess},CPU={CPU},GPU={GPU},AbstractionLayer={args.absLayer},GCCVersion={GCCVersion},CUDAVersion={CUDAVersion},NumThreadsPerBlock={data[0]["NumThreadsPerBlock"]},NumBlocksPerGrid={data[0]["NumBlocksPerGrid"]},NumIterations={data[0]["NumIterations"]} Gridsize={gridsize}' - for field in fields: + for field in FIELDS: value = float(re.findall(r'[\d.]+',data[0][field])[0]) - + DBdata = DBdata + ',' + args.absLayer + "_" + 
field.replace(" ", "_") + '=' + str(value) - requestInfo = ["curl", "-i", "-k", '-XPOST', "-i", URL, "--header", "Authorization: Token "+Auth[0]+":"+Auth[1], "--data-raw", DBdata] - - request = subprocess.run(requestInfo, stdout=subprocess.DEVNULL) + requestInfo = ["curl", "-i", "-k", '-XPOST', "-i", URL, "--header", "Authorization: Token "+AUTH[0]+":"+AUTH[1], "--data-raw", DBdata] + + request = subprocess.run(requestInfo, stdout=subprocess.DEVNULL, check=True) f.close() - + if request.returncode != 0: print(str(datetime.datetime.now().strftime("%H:%M:%S")) + " Request FAILED! Data: " + DBdata) else: print(str(datetime.datetime.now().strftime("%H:%M:%S")) + " Request COMPLETED! Data: " + DBdata) - else: logging.error('No information/fields in the JSON report!') \ No newline at end of file + else: logging.error('No information/fields in the JSON report!') From 80cf5138a6a50cffb652c16f909ba7101e978ef6 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Fri, 31 Mar 2023 17:05:23 +0200 Subject: [PATCH 235/509] Split profiler into workflows based on GPU --- .github/workflows/a100_profiler.yml | 40 +++++++++++++++++++ .../{profiler.yml => v100s_profiler.yml} | 35 +--------------- 2 files changed, 41 insertions(+), 34 deletions(-) create mode 100644 .github/workflows/a100_profiler.yml rename .github/workflows/{profiler.yml => v100s_profiler.yml} (50%) diff --git a/.github/workflows/a100_profiler.yml b/.github/workflows/a100_profiler.yml new file mode 100644 index 0000000000..1cfed19d28 --- /dev/null +++ b/.github/workflows/a100_profiler.yml @@ -0,0 +1,40 @@ +name: A100 Performance Profiler + +on: + schedule: + - cron: '00 00 * * *' + +jobs: + sycl_A100_Profiling: + name: SYCL A100 Profiling + env: + SYCL_NAME_PREFIX: sycl_AMD-Epyc-7313_a100_gcc-11.3_cuda-12.0.1 + ENABLE_CI_PROFILER: 1 + + MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} + runs-on: [self-hosted, linux, a100] + steps: + - uses: actions/checkout@v2 + - name: Runs SYCL performanceProfiler.py script + run: cd tools/profiling/; + source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; + source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm; + python3 performanceProfiler.py -l 'SYCL' -b 'master' + - name: Uploads SYCL JSON files to DB + run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master + + cuda_a100_Profiling: + name: CUDA A100 Profiling + env: + CUDA_NAME_PREFIX: cudacpp_AMD-Epyc-7313_a100_gcc-11.3_cuda-12.0.1 + ENABLE_CI_PROFILER: 1 + + MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} + runs-on: [self-hosted, linux, a100] + steps: + - uses: actions/checkout@v2 + - name: Runs CUDA performanceProfiler.py script + run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; + python3 performanceProfiler.py -l 'CUDA' -b 'master' + - name: Uploads CUDA JSON files to DB + run: cd tools/profiling/; python3 sendData.py --absLayer CUDA --profiler 1 --branch master \ No newline at end of file diff --git a/.github/workflows/profiler.yml b/.github/workflows/v100s_profiler.yml similarity index 50% rename from .github/workflows/profiler.yml rename to .github/workflows/v100s_profiler.yml index 7374bd6c34..7317566e25 100644 --- a/.github/workflows/profiler.yml +++ b/.github/workflows/v100s_profiler.yml @@ -1,43 +1,10 @@ -name: Performance Profiler +name: V100s Performance Profiler on: schedule: - cron: '00 00 * * *' jobs: - sycl_A100_Profiling: - name: SYCL A100 Profiling - env: 
- SYCL_NAME_PREFIX: sycl_AMD-Epyc-7313_a100_gcc-11.3_cuda-12.0.1 - ENABLE_CI_PROFILER: 1 - - MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} - runs-on: [self-hosted, linux, a100] - steps: - - uses: actions/checkout@v2 - - name: Runs SYCL performanceProfiler.py script - run: cd tools/profiling/; - source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; - source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm; - python3 performanceProfiler.py -l 'SYCL' -b 'master' - - name: Uploads SYCL JSON files to DB - run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master - - cuda_a100_Profiling: - name: CUDA A100 Profiling - env: - CUDA_NAME_PREFIX: cudacpp_AMD-Epyc-7313_a100_gcc-11.3_cuda-12.0.1 - ENABLE_CI_PROFILER: 1 - - MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} - runs-on: [self-hosted, linux, a100] - steps: - - uses: actions/checkout@v2 - - name: Runs CUDA performanceProfiler.py script - run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; - python3 performanceProfiler.py -l 'CUDA' -b 'master' - - name: Uploads CUDA JSON files to DB - run: cd tools/profiling/; python3 sendData.py --absLayer CUDA --profiler 1 --branch master sycl_v100s_Profiling: name: SYCL V100S Profiling From c5590a0c0ec2ca2ba4bfe0ebe046be808bcf4982 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Fri, 31 Mar 2023 17:09:23 +0200 Subject: [PATCH 236/509] Added paths to the SYCL CI workflow --- .github/workflows/sycl.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml index 43a2ca0f02..3af91957f8 100644 --- a/.github/workflows/sycl.yml +++ b/.github/workflows/sycl.yml @@ -3,8 +3,12 @@ name: SYCL CI on: push: branches: [ master ] + paths: + - 'epochX/sycl/**' pull_request: branches: [ master ] + paths: + - 'epochX/sycl/**' jobs: GPU: From 7eb8da74d7d8ed600354fa0416c534f4e8b34e20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B8rgen=20Teig?= Date: Fri, 31 Mar 2023 17:11:31 +0200 Subject: [PATCH 237/509] Removed status badges because they eventually have to get added again --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 1bd658ad1c..966e2d7bc9 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,5 @@ # Madgraph 4 GPU -[![C/C++ CI](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/c-cpp.yml/badge.svg)](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/c-cpp.yml) [![SYCL CI](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/sycl.yml/badge.svg)](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/sycl.yml) [![CUDA Profiler](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/cudaProfiler.yml/badge.svg)](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/cudaProfiler.yml) [![SYCL Profiler](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/syclProfiler.yml/badge.svg)](https://github.com/Jooorgen/madgraph4gpu/actions/workflows/syclProfiler.yml) - This repository contains code developed in the context of porting the [MadGraph5_aMC@NLO](https://cp3.irmp.ucl.ac.be/projects/madgraph/) event generator software onto GPU platforms and vector instructions on CPUs. MadGraph5_aMC@NLO is able to generate code for various physics processes in different programming languages (Fortran, C, C++). 
The code generated in this repository in "epochX" of the MadGraph5_aMC@NLO generator allows to also produce source code for those physics processes to run on GPU and CPU platforms. From 8946a0c6b85641501c8b550e04bf5b8acd103f95 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Fri, 31 Mar 2023 17:21:19 +0200 Subject: [PATCH 238/509] Added script for starting a container running the profiling --- tools/profiling/containerSetup.sh | 98 +++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 tools/profiling/containerSetup.sh diff --git a/tools/profiling/containerSetup.sh b/tools/profiling/containerSetup.sh new file mode 100644 index 0000000000..11dc0774b0 --- /dev/null +++ b/tools/profiling/containerSetup.sh @@ -0,0 +1,98 @@ +# Preliminary setup +podman=${podman:-podman} +distribution=$(. /etc/os-release;echo $ID$VERSION_ID) +runnerName=GPURunner_itscrd-a100 +sourceImage=nvidia/cuda:12.0.1-devel-rockylinux8 +tag=githubci-cuda12.0.1-gcc11.3-clang +GitHubRunnerTags=Linux,x64,a100 +githubToken=$1 + +# Links +runnerURL=https://github.com/actions/runner/releases/download/v2.301.1/actions-runner-linux-x64-2.301.1.tar.gz +nvidiaContainerToolkitLink=https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.repo +repoURL=https://github.com/Jooorgen/madgraph4gpu + +if ! which podman > /dev/null; then + echo "Podman not installed. Trying now ..." + sudo yum install podman + curl -s -L $nvidiaContainerToolkitLink > nvidia-container-runtime.repo + sudo mv nvidia-container-runtime.repo /etc/yum-puppet.repos.d/ + sudo yum install nvidia-container-runtime + + sudo sed -i 's/^#no-cgroups = false/no-cgroups = true/;' /etc/nvidia-container-runtime/config.toml + exit 0 +fi + +if $runTest; then + # Test that container starts up + $podman run --rm --security-opt=label=disable nvidia/cuda:11.5.0-devel-centos8 nvidia-smi || exit 1 +fi + +cat > entrypoint.sh << "EOF" +#!/bin/bash +RUNNER=/home/CI/actions-runner/run.sh + +while true; do + if ! pgrep -f ${RUNNER} > /dev/null 2>&1; then + # Runner hasn't been started yet or exited because of failure / update + ${RUNNER} + else + # Runner was restarted, and is running in background. Let's wait for it. 
+ PID=$(pgrep -f ${RUNNER}) && tail --pid=$PID -f /dev/null + fi + sleep 10 +done +EOF + +# In container: +# - install cmake, git, which +cat > containerManifest < Date: Wed, 12 Apr 2023 11:30:18 +0200 Subject: [PATCH 239/509] Reverted changes to the sycl directory --- epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk | 15 +++------------ epochX/sycl/ee_mumu.sa/src/sycl_src.mk | 8 +------- epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk | 15 +++------------ epochX/sycl/gg_tt.sa/src/sycl_src.mk | 8 +------- epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk | 15 +++------------ epochX/sycl/gg_ttg.sa/src/sycl_src.mk | 8 +------- epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk | 15 +++------------ epochX/sycl/gg_ttgg.sa/src/sycl_src.mk | 8 +------- epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk | 16 ++++------------ epochX/sycl/gg_ttggg.sa/src/sycl_src.mk | 8 +------- 10 files changed, 21 insertions(+), 95 deletions(-) diff --git a/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk b/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk index 7e6cbaba0e..badb5bee57 100644 --- a/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/ee_mumu.sa/SubProcesses/sycl.mk @@ -94,15 +94,6 @@ export NTPBMAX #=== Set the SYCL/C++ compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD, NTPBMAX -# Add option to enable CI profiler use -$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) -ifeq ($(ENABLE_CI_PROFILER),1) - CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" - - # Sets the device ID to the GPU (oneAPI Toolkit) when running check/cmpFcheck in the GitHub CI - ENABLE_DEVICE_ID = "--device_id=2" -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) @@ -264,7 +255,7 @@ $(BUILDDIR)/%.o : %.f *.inc $(fsycl_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fsycl_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so - $(CXX) $(CXXFLAGS) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs + $(CXX) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs #------------------------------------------------------------------------------- @@ -346,8 +337,8 @@ check: cmpFcheck # Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID}" + @echo "$(BUILDDIR)/check.exe -p 2 32 2" @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID} | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi #------------------------------------------------------------------------------- diff --git a/epochX/sycl/ee_mumu.sa/src/sycl_src.mk b/epochX/sycl/ee_mumu.sa/src/sycl_src.mk index f7ed33af40..504c2d4dd8 100644 --- a/epochX/sycl/ee_mumu.sa/src/sycl_src.mk +++ b/epochX/sycl/ee_mumu.sa/src/sycl_src.mk @@ -26,12 +26,6 @@ endif #=== Set the SYCL compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD -# Add option to enable CI profiler use -$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) -ifeq ($(ENABLE_CI_PROFILER),1) - CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE @@ -156,7 +150,7 @@ endif cleanall: @echo - make -f sycl_src.mk clean + make clean @echo rm -rf build.* diff --git a/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk index 7e6cbaba0e..badb5bee57 100644 --- a/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_tt.sa/SubProcesses/sycl.mk @@ -94,15 +94,6 @@ export NTPBMAX #=== Set the SYCL/C++ compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD, NTPBMAX -# Add option to enable CI profiler use -$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) -ifeq ($(ENABLE_CI_PROFILER),1) - CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" - - # Sets the device ID to the GPU (oneAPI Toolkit) when running check/cmpFcheck in the GitHub CI - ENABLE_DEVICE_ID = "--device_id=2" -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) @@ -264,7 +255,7 @@ $(BUILDDIR)/%.o : %.f *.inc $(fsycl_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fsycl_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so - $(CXX) $(CXXFLAGS) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs + $(CXX) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs #------------------------------------------------------------------------------- @@ -346,8 +337,8 @@ check: cmpFcheck # Target: cmpFcheck (compare ME results from the C++ and Fortran 
with C++ MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID}" + @echo "$(BUILDDIR)/check.exe -p 2 32 2" @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID} | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_tt.sa/src/sycl_src.mk b/epochX/sycl/gg_tt.sa/src/sycl_src.mk index f7ed33af40..504c2d4dd8 100644 --- a/epochX/sycl/gg_tt.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_tt.sa/src/sycl_src.mk @@ -26,12 +26,6 @@ endif #=== Set the SYCL compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD -# Add option to enable CI profiler use -$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) -ifeq ($(ENABLE_CI_PROFILER),1) - CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE @@ -156,7 +150,7 @@ endif cleanall: @echo - make -f sycl_src.mk clean + make clean @echo rm -rf build.* diff --git a/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk index 7e6cbaba0e..badb5bee57 100644 --- a/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttg.sa/SubProcesses/sycl.mk @@ -94,15 +94,6 @@ export NTPBMAX #=== Set the SYCL/C++ compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD, NTPBMAX -# Add option to enable CI profiler use -$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) -ifeq ($(ENABLE_CI_PROFILER),1) - CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" - - # Sets the device ID to the GPU (oneAPI Toolkit) when running check/cmpFcheck in the GitHub CI - ENABLE_DEVICE_ID = "--device_id=2" -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) @@ -264,7 +255,7 @@ $(BUILDDIR)/%.o : %.f *.inc $(fsycl_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fsycl_main): 
$(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so - $(CXX) $(CXXFLAGS) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs + $(CXX) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs #------------------------------------------------------------------------------- @@ -346,8 +337,8 @@ check: cmpFcheck # Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID}" + @echo "$(BUILDDIR)/check.exe -p 2 32 2" @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID} | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_ttg.sa/src/sycl_src.mk b/epochX/sycl/gg_ttg.sa/src/sycl_src.mk index f7ed33af40..504c2d4dd8 100644 --- a/epochX/sycl/gg_ttg.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_ttg.sa/src/sycl_src.mk @@ -26,12 +26,6 @@ endif #=== Set the SYCL compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD -# Add option to enable CI profiler use -$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) -ifeq ($(ENABLE_CI_PROFILER),1) - CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE @@ -156,7 +150,7 @@ endif cleanall: @echo - make -f sycl_src.mk clean + make clean @echo rm -rf build.* diff --git a/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk index ce9d8c69d9..badb5bee57 100644 --- a/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttgg.sa/SubProcesses/sycl.mk @@ -94,15 +94,6 @@ export NTPBMAX #=== Set the SYCL/C++ compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD, NTPBMAX -# Add option to enable CI profiler use -$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) -ifeq ($(ENABLE_CI_PROFILER),1) - CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" - - # Sets the device ID to the GPU (oneAPI Toolkit) when running check/cmpFcheck in the GitHub CI - ENABLE_DEVICE_ID = "--device_id=2" -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) @@ -264,7 +255,7 @@ $(BUILDDIR)/%.o : %.f *.inc $(fsycl_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fsycl_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so - $(CXX) $(CXXFLAGS) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs + $(CXX) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs #------------------------------------------------------------------------------- @@ -346,8 +337,8 @@ check: cmpFcheck # Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID}" + @echo "$(BUILDDIR)/check.exe -p 2 32 2" @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe ${ENABLE_DEVICE_ID} -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk b/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk index f7ed33af40..504c2d4dd8 100644 --- a/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_ttgg.sa/src/sycl_src.mk @@ -26,12 +26,6 @@ endif #=== Set the SYCL compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD -# Add option to enable CI profiler use -$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) -ifeq ($(ENABLE_CI_PROFILER),1) - CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE @@ -156,7 +150,7 @@ endif cleanall: @echo - make -f sycl_src.mk clean + make clean @echo rm -rf build.* diff --git a/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk b/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk index edf3fbb1a1..badb5bee57 100644 --- a/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk +++ b/epochX/sycl/gg_ttggg.sa/SubProcesses/sycl.mk @@ -43,6 +43,7 @@ INCFLAGS += -I$(TOOLSDIR) #=== Configure the C++ compiler CXXFLAGS = $(OPTFLAGS) -std=c++20 $(INCFLAGS) -Wall -Wshadow -Wextra +CXXFLAGS+= -ffast-math # see issue #117 ifndef SYCLFLAGS $(error SYCLFLAGS not set) endif @@ -93,15 +94,6 @@ export NTPBMAX #=== Set the SYCL/C++ compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD, NTPBMAX -# Add option to enable CI profiler use -$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) -ifeq ($(ENABLE_CI_PROFILER),1) - CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" - - # Sets the device ID to the GPU (oneAPI Toolkit) when running check/cmpFcheck in the GitHub CI - ENABLE_DEVICE_ID = "--device_id=2" -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) @@ -263,7 +255,7 @@ $(BUILDDIR)/%.o : %.f *.inc $(fsycl_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fsycl_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so - $(CXX) $(CXXFLAGS) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs + $(CXX) $(SYCLFLAGS) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).a 
$(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs #------------------------------------------------------------------------------- @@ -345,8 +337,8 @@ check: cmpFcheck # Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID}" + @echo "$(BUILDDIR)/check.exe -p 2 32 2" @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 ${ENABLE_DEVICE_ID} | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi #------------------------------------------------------------------------------- diff --git a/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk b/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk index f7ed33af40..504c2d4dd8 100644 --- a/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk +++ b/epochX/sycl/gg_ttggg.sa/src/sycl_src.mk @@ -26,12 +26,6 @@ endif #=== Set the SYCL compiler flags appropriate to user-defined choices of FPTYPE, HELINL, HRDCOD -# Add option to enable CI profiler use -$(info ENABLE_CI_PROFILER=$(ENABLE_CI_PROFILER)) -ifeq ($(ENABLE_CI_PROFILER),1) - CXXFLAGS += --gcc-toolchain="/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8" -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE @@ -156,7 +150,7 @@ endif cleanall: @echo - make -f sycl_src.mk clean + make clean @echo rm -rf build.* From cc39fe8ec409a74e218dc48ae1983e58ca2e4f7a Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 24 Apr 2023 11:47:18 +0200 Subject: [PATCH 240/509] Remove CVMFS from profiler workflows --- .github/workflows/a100_profiler.yml | 4 +--- .github/workflows/v100s_profiler.yml | 5 ++--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/workflows/a100_profiler.yml b/.github/workflows/a100_profiler.yml index 1cfed19d28..689700dcc2 100644 --- a/.github/workflows/a100_profiler.yml +++ b/.github/workflows/a100_profiler.yml @@ -17,8 +17,6 @@ jobs: - uses: actions/checkout@v2 - name: Runs SYCL performanceProfiler.py script run: cd tools/profiling/; - source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; - source 
/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm;
          python3 performanceProfiler.py -l 'SYCL' -b 'master'
      - name: Uploads SYCL JSON files to DB
        run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master
@@ -34,7 +32,7 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - name: Runs CUDA performanceProfiler.py script
-        run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh;
+        run: cd tools/profiling/;
          python3 performanceProfiler.py -l 'CUDA' -b 'master'
      - name: Uploads CUDA JSON files to DB
        run: cd tools/profiling/; python3 sendData.py --absLayer CUDA --profiler 1 --branch master
\ No newline at end of file
diff --git a/.github/workflows/v100s_profiler.yml b/.github/workflows/v100s_profiler.yml
index 7317566e25..a1cc4e710a 100644
--- a/.github/workflows/v100s_profiler.yml
+++ b/.github/workflows/v100s_profiler.yml
@@ -17,8 +17,7 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - name: Runs SYCL performanceProfiler.py script
-        run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh;
-          source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm;
+        run: cd tools/profiling/;
          python3 performanceProfiler.py -l 'SYCL' -b 'master'
      - name: Uploads SYCL JSON files to DB
        run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master
@@ -34,7 +33,7 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - name: Runs CUDA performanceProfiler.py script
-        run: cd tools/profiling/; source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh;
+        run: cd tools/profiling/;
          python3 performanceProfiler.py -l 'CUDA' -b 'master'
      - name: Uploads CUDA JSON files to DB
        run: cd tools/profiling/; python3 sendData.py --absLayer CUDA --profiler 1 --branch master
\ No newline at end of file

From d98e8f6f48662bc714596913c4e3993678155c31 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 25 Apr 2023 00:01:10 +0200
Subject: [PATCH 241/509] Removed CVMFS from CXX variable

---
 tools/profiling/buildCUDAProcess.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index 80edca074b..271e04114a 100755
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -59,7 +59,8 @@ prefix=$(pwd)
 export USEBUILDDIR=1
 export NTPBMAX=1024
-export CXX=/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/bin/g++
+#export CXX=/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/bin/g++
+export CXX=`which g++`
 export MG_EXE="./gcheck.exe" #GPU
 #export MG_EXE="./check.exe" #CPU
 export CUDA_HOME=/usr/local/cuda-12.0/

From 684806166208f680a4566996c0093c06c3930016 Mon Sep 17 00:00:00 2001
From: Jooorgen
Date: Tue, 25 Apr 2023 00:02:47 +0200
Subject: [PATCH 242/509] Changed GCC version in CUDA a100 Profiler to what's in the container

---
 .github/workflows/a100_profiler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/a100_profiler.yml b/.github/workflows/a100_profiler.yml
index 689700dcc2..c95d4bfaaa 100644
--- a/.github/workflows/a100_profiler.yml
+++ b/.github/workflows/a100_profiler.yml
@@ -24,7 +24,7 @@ jobs:
   cuda_a100_Profiling:
     name: CUDA A100 Profiling
     env:
-      CUDA_NAME_PREFIX: cudacpp_AMD-Epyc-7313_a100_gcc-11.3_cuda-12.0.1
+      CUDA_NAME_PREFIX: cudacpp_AMD-Epyc-7313_a100_gcc-11.2.1_cuda-12.0.1
      ENABLE_CI_PROFILER: 1
      MADGRAPH4GPU_DB_SECRET: ${{
secrets.MADGRAPH4GPU_DB_SECRET }}

From 5d87a1e9886437063b7a31b2c446ac21c16f5f08 Mon Sep 17 00:00:00 2001
From: Jorgen Teig
Date: Fri, 5 May 2023 17:29:02 +0200
Subject: [PATCH 243/509] Testing abstraction of CUDA function to separate header file

---
 .../ee_mumu.mad/SubProcesses/MemoryBuffers.h  | 13 +++++-----
 .../SubProcesses/P1_epem_mupmum/CPPProcess.cc |  9 ++++---
 .../P1_epem_mupmum/gpu_abstraction.h          |  1 +
 .../SubProcesses/gpu_abstraction.h            | 26 +++++++++++++++++++
 4 files changed, 39 insertions(+), 10 deletions(-)
 create mode 120000 epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/gpu_abstraction.h
 create mode 100644 epochX/cudacpp/ee_mumu.mad/SubProcesses/gpu_abstraction.h

diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h
index 75b552c5ad..7e7a868189 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h
@@ -6,6 +6,7 @@
 #include "mgOnGpuCxtypes.h"
 
 #include "CudaRuntime.h"
+#include "gpu_abstraction.h"
 #include "Parameters_sm.h"
 
 #include
@@ -121,11 +122,11 @@ namespace mg5amcCpu
     PinnedHostBufferBase( const size_t size )
       : BufferBase( size, false )
     {
-      checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) );
+      checkCuda( gpuMallocHost( &( this->m_data ), this->bytes() ) );
     }
     virtual ~PinnedHostBufferBase()
     {
-      checkCuda( cudaFreeHost( this->m_data ) );
+      checkCuda( gpuFreeHost( this->m_data ) );
     }
   };
 #endif
@@ -141,11 +142,11 @@
     DeviceBufferBase( const size_t size )
       : BufferBase( size, true )
     {
-      checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) );
+      checkCuda( gpuMalloc( &( this->m_data ), this->bytes() ) );
     }
     virtual ~DeviceBufferBase()
     {
-      checkCuda( cudaFree( this->m_data ) );
+      checkCuda( gpuFree( this->m_data ) );
     }
   };
 #endif
@@ -497,7 +498,7 @@
       throw std::runtime_error( sstr.str() );
     }
     // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array
-    checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) );
+    checkCuda( gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ) );
   }
 #endif
 
@@ -520,7 +521,7 @@
       throw std::runtime_error( sstr.str() );
     }
     // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array
-    checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) );
+    checkCuda( gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ) );
   }
 #endif
 
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
index e386562b88..72aebabb68 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
@@ -11,6 +11,7 @@
 
 #include "CudaRuntime.h"
 #include "HelAmps_sm.h"
+#include "gpu_abstraction.h"
 
 #include "MemoryAccessAmplitudes.h"
 #include "MemoryAccessCouplings.h"
 #include "MemoryAccessCouplingsFixed.h"
@@ -487,8 +488,8 @@
     const fptype tIPD[2] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ };
     const cxtype tIPC[3] = { cxmake( m_pars->GC_3 ), cxmake( m_pars->GC_50 ), cxmake( m_pars->GC_59 ) };
 #ifdef __CUDACC__
-    checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) );
-    checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) )
); + checkCuda( gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + checkCuda( gpuMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ) ); #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); memcpy( cIPC, tIPC, 3 * sizeof( cxtype ) ); @@ -746,8 +747,8 @@ namespace mg5amcCpu } } #ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); + checkCuda( gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); + checkCuda( gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/gpu_abstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/gpu_abstraction.h new file mode 120000 index 0000000000..1a79490b1a --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/gpu_abstraction.h @@ -0,0 +1 @@ +../gpu_abstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/gpu_abstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/gpu_abstraction.h new file mode 100644 index 0000000000..6814f60d4b --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/gpu_abstraction.h @@ -0,0 +1,26 @@ +// gpu_abstraction.h +#pragma once + +#if defined(__HIP_PLATFORM_HCC__) + #include + + #define gpuError_t hipError_t + #define gpuMalloc hipMalloc + #define gpuFree hipFree + #define gpuMemcpy hipMemcpy + // Add other necessary abstractions + +#elif defined(__CUDACC__) + + #define gpuError_t cudaError_t + #define gpuMalloc cudaMalloc + #define gpuMallocHost cudaMallocHost + #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice + #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost + #define gpuMemcpyToSymbol cudaMemcpyToSymbol + #define gpuFree cudaFree + #define gpuFreeHost cudaFreeHost + #define gpuMemcpy cudaMemcpy + // Add other necessary abstractions + +#endif \ No newline at end of file From 960877875998a1017c16fdb580eef4fc54c99fe4 Mon Sep 17 00:00:00 2001 From: Jorgen Teig Date: Thu, 11 May 2023 12:07:03 +0200 Subject: [PATCH 244/509] Fleshed out HIP macros and added missing macros in code --- .../cudacpp/ee_mumu.mad/SubProcesses/Bridge.h | 5 +- .../SubProcesses/MatrixElementKernels.cc | 7 +-- .../ee_mumu.mad/SubProcesses/MemoryBuffers.h | 13 +++-- .../SubProcesses/P1_epem_mupmum/CPPProcess.cc | 14 ++--- .../SubProcesses/gpu_abstraction.h | 52 +++++++++++++------ .../ee_mumu.mad/SubProcesses/runTest.cc | 3 +- tools/profiling/buildSYCLProcess.sh | 16 +++--- 7 files changed, 67 insertions(+), 43 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index faa8f95d1d..053b07d2c2 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -6,6 +6,7 @@ #include "CPPProcess.h" // for CPPProcess #include "CrossSectionKernels.h" // for flagAbnormalMEs +#include "gpu_abstraction.h" // for GPU abstraction, checkCuda is run here #include "MatrixElementKernels.h" // for MatrixElementKernelHost, MatrixElementKernelDevice #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc @@ -279,11 +280,11 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), 
momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice );
     }
     else
     {
-      checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice );
       const int thrPerEvt = mgOnGpu::npar * mgOnGpu::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
       //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
       dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
     }
     if constexpr( std::is_same_v )
     {
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc
index da81c99218..e5054db077 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc
@@ -2,6 +2,7 @@
 
 #include "CPPProcess.h"
 #include "CudaRuntime.h"
+#include "gpu_abstraction.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryBuffers.h"
 
@@ -203,7 +204,7 @@ namespace mg5amcGpu
 #else
     sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
+    gpuPeekAtLastError();
     // ... 0d2. Copy back good helicity mask to the host
     copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
     // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -225,8 +226,8 @@ namespace mg5amcGpu
 #else
    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
-    checkCuda( cudaDeviceSynchronize() );
+    gpuPeekAtLastError();
+    gpuDeviceSynchronize();
   }
 
   //--------------------------------------------------------------------------
 
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h
index 7e7a868189..cfa4eae7b4 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h
@@ -5,7 +5,6 @@
 
 #include "mgOnGpuCxtypes.h"
 
-#include "CudaRuntime.h"
 #include "gpu_abstraction.h"
 #include "Parameters_sm.h"
 
 #include
@@ -122,11 +121,11 @@ namespace mg5amcCpu
     PinnedHostBufferBase( const size_t size )
       : BufferBase( size, false )
     {
-      checkCuda( gpuMallocHost( &( this->m_data ), this->bytes() ) );
+      gpuMallocHost( &( this->m_data ), this->bytes() );
     }
     virtual ~PinnedHostBufferBase()
     {
-      checkCuda( gpuFreeHost( this->m_data ) );
+      gpuFreeHost( this->m_data );
     }
   };
 #endif
@@ -142,11 +141,11 @@
     DeviceBufferBase( const size_t size )
       : BufferBase( size, true )
     {
-      checkCuda( gpuMalloc( &( this->m_data ), this->bytes() ) );
+      gpuMalloc( &( this->m_data ), this->bytes() );
     }
     virtual ~DeviceBufferBase()
     {
-      checkCuda( gpuFree( this->m_data ) );
+      gpuFree( this->m_data );
     }
   };
 #endif
@@ -498,7 +497,7 @@
       throw std::runtime_error( sstr.str() );
     }
     // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array
-    checkCuda( gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ) );
+    gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice );
  }
 #endif
 
@@ -521,7 +520,7 @@
      throw std::runtime_error( sstr.str() );
    }
    // NB (PR #45): cudaMemcpy involves an intermediate
memcpy to pinned memory if host array is a not a pinned host array - checkCuda( gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 72aebabb68..b76f4bd8a2 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -9,9 +9,8 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" -#include "gpu_abstraction.h" +#include "gpu_abstraction.h" // for GPU abstraction, checkCuda is run on macros defined here #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" #include "MemoryAccessCouplingsFixed.h" @@ -28,6 +27,7 @@ #include #include +#include #include #include #include @@ -447,7 +447,7 @@ namespace mg5amcCpu { 1, -1, -1, -1 }, { 1, -1, -1, 1 } }; #ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * mgOnGpu::npar * sizeof( short ) ) ); + gpuMemcpyToSymbol( cHel, tHel, ncomb * mgOnGpu::npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * mgOnGpu::npar * sizeof( short ) ); #endif @@ -488,8 +488,8 @@ namespace mg5amcCpu const fptype tIPD[2] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; const cxtype tIPC[3] = { cxmake( m_pars->GC_3 ), cxmake( m_pars->GC_50 ), cxmake( m_pars->GC_59 ) }; #ifdef __CUDACC__ - checkCuda( gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - checkCuda( gpuMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ) ); + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + gpuMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ); #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); memcpy( cIPC, tIPC, 3 * sizeof( cxtype ) ); @@ -747,8 +747,8 @@ namespace mg5amcCpu } } #ifdef __CUDACC__ - checkCuda( gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/gpu_abstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/gpu_abstraction.h index 6814f60d4b..1f944b7397 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/gpu_abstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/gpu_abstraction.h @@ -1,26 +1,46 @@ // gpu_abstraction.h #pragma once -#if defined(__HIP_PLATFORM_HCC__) - #include +#ifdef __CUDACC__ + // NVIDIA GPU using CUDA + #include - #define gpuError_t hipError_t - #define gpuMalloc hipMalloc - #define gpuFree hipFree - #define gpuMemcpy hipMemcpy - // Add other necessary abstractions + #define gpuError_t cudaError_t + #define gpuPeekAtLastError cudaPeekAtLastError -#elif defined(__CUDACC__) + #define gpuMallocHost(ptr, size) checkCuda( cudaMallocHost(ptr, size) ) + #define gpuMalloc(ptr, size) checkCuda( cudaMalloc(ptr, size) ) - #define gpuError_t cudaError_t - #define gpuMalloc cudaMalloc - #define gpuMallocHost cudaMallocHost + #define gpuMemcpy(dstData, srcData, srcBytes, func) checkCuda( cudaMemcpy(dstData, srcData, srcBytes, func) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost - #define gpuMemcpyToSymbol 
cudaMemcpyToSymbol - #define gpuFree cudaFree - #define gpuFreeHost cudaFreeHost - #define gpuMemcpy cudaMemcpy - // Add other necessary abstractions + #define gpuMemcpyToSymbol(type1, type2, size) checkCuda( cudaMemcpyToSymbol(type1, type2, size) ) + + #define gpuFree(ptr) checkCuda( cudaFree(ptr) ) + #define gpuFreeHost(ptr) checkCuda( cudaFreeHost(ptr) ) + + #define gpuDeviceSynchronize cudaDeviceSynchronize + #define gpuDeviceReset cudaDeviceReset + +#elif defined(__HIP__) + // AMD GPU using HIP + #include + + #define gpuError_t hipError_t + #define gpuPeekAtLastError hipPeekAtLastError + + #define gpuMallocHost(ptr, size) checkHip( hipMallocHost(ptr, size) ) + #define gpuMalloc(ptr, size) checkHip( hipMalloc(ptr, size) ) + + #define gpuMemcpy(dstData, srcData, srcBytes, func) checkHip( hipMemcpy(dstData, srcData, srcBytes, func) ) + #define gpuMemcpyHostToDevice hipMemcpyHostToDevice + #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost + #define gpuMemcpyToSymbol(type1, type2, size) checkHip( hipMemcpyToSymbol(type1, type2, size) ) + + #define gpuFree(ptr) checkHip( hipFree(ptr) ) + #define gpuFreeHost(ptr) checkHip( hipFreeHost(ptr) ) + + #define gpuDeviceSynchronize hipDeviceSynchronize + #define gpuDeviceReset hipDeviceReset #endif \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc index a1cec39ced..a0d551d5d5 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc @@ -1,6 +1,7 @@ #include "mgOnGpuConfig.h" #include "CPPProcess.h" +#include "gpu_abstraction.h" // for GPU abstraction, checkCuda is run on macros defined here #include "MadgraphTest.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" @@ -118,7 +119,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index c0be7d705a..4475fb4eee 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -88,17 +88,19 @@ export NTPBMAX=1024 export CUDA_PATH=/usr/local/cuda-12.0/ export WORKSPACE=$prefix/workspace_mg4gpu +export CXTYPE="thrust" + # Old SYCLFLAGS # export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=$SM_LEVEL' -fgpu-rdc --cuda-path=$CUDA_PATH" -export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=$SM_LEVEL -Xclang -fdenormal-fp-math=ieee" +export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xcuda-ptxas --maxrregcount=255 -Xcuda-ptxas --verbose -Xsycl-target-backend --cuda-gpu-arch=$SM_LEVEL" # Compilation using OneAPI Toolkit through CVMFS -export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++ +#export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++ -# Compilation for OneAPI LLVM compiler -#export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace -#export CXX=$DPCPP_HOME/llvm/build/bin/clang++ +# Compilation with LLVM DPC++ compiler +export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace +export CXX=$DPCPP_HOME/llvm/llvm-20230418-fea99cc9ad67-gcc-11.2.1-cuda-12.0/bin/clang++ # Sets CUDA in PATH export 
PATH=$CUDA_HOME:$PATH @@ -144,7 +146,7 @@ export MG5AMC_CARD_PATH=$MG_PROC_DIR/Cards # Build executable cd $MG_SP_DIR -make -j +make -j build.d_inl0_hrd1/check.exe mv -f ../../lib/build.*/ $MG_LIBS_DIR #2>/dev/null; true mv -f build.*/ $MG_EXE_DIR @@ -158,4 +160,4 @@ if [ $DEVICE_ID == "info" ]; then else # Add MG Libs to linker library path and run the executable LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file ${REPORT_FOLDER}/test_${MG_PROC}_${SYCL_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations -fi \ No newline at end of file +fi From e4dc25ee13cc8c8b8732382645fc6109c49c08f3 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 31 May 2023 15:38:21 +0200 Subject: [PATCH 245/509] Added new GpuRuntime to replace CudaRuntime and added macros for kernel launches --- .../cudacpp/ee_mumu.mad/SubProcesses/Bridge.h | 4 +- .../ee_mumu.mad/SubProcesses/GpuRuntime.h | 80 ++++++++++++++++++ .../SubProcesses/MatrixElementKernels.cc | 15 ++-- .../ee_mumu.mad/SubProcesses/MemoryBuffers.h | 2 +- .../SubProcesses/P1_epem_mupmum/CPPProcess.cc | 4 +- .../SubProcesses/P1_epem_mupmum/GpuRuntime.h | 1 + .../SubProcesses/P1_epem_mupmum/check_sa.cc | 7 +- .../SubProcesses/RamboSamplingKernels.cc | 6 +- .../SubProcesses/RandomNumberKernels.cc | 2 +- .../ee_mumu.mad/SubProcesses/fbridge.cc | 4 +- .../SubProcesses/gpu_abstraction.h | 84 ++++++++++++------- 11 files changed, 158 insertions(+), 51 deletions(-) create mode 100644 epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h create mode 120000 epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuRuntime.h diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index 053b07d2c2..18786c0a1a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -6,7 +6,7 @@ #include "CPPProcess.h" // for CPPProcess #include "CrossSectionKernels.h" // for flagAbnormalMEs -#include "gpu_abstraction.h" // for GPU abstraction, checkCuda is run here +#include "GpuRuntime.h" // for GPU abstraction, checkCuda is run here #include "MatrixElementKernels.h" // for MatrixElementKernelHost, MatrixElementKernelDevice #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc @@ -287,7 +287,7 @@ namespace mg5amcCpu gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ); const int thrPerEvt = mgOnGpu::npar * mgOnGpu::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... 
this seems slower
-      dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+      gpuLaunchKernel(dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
     }
     if constexpr( std::is_same_v )
     {
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h
new file mode 100644
index 0000000000..432c6e06b0
--- /dev/null
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h
@@ -0,0 +1,80 @@
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include
+#include
+#include "gpu_abstraction.h"
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef __CUDACC__ /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != gpuSuccess )
+  {
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+namespace mg5amcGpu
+{
+  // Instantiate a CudaRuntime at the beginning of the application's main to
+  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct GpuRuntime final
+  {
+    GpuRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+    }
+
+    // Tear down CUDA application (call cudaDeviceReset)
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck
+    // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
+    static void tearDown( const bool debug = true )
+    {
+      if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl;
+      checkGpu( gpuDeviceReset() );
+    }
+  };
+}
+#endif
+
+//--------------------------------------------------------------------------
+
+#endif // MG5AMC_GPURUNTIME_H
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc
index e5054db077..c6c054efd4 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc
@@ -1,8 +1,7 @@
 #include "MatrixElementKernels.h"
 
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
-#include "gpu_abstraction.h"
+#include "GpuRuntime.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryBuffers.h"
 
@@ -198,11 +197,11 @@ namespace mg5amcGpu
       PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
       DeviceBufferHelicityMask devIsGoodHel( ncomb );
       // ... 0d1. Compute good helicity mask on the device
-      computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+      gpuLaunchKernel(computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+      gpuLaunchKernel(sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-      sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+      gpuLaunchKernel(sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
      gpuPeekAtLastError();
      // ... 0d2.
Copy back good helicity mask to the host
@@ -215,16 +214,16 @@ namespace mg5amcGpu
 
   void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
   {
-      computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+      gpuLaunchKernel(computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
     constexpr unsigned int sharedMemSize = 0;
 #else
     constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+      gpuLaunchKernelSharedMem(sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-      sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+      gpuLaunchKernelSharedMem(sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
    gpuPeekAtLastError();
    gpuDeviceSynchronize();
  }
 
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h
index cfa4eae7b4..cda8400f10 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h
@@ -5,7 +5,7 @@
 
 #include "mgOnGpuCxtypes.h"
 
-#include "gpu_abstraction.h"
+#include "GpuRuntime.h"
 #include "Parameters_sm.h"
 
 #include
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
index b76f4bd8a2..ea2f3dcbb0 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
@@ -10,7 +10,7 @@
 #include "mgOnGpuConfig.h"
 
 #include "HelAmps_sm.h"
-#include "gpu_abstraction.h" // for GPU abstraction, checkCuda is run on macros defined here
+#include "GpuRuntime.h" // for GPU abstraction, checkCuda is run on macros defined here
 #include "MemoryAccessAmplitudes.h"
 #include "MemoryAccessCouplings.h"
 #include "MemoryAccessCouplingsFixed.h"
@@ -40,7 +40,7 @@
 // Class member functions for calculating the matrix elements for
 // Process: e+ e- > mu+ mu- WEIGHTED<=4 @1
 
-#ifdef __CUDACC__ 
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuRuntime.h
new file mode 120000
index 0000000000..3920e83be4
--- /dev/null
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuRuntime.h
@@ -0,0 +1 @@
+../GpuRuntime.h
\ No newline at end of file
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc
index 41367fd70b..4abcfbdc6e 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc
@@ -2,6 +2,7 @@
 
 #include "BridgeKernels.h"
 #include "CPPProcess.h"
+#include "GpuRuntime.h"
 #include
"CrossSectionKernels.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" @@ -257,11 +258,11 @@ main( int argc, char** argv ) #ifdef __CUDACC__ // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // Instantiate a GpuRuntime (CUDA or HIP based on target arch) at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor const std::string cdinKey = "00 CudaInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a. Initialise physics process diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc index ed2e042427..14d29ece76 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,6 +1,6 @@ #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -146,7 +146,7 @@ namespace mg5amcCpu void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel(getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif @@ -170,7 +170,7 @@ namespace mg5amcCpu void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel(getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.cc index eb8bc09ea9..57f660911f 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.cc @@ -1,7 +1,7 @@ #include "RandomNumberKernels.h" #include "CommonRandomNumbers.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc index 9c9287e0c5..1f914ee143 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc @@ -1,6 +1,6 @@ #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -42,7 +42,7 @@ extern "C" void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { #ifdef __CUDACC__ - CudaRuntime::setUp(); + GpuRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? 
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/gpu_abstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/gpu_abstraction.h
index 1f944b7397..f534a5c5d5 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/gpu_abstraction.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/gpu_abstraction.h
@@ -1,46 +1,72 @@
 // gpu_abstraction.h
 #pragma once
 
+#include
+#include
+
 #ifdef __CUDACC__
-    // NVIDIA GPU using CUDA
-    #include
 
-    #define gpuError_t cudaError_t
-    #define gpuPeekAtLastError cudaPeekAtLastError
+    // Defines correct compiler
+    #define __CUDACC__ __CUDACC__
+
+    //--------------------------------------------------------------------------
+
+    #define gpuError_t cudaError_t
+    #define gpuPeekAtLastError cudaPeekAtLastError
+    #define gpuGetErrorString cudaGetErrorString
+    #define gpuSuccess cudaSuccess
+
+    #define gpuMallocHost(ptr, size) checkGpu( cudaMallocHost(ptr, size) )
+    #define gpuMalloc(ptr, size) checkGpu( cudaMalloc(ptr, size) )
+
+    #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( cudaMemcpy(dstData, srcData, srcBytes, func) )
+    #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+    #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+    #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( cudaMemcpyToSymbol(type1, type2, size) )
+
+    #define gpuFree(ptr) checkGpu( cudaFree(ptr) )
+    #define gpuFreeHost(ptr) checkGpu( cudaFreeHost(ptr) )
+
+    #define gpuSetDevice cudaSetDevice
+    #define gpuDeviceSynchronize cudaDeviceSynchronize
+    #define gpuDeviceReset cudaDeviceReset
+
+    #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__)
+    #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__)
+
+//--------------------------------------------------------------------------
 
-    #define gpuMallocHost(ptr, size) checkCuda( cudaMallocHost(ptr, size) )
-    #define gpuMalloc(ptr, size) checkCuda( cudaMalloc(ptr, size) )
+#elif defined(__HIPCC__)
 
-    #define gpuMemcpy(dstData, srcData, srcBytes, func) checkCuda( cudaMemcpy(dstData, srcData, srcBytes, func) )
-    #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
-    #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
-    #define gpuMemcpyToSymbol(type1, type2, size) checkCuda( cudaMemcpyToSymbol(type1, type2, size) )
+    // Defines correct compiler
+    #define __CUDACC__ __HIPCC__
 
-    #define gpuFree(ptr) checkCuda( cudaFree(ptr) )
-    #define gpuFreeHost(ptr) checkCuda( cudaFreeHost(ptr) )
+    //--------------------------------------------------------------------------
 
-    #define gpuDeviceSynchronize cudaDeviceSynchronize
-    #define gpuDeviceReset cudaDeviceReset
+    #define gpuError_t hipError_t
+    #define gpuPeekAtLastError hipPeekAtLastError
+    #define gpuGetErrorString hipGetErrorString
+    #define gpuSuccess hipSuccess
 
-#elif defined(__HIP__)
-    // AMD GPU using HIP
-    #include
+    #define gpuMallocHost(ptr, size) checkGpu( hipHostMalloc(ptr, size) ) // HostMalloc better
+    #define gpuMalloc(ptr, size) checkGpu( hipMalloc(ptr, size) )
 
-    #define gpuError_t hipError_t
-    #define gpuPeekAtLastError hipPeekAtLastError
+    #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( hipMemcpy(dstData, srcData, srcBytes, func) )
+    #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+    #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+    #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( hipMemcpyToSymbol(type1, type2, size) )
 
-    #define gpuMallocHost(ptr, size) checkHip( hipMallocHost(ptr, size) )
-    #define gpuMalloc(ptr, size) checkHip( hipMalloc(ptr, size) )
+    #define gpuFree(ptr) checkGpu(
hipFree(ptr) )
+    #define gpuFreeHost(ptr) checkGpu( hipFreeHost(ptr) )
 
-    #define gpuMemcpy(dstData, srcData, srcBytes, func) checkHip( hipMemcpy(dstData, srcData, srcBytes, func) )
-    #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
-    #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
-    #define gpuMemcpyToSymbol(type1, type2, size) checkHip( hipMemcpyToSymbol(type1, type2, size) )
+    #define gpuSetDevice hipSetDevice
+    #define gpuDeviceSynchronize hipDeviceSynchronize
+    #define gpuDeviceReset hipDeviceReset
 
-    #define gpuFree(ptr) checkHip( hipFree(ptr) )
-    #define gpuFreeHost(ptr) checkHip( hipFreeHost(ptr) )
+    #define gpuLaunchKernel(kernel, blocks, threads, ...) \
+        hipLaunchKernelGGL(kernel, blocks, threads, 0, 0, __VA_ARGS__)
 
-    #define gpuDeviceSynchronize hipDeviceSynchronize
-    #define gpuDeviceReset hipDeviceReset
+    #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMemSize, ...) \
+        hipLaunchKernelGGL(kernel, blocks, threads, sharedMemSize, 0, __VA_ARGS__)
 
 #endif
\ No newline at end of file

From 39836e07219f566c2037a557bc7fd08a6a3e5ff3 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Wed, 31 May 2023 15:39:42 +0200
Subject: [PATCH 246/509] Added macro for __CUDACC__

---
 .../cudacpp/ee_mumu.mad/SubProcesses/Bridge.h | 24 +++---
 .../ee_mumu.mad/SubProcesses/BridgeKernels.cc |  6 +-
 .../ee_mumu.mad/SubProcesses/BridgeKernels.h  |  6 +-
 .../SubProcesses/CrossSectionKernels.cc       |  4 +-
 .../SubProcesses/CrossSectionKernels.h        |  4 +-
 .../ee_mumu.mad/SubProcesses/CudaRuntime.h    |  4 +-
 .../SubProcesses/EventStatistics.h            |  2 +-
 .../ee_mumu.mad/SubProcesses/GpuRuntime.h     |  4 +-
 .../ee_mumu.mad/SubProcesses/MadgraphTest.h   |  4 +-
 .../SubProcesses/MatrixElementKernels.cc      |  4 +-
 .../SubProcesses/MatrixElementKernels.h       |  6 +-
 .../SubProcesses/MemoryAccessHelpers.h        |  2 +-
 .../SubProcesses/MemoryAccessMomenta.h        |  2 +-
 .../SubProcesses/MemoryAccessVectors.h        |  2 +-
 .../ee_mumu.mad/SubProcesses/MemoryBuffers.h  | 48 ++++++------
 .../SubProcesses/P1_epem_mupmum/CPPProcess.cc | 50 ++++++-------
 .../SubProcesses/P1_epem_mupmum/CPPProcess.h  |  8 +-
 .../SubProcesses/P1_epem_mupmum/check_sa.cc   | 74 +++++++++----------
 .../SubProcesses/RamboSamplingKernels.cc      | 12 +--
 .../SubProcesses/RamboSamplingKernels.h       |  4 +-
 .../SubProcesses/RandomNumberKernels.cc       |  8 +-
 .../SubProcesses/RandomNumberKernels.h        |  4 +-
 .../ee_mumu.mad/SubProcesses/fbridge.cc       |  8 +-
 .../ee_mumu.mad/SubProcesses/fsampler.cc      |  6 +-
 .../SubProcesses/gpu_abstraction.h            |  6 +-
 .../ee_mumu.mad/SubProcesses/runTest.cc       |  8 +-
 .../ee_mumu.mad/SubProcesses/testmisc.cc      |  2 +-
 .../ee_mumu.mad/SubProcesses/testxxx.cc       |  6 +-
 28 files changed, 159 insertions(+), 159 deletions(-)

diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h
index 18786c0a1a..0f1498ed9e 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h
@@ -19,7 +19,7 @@
 #include
 #include
 
-#ifdef __CUDACC__
+#ifdef __GPUCC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -79,7 +79,7 @@
   Bridge& operator=( const Bridge& ) = delete;
   Bridge& operator=( Bridge&& ) = delete;
 
-#ifdef __CUDACC__
+#ifdef __GPUCC__
   /**
    * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads
    * (this is needed for BridgeKernel tests rather than for actual production use in Fortran)
@@ -146,7 +146,7 @@
  unsigned int m_nevt; // number of events
  int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been
calculated) -#ifdef __CUDACC__ +#ifdef __GPUCC__ int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -183,12 +183,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef __GPUCC__ template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // __GPUCC__ template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -205,7 +205,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef __GPUCC__ , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -229,7 +229,7 @@ namespace mg5amcCpu { if( nparF != mgOnGpu::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != mgOnGpu::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef __GPUCC__ if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -247,11 +247,11 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // __GPUCC__ process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __CUDACC__ +#ifdef __GPUCC__ template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -265,7 +265,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef __GPUCC__ template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -330,7 +330,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef __GPUCC__ template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -385,7 +385,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef __GPUCC__ template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc index c2c16ff038..e1aad45df5 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc @@ -9,7 +9,7 @@ using mgOnGpu::np4; // the number of dimensions of 4-momenta (E,px,py,pz) //============================================================================ -#ifdef __CUDACC__ +#ifdef __GPUCC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef __GPUCC__ namespace mg5amcCpu { @@ -91,7 +91,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef __GPUCC__ namespace 
mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h index 10e664a4c4..ef7772e192 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h @@ -7,7 +7,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef __GPUCC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -44,7 +44,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef __GPUCC__ // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -84,7 +84,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef __GPUCC__ // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc index 398f8a87bd..998c5b76f1 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc @@ -72,7 +72,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef __GPUCC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -180,7 +180,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef __GPUCC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h index 6098157b4e..e2acaa6e12 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h @@ -8,7 +8,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef __GPUCC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -91,7 +91,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef __GPUCC__ // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h index e16ed2c703..f23a3d9e0f 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h @@ -10,7 +10,7 @@ //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ +#ifdef __GPUCC__ /* clang-format off */ #define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) { @@ -24,7 +24,7 @@ inline void assertCuda( cudaError_t code, const char* file, int line, bool abort //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef __GPUCC__ namespace mg5amcGpu { // Instantiate a CudaRuntime at the beginnining of the application's main to 
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h index 19c5199bcc..d35f42f326 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h @@ -9,7 +9,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef __GPUCC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h index 432c6e06b0..b80cbbdba8 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h @@ -11,7 +11,7 @@ //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ +#ifdef __GPUCC__ /* clang-format off */ #define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) { @@ -25,7 +25,7 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef __GPUCC__ namespace mg5amcGpu { // Instantiate a CudaRuntime at the beginnining of the application's main to diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h index 2a0be47978..2b9806caf2 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h @@ -186,7 +186,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef __GPUCC__ /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -295,6 +295,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // __GPUCC__ #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index c6c054efd4..d01984b74d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -9,7 +9,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef __GPUCC__ namespace mg5amcCpu { @@ -138,7 +138,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef __GPUCC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h index ec0fc9b18c..2d39d339ec 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h @@ -5,7 +5,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef __GPUCC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -76,7 +76,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef __GPUCC__ // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -125,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef __GPUCC__ // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h index aa3016c9a1..d173eb81a1 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h @@ -100,7 +100,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef __GPUCC__ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h index ace50b40e8..c31062bd30 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h @@ -17,7 +17,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef __GPUCC__ /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git 
a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h index 2697cdad52..a3355f0852 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h @@ -5,7 +5,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef __GPUCC__ namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h index cda8400f10..a3c5bdd861 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h @@ -10,7 +10,7 @@ #include -#ifdef __CUDACC__ +#ifdef __GPUCC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +80,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef __GPUCC__ constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -112,7 +112,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef __GPUCC__ // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -132,7 +132,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef __GPUCC__ // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -152,7 +152,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef __GPUCC__ // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -168,7 +168,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef __GPUCC__ // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -184,7 +184,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef __GPUCC__ // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -206,7 +206,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef __GPUCC__ // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -225,7 +225,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef __GPUCC__ // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -250,7 +250,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef __GPUCC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer 
HostBufferGs; #else @@ -269,7 +269,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef __GPUCC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -289,7 +289,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef __GPUCC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -308,7 +308,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef __GPUCC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -326,7 +326,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef __GPUCC__ // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -345,7 +345,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef __GPUCC__ // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -363,7 +363,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef __GPUCC__ // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -378,7 +378,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef __GPUCC__ // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -396,7 +396,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef __GPUCC__ // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -414,7 +414,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef __GPUCC__ // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -432,7 +432,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef __GPUCC__ // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -450,7 +450,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef __GPUCC__ // A class 
encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -468,7 +468,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef __GPUCC__ // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -480,7 +480,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef __GPUCC__ template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -503,7 +503,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef __GPUCC__ template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index ea2f3dcbb0..83eb71ca36 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -40,7 +40,7 @@ // Class member functions for calculating the matrix elements for // Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef __GPUCC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -74,7 +74,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MZ, (fptype)Parameters_sm::mdl_WZ }; __device__ const fptype cIPC[6] = { (fptype)Parameters_sm::GC_3.real(), (fptype)Parameters_sm::GC_3.imag(), (fptype)Parameters_sm::GC_50.real(), (fptype)Parameters_sm::GC_50.imag(), (fptype)Parameters_sm::GC_59.real(), (fptype)Parameters_sm::GC_59.imag() }; #else -#ifdef __CUDACC__ +#ifdef __GPUCC__ __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype cIPC[6]; #else @@ -84,7 +84,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef __GPUCC__ __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -112,13 +112,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef __GPUCC__ , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef __GPUCC__ using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -145,7 +145,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef __GPUCC__ //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -177,12 +177,12 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef __GPUCC__ const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t 
nxcoup = ndcoup + nicoup; // both dependent and independent couplings const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ +#ifdef __GPUCC__ #pragma nv_diagnostic push #pragma nv_diag_suppress 186 // e.g. <> #endif @@ -190,7 +190,7 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef __CUDACC__ +#ifdef __GPUCC__ #pragma nv_diagnostic pop // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; @@ -228,7 +228,7 @@ namespace mg5amcCpu // *** DIAGRAM 1 OF 2 *** // Wavefunction(s) for diagram number 1 -#if not( defined __CUDACC__ and defined MGONGPU_TEST_DIVERGENCE ) +#if not( defined __GPUCC__ and defined MGONGPU_TEST_DIVERGENCE ) opzxxx( momenta, cHel[ihel][0], -1, w_fp[0], 0 ); // NB: opzxxx only uses pz #else if( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 == 0 ) @@ -283,7 +283,7 @@ namespace mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] -#ifndef __CUDACC__ +#ifndef __GPUCC__ // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -340,7 +340,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef __GPUCC__ // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -399,7 +399,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef __GPUCC__ if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -446,7 +446,7 @@ namespace mg5amcCpu { 1, -1, 1, 1 }, { 1, -1, -1, -1 }, { 1, -1, -1, 1 } }; -#ifdef __CUDACC__ +#ifdef __GPUCC__ gpuMemcpyToSymbol( cHel, tHel, ncomb * mgOnGpu::npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * mgOnGpu::npar * sizeof( short ) ); @@ -487,7 +487,7 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; const cxtype tIPC[3] = { cxmake( m_pars->GC_3 ), cxmake( m_pars->GC_50 ), cxmake( m_pars->GC_59 ) }; -#ifdef __CUDACC__ +#ifdef __GPUCC__ gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); gpuMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ); #else @@ -526,7 +526,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of __GPUCC__ here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -591,12 +591,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef __GPUCC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef __GPUCC__ using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -617,7 +617,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef __GPUCC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -746,7 +746,7 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ +#ifdef __GPUCC__ gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else @@ -772,7 +772,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef __GPUCC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -793,7 +793,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef __GPUCC__ // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -809,7 +809,7 @@ namespace mg5amcCpu // Start sigmaKin_lines // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef __GPUCC__ allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -837,7 +837,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef __GPUCC__ // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1041,7 +1041,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef __GPUCC__ allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index 1f57da2a58..f967b79a48 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ 
b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -18,7 +18,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef __GPUCC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -85,7 +85,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef __GPUCC__ __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -98,7 +98,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef __GPUCC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -128,7 +128,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef __GPUCC__ /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc index 4abcfbdc6e..1d1c759f31 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc @@ -55,7 +55,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef __GPUCC__ #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -69,7 +69,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef __GPUCC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -94,7 +94,7 @@ main( int argc, char** argv ) CurandHost = 1, CurandDevice = 2 }; -#ifdef __CUDACC__ +#ifdef __GPUCC__ RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand @@ -107,7 +107,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef __GPUCC__ RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -137,7 +137,7 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef __CUDACC__ +#ifdef __GPUCC__ rndgen = RandomNumberMode::CurandDevice; #else throw std::runtime_error( "CurandDevice is not supported on CPUs" ); @@ -157,7 +157,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef __GPUCC__ rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -231,13 +231,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef __GPUCC__ #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 
1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef __GPUCC__ // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -255,7 +255,7 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef __GPUCC__ // --- 00. Initialise cuda // Instantiate a GpuRuntime (CUDA or HIP based on target arch) at the beginnining of the application's main to @@ -284,7 +284,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef __GPUCC__ HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -292,7 +292,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef __GPUCC__ HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -300,7 +300,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef __GPUCC__ HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -308,7 +308,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef __GPUCC__ HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -325,7 +325,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef __GPUCC__ HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -334,7 +334,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef __GPUCC__ HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -343,7 +343,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef __GPUCC__ HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -351,7 +351,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef __GPUCC__ HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -359,7 +359,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef __GPUCC__ HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -386,7 +386,7 @@ main( int argc, char** argv ) const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); } -#ifdef __CUDACC__ +#ifdef __GPUCC__ else { const bool onDevice = true; @@ -413,7 +413,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef __GPUCC__ prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) 
@@ -424,7 +424,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef __GPUCC__ pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -432,7 +432,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef __GPUCC__ pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -474,7 +474,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef __GPUCC__ if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -506,7 +506,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef __GPUCC__ if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -551,7 +551,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef __GPUCC__ // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -580,7 +580,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef __GPUCC__ if( !bridge ) { // --- 3b. CopyDToH MEs @@ -721,7 +721,7 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; -#ifdef __CUDACC__ +#ifdef __GPUCC__ rndgentxt += " (CUDA code)"; #else rndgentxt += " (C++ code)"; @@ -730,7 +730,7 @@ main( int argc, char** argv ) // Workflow description summary std::string wrkflwtxt; // -- CUDA or C++? -#ifdef __CUDACC__ +#ifdef __GPUCC__ wrkflwtxt += "CUD:"; #else wrkflwtxt += "CPP:"; @@ -746,7 +746,7 @@ main( int argc, char** argv ) wrkflwtxt += "???+"; // no path to this statement #endif // -- CUCOMPLEX or THRUST or STD complex numbers? -#ifdef __CUDACC__ +#ifdef __GPUCC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; #elif defined MGONGPU_CUCXTYPE_THRUST @@ -781,7 +781,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef __GPUCC__ // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? 
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -837,7 +837,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef __GPUCC__ #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -856,7 +856,7 @@ main( int argc, char** argv ) #endif // Dump all configuration parameters and all results std::cout << std::string( SEP79, '*' ) << std::endl -#ifdef __CUDACC__ +#ifdef __GPUCC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" @@ -884,7 +884,7 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ +#ifdef __GPUCC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST @@ -898,7 +898,7 @@ main( int argc, char** argv ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef __GPUCC__ //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -929,7 +929,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef __GPUCC__ #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1025,7 +1025,7 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ +#ifdef __GPUCC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST @@ -1040,7 +1040,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef __GPUCC__ //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc index 14d29ece76..5fb3396fb6 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc @@ -9,7 +9,7 @@ #include -#ifdef __CUDACC__ +#ifdef __GPUCC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef __GPUCC__ RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef __GPUCC__ __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -142,7 +142,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef __GPUCC__ void RamboSamplingKernelDevice::getMomentaInitial() { @@ -152,7 +152,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef __GPUCC__ __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -166,7 +166,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef __GPUCC__ void RamboSamplingKernelDevice::getMomentaFinal() { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h index f40433af4a..0979de34ca 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h @@ -5,7 +5,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef __GPUCC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -88,7 +88,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef __GPUCC__ // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.cc index 57f660911f..d06dee9100 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.cc @@ -18,7 +18,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef __GPUCC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -51,7 +51,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef __GPUCC__ if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -129,7 +129,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef 
__GPUCC__ if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -138,7 +138,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef __GPUCC__ if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h index 4d55f3d449..36855b730c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h @@ -3,14 +3,14 @@ #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __GPUCC__ is not defined #ifndef MGONGPU_HAS_NO_CURAND #include "curand.h" #endif #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef __GPUCC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc index 1f914ee143..eba81d91c1 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc @@ -17,7 +17,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef __GPUCC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -41,7 +41,7 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ +#ifdef __GPUCC__ GpuRuntime::setUp(); #endif // Create a process object, read parm card and set parameters @@ -64,7 +64,7 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ +#ifdef __GPUCC__ CudaRuntime::tearDown(); #endif } @@ -95,7 +95,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef __GPUCC__ // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc index bc90937f47..8f6f8e3414 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc @@ -7,7 +7,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef __GPUCC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -34,7 +34,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef __GPUCC__ HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights 
m_hstWeights; // Memory buffers for sampling weights @@ -99,7 +99,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef __GPUCC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/gpu_abstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/gpu_abstraction.h index f534a5c5d5..cd62acfbf1 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/gpu_abstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/gpu_abstraction.h @@ -4,10 +4,10 @@ #include #include -#ifdef __CUDACC__ +#ifdef __GPUCC__ // Defines correct compiler - #define __CUDACC__ __CUDACC__ + #define __GPUCC__ __GPUCC__ //-------------------------------------------------------------------------- @@ -39,7 +39,7 @@ #elif defined(__HIPCC__) // Defines correct compiler - #define __CUDACC__ __HIPCC__ + #define __GPUCC__ __HIPCC__ //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc index a0d551d5d5..78c0b18a9c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc @@ -11,7 +11,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef __GPUCC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -28,7 +28,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef __GPUCC__ struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -110,7 +110,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef __GPUCC__ struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. 
Note that this should happen after @@ -245,7 +245,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef __GPUCC__ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc index 5fa8ac70fe..bb8a464ce5 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc @@ -11,7 +11,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef __GPUCC__ #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc index 97dc15d0f1..9ea9aad41d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc @@ -15,7 +15,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef __GPUCC__ #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -37,7 +37,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef __GPUCC__ mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -224,7 +224,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef __GPUCC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; From 98d02bbcdf5338b3fe676bb456587ef557f49406 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 1 Jun 2023 09:48:14 +0200 Subject: [PATCH 247/509] Changed name of gpu_abstraction to fall inline with naming scheme --- .../{gpu_abstraction.h => GpuAbstraction.h} | 10 ++++------ .../SubProcesses/P1_epem_mupmum/GpuAbstraction.h | 1 + .../SubProcesses/P1_epem_mupmum/gpu_abstraction.h | 1 - 3 files changed, 5 insertions(+), 7 deletions(-) rename epochX/cudacpp/ee_mumu.mad/SubProcesses/{gpu_abstraction.h => GpuAbstraction.h} (95%) create mode 120000 epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuAbstraction.h delete mode 120000 epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/gpu_abstraction.h diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/gpu_abstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h similarity index 95% rename from epochX/cudacpp/ee_mumu.mad/SubProcesses/gpu_abstraction.h rename to epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index cd62acfbf1..67652fae6c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/gpu_abstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -1,13 +1,11 @@ -// gpu_abstraction.h -#pragma once +// GpuAbstraction.h #include -#include -#ifdef __GPUCC__ +#ifdef __CUDACC__ // Defines correct compiler - #define __GPUCC__ __GPUCC__ + #define MGONGPUCPP_GPUIMPL __CUDACC__ //-------------------------------------------------------------------------- @@ -39,7 +37,7 @@ #elif defined(__HIPCC__) // Defines correct compiler - #define __GPUCC__ __HIPCC__ + #define MGONGPUCPP_GPUIMPL __HIPCC__ 
//-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/gpu_abstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/gpu_abstraction.h deleted file mode 120000 index 1a79490b1a..0000000000 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/gpu_abstraction.h +++ /dev/null @@ -1 +0,0 @@ -../gpu_abstraction.h \ No newline at end of file From ecee14d44296039022a1b38bbd42dcc4bd2d9734 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 1 Jun 2023 10:01:39 +0200 Subject: [PATCH 248/509] Changed name of GPUCC macro to MGONGPUCPP_GPUIMPL also did some cleanup --- .../cudacpp/ee_mumu.mad/SubProcesses/Bridge.h | 26 +++---- .../ee_mumu.mad/SubProcesses/BridgeKernels.cc | 8 +- .../ee_mumu.mad/SubProcesses/BridgeKernels.h | 7 +- .../SubProcesses/CrossSectionKernels.cc | 6 +- .../SubProcesses/CrossSectionKernels.h | 5 +- .../ee_mumu.mad/SubProcesses/CudaRuntime.h | 4 +- .../SubProcesses/EventStatistics.h | 4 +- .../ee_mumu.mad/SubProcesses/GpuRuntime.h | 8 +- .../ee_mumu.mad/SubProcesses/MadgraphTest.h | 6 +- .../SubProcesses/MatrixElementKernels.cc | 4 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../ee_mumu.mad/SubProcesses/MemoryBuffers.h | 48 ++++++------ .../SubProcesses/P1_epem_mupmum/CPPProcess.cc | 50 ++++++------- .../SubProcesses/P1_epem_mupmum/CPPProcess.h | 9 ++- .../SubProcesses/P1_epem_mupmum/check_sa.cc | 74 +++++++++---------- .../SubProcesses/RamboSamplingKernels.cc | 12 +-- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.cc | 8 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../ee_mumu.mad/SubProcesses/fbridge.cc | 12 +-- .../ee_mumu.mad/SubProcesses/fsampler.cc | 7 +- .../ee_mumu.mad/SubProcesses/runTest.cc | 10 +-- .../ee_mumu.mad/SubProcesses/testmisc.cc | 4 +- .../ee_mumu.mad/SubProcesses/testxxx.cc | 7 +- 27 files changed, 189 insertions(+), 162 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index 0f1498ed9e..919b0c86dc 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -6,7 +6,7 @@ #include "CPPProcess.h" // for CPPProcess #include "CrossSectionKernels.h" // for flagAbnormalMEs -#include "GpuRuntime.h" // for GPU abstraction, checkCuda is run here +#include "GpuRuntime.h" // for CUDA/HIP runtime, also includes GPU abstraction #include "MatrixElementKernels.h" // for MatrixElementKernelHost, MatrixElementKernelDevice #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc @@ -19,7 +19,7 @@ #include #include -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -79,7 +79,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for 
the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -146,7 +146,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -183,12 +183,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __GPUCC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -205,7 +205,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -229,7 +229,7 @@ namespace mg5amcCpu { if( nparF != mgOnGpu::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != mgOnGpu::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -247,11 +247,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __GPUCC__ +#endif // MGONGPUCPP_GPUIMPL process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -265,7 +265,7 @@ namespace mg5amcCpu } #endif -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -330,7 +330,7 @@ namespace mg5amcCpu } #endif -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -385,7 +385,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc index e1aad45df5..8c12ce855a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc @@ -1,5 +1,7 @@ #include "BridgeKernels.h" +#include "GpuAbstraction.h" + #include "MemoryAccessMomenta.h" #include @@ -9,7 +11,7 @@ using mgOnGpu::np4; // the number of dimensions of 4-momenta (E,px,py,pz) //============================================================================ -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +42,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -91,7 +93,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h index ef7772e192..1d6dcc3561 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h @@ -4,10 +4,11 @@ #include "mgOnGpuConfig.h" #include "Bridge.h" +#include "GpuAbstraction.h" #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -44,7 +45,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -84,7 +85,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc index 998c5b76f1..4311024d50 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc 
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc @@ -1,5 +1,7 @@ #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" + #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -72,7 +74,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -180,7 +182,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h index e2acaa6e12..ffda71a95f 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h @@ -2,13 +2,14 @@ #define CROSSSECTIONKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "GpuAbstraction.h" #include "EventStatistics.h" #include "MemoryBuffers.h" //============================================================================ -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -91,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h index f23a3d9e0f..79ef4e6c13 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h @@ -10,7 +10,7 @@ //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __GPUCC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ #define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) { @@ -24,7 +24,7 @@ inline void assertCuda( cudaError_t code, const char* file, int line, bool abort //-------------------------------------------------------------------------- -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { // Instantiate a CudaRuntime at the beginnining of the application's main to diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h index d35f42f326..02cec2983c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h @@ -3,13 +3,15 @@ #include "mgOnGpuConfig.h" // for npar (meGeVexponent) +#include "GpuAbstraction.h" + #include #include #include #include #include -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h index b80cbbdba8..a53a2c2d8d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h @@ -4,14 +4,14 @@ // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API // 
See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api -#include +#include "GpuAbstraction.h" + #include -#include "gpu_abstraction.h" //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __GPUCC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ #define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) { @@ -25,7 +25,7 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { // Instantiate a CudaRuntime at the beginnining of the application's main to diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h index 2b9806caf2..f4c9a7e437 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h @@ -5,6 +5,8 @@ #include #include +#include "GpuAbstraction.h" + #include #include #include @@ -186,7 +188,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -295,6 +297,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __GPUCC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index d01984b74d..1aa95591a1 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -9,7 +9,7 @@ //============================================================================ -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -138,7 +138,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h index 2d39d339ec..6c95e04a7d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h @@ -5,7 +5,9 @@ #include "MemoryBuffers.h" -#ifdef __GPUCC__ +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -76,7 +78,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -125,7 +127,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public 
MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h index d173eb81a1..9dec82e487 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h @@ -5,6 +5,8 @@ #include "mgOnGpuFptypes.h" +#include "GpuAbstraction.h" + //---------------------------------------------------------------------------- // A templated helper class that includes the boilerplate code for MemoryAccess classes @@ -100,7 +102,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h index c31062bd30..855524b85e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h @@ -3,6 +3,8 @@ #include "mgOnGpuConfig.h" +#include "GpuAbstraction.h" + #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" @@ -17,7 +19,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __GPUCC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h index a3355f0852..e15d5b8680 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h @@ -5,7 +5,9 @@ #include "mgOnGpuVectors.h" -#ifndef __GPUCC__ +#include "GpuAbstraction.h" + +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h index a3c5bdd861..2c8336c51c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h @@ -10,7 +10,7 @@ #include -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +80,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -112,7 +112,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -132,7 +132,7 @@ namespace mg5amcCpu 
//-------------------------------------------------------------------------- -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -152,7 +152,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -168,7 +168,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -184,7 +184,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -206,7 +206,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -225,7 +225,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -250,7 +250,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -269,7 +269,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -289,7 +289,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -308,7 +308,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -326,7 +326,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
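// To see why each buffer gets this #ifndef/#else typedef pair: client code
// (for example the allocations in check_sa.cc further down this patch) names
// a single type per physics quantity and lets the build select the backend.
// A minimal sketch of that usage, with nevt standing for the event count:
#ifndef MGONGPUCPP_GPUIMPL
  HostBufferMomenta hstMomenta( nevt );       // plain aligned host memory (C++ builds)
#else
  PinnedHostBufferMomenta hstMomenta( nevt ); // page-locked host memory for faster copies
  DeviceBufferMomenta devMomenta( nevt );     // buffer in GPU global memory
#endif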
@@ -345,7 +345,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -363,7 +363,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -378,7 +378,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -396,7 +396,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -414,7 +414,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -432,7 +432,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -450,7 +450,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -468,7 +468,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -480,7 +480,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -503,7 +503,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 83eb71ca36..37beaa5a34 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ 
b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -40,7 +40,7 @@ // Class member functions for calculating the matrix elements for // Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -74,7 +74,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MZ, (fptype)Parameters_sm::mdl_WZ }; __device__ const fptype cIPC[6] = { (fptype)Parameters_sm::GC_3.real(), (fptype)Parameters_sm::GC_3.imag(), (fptype)Parameters_sm::GC_50.real(), (fptype)Parameters_sm::GC_50.imag(), (fptype)Parameters_sm::GC_59.real(), (fptype)Parameters_sm::GC_59.imag() }; #else -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype cIPC[6]; #else @@ -84,7 +84,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -112,13 +112,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -145,7 +145,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -177,12 +177,12 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings const fptype* allCOUPs[nxcoup]; -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 186 // e.g. 
<> #endif @@ -190,7 +190,7 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic pop // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; @@ -228,7 +228,7 @@ namespace mg5amcCpu // *** DIAGRAM 1 OF 2 *** // Wavefunction(s) for diagram number 1 -#if not( defined __GPUCC__ and defined MGONGPU_TEST_DIVERGENCE ) +#if not( defined MGONGPUCPP_GPUIMPL and defined MGONGPU_TEST_DIVERGENCE ) opzxxx( momenta, cHel[ihel][0], -1, w_fp[0], 0 ); // NB: opzxxx only uses pz #else if( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 == 0 ) @@ -283,7 +283,7 @@ namespace mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -340,7 +340,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -399,7 +399,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -446,7 +446,7 @@ namespace mg5amcCpu { 1, -1, 1, 1 }, { 1, -1, -1, -1 }, { 1, -1, -1, 1 } }; -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL gpuMemcpyToSymbol( cHel, tHel, ncomb * mgOnGpu::npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * mgOnGpu::npar * sizeof( short ) ); @@ -487,7 +487,7 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; const cxtype tIPC[3] = { cxmake( m_pars->GC_3 ), cxmake( m_pars->GC_50 ), cxmake( m_pars->GC_59 ) }; -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); gpuMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ); #else @@ -526,7 +526,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __GPUCC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -591,12 +591,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -617,7 +617,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __GPUCC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -746,7 +746,7 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else @@ -772,7 +772,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -793,7 +793,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -809,7 +809,7 @@ namespace mg5amcCpu // Start sigmaKin_lines // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -837,7 +837,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __GPUCC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1041,7 +1041,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index f967b79a48..6279802f70 100644 --- 
a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -9,6 +9,7 @@ #define MG5_Sigma_sm_epem_mupmum_H 1 #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "mgOnGpuVectors.h" @@ -18,7 +19,7 @@ //-------------------------------------------------------------------------- -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -85,7 +86,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -98,7 +99,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __GPUCC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -128,7 +129,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __GPUCC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc index 1d1c759f31..f855a04a18 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc @@ -55,7 +55,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -69,7 +69,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
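// A sketch of the single-source/dual-namespace scheme referred to above
// (this is the existing pattern that the MGONGPUCPP_GPUIMPL rename threads
// through every file, not new code): each translation unit is compiled once
// as plain C++ and once for the GPU, landing in different namespaces so that
// both object files can be linked into one executable without symbol clashes.
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  // ... identical kernel and buffer code, compiled for either backend ...
}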
-#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -94,7 +94,7 @@ main( int argc, char** argv ) CurandHost = 1, CurandDevice = 2 }; -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand @@ -107,7 +107,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -137,7 +137,7 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL rndgen = RandomNumberMode::CurandDevice; #else throw std::runtime_error( "CurandDevice is not supported on CPUs" ); @@ -157,7 +157,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -231,13 +231,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -255,7 +255,7 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 00. 
Initialise cuda // Instantiate a GpuRuntime (CUDA or HIP based on target arch) at the beginnining of the application's main to @@ -284,7 +284,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -292,7 +292,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -300,7 +300,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -308,7 +308,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -325,7 +325,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -334,7 +334,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -343,7 +343,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -351,7 +351,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -359,7 +359,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -386,7 +386,7 @@ main( int argc, char** argv ) const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); } -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL else { const bool onDevice = true; @@ -413,7 +413,7 @@ main( int argc, char** argv ) } else { -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -424,7 +424,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -432,7 +432,7 @@ 
main( int argc, char** argv ) } else { -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -474,7 +474,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -506,7 +506,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -551,7 +551,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -580,7 +580,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -721,7 +721,7 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL rndgentxt += " (CUDA code)"; #else rndgentxt += " (C++ code)"; @@ -730,7 +730,7 @@ main( int argc, char** argv ) // Workflow description summary std::string wrkflwtxt; // -- CUDA or C++? -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL wrkflwtxt += "CUD:"; #else wrkflwtxt += "CPP:"; @@ -746,7 +746,7 @@ main( int argc, char** argv ) wrkflwtxt += "???+"; // no path to this statement #endif // -- CUCOMPLEX or THRUST or STD complex numbers? -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; #elif defined MGONGPU_CUCXTYPE_THRUST @@ -781,7 +781,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -837,7 +837,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -856,7 +856,7 @@ main( int argc, char** argv ) #endif // Dump all configuration parameters and all results std::cout << std::string( SEP79, '*' ) << std::endl -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" @@ -884,7 +884,7 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST @@ -898,7 +898,7 @@ main( int argc, char** argv ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -929,7 +929,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1025,7 +1025,7 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST @@ -1040,7 +1040,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc index 5fb3396fb6..948c56edf2 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc @@ -9,7 +9,7 @@ #include -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -142,7 +142,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { @@ -152,7 +152,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -166,7 +166,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h index 0979de34ca..d5d5e31b6f 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h @@ -5,7 +5,9 @@ #include "MemoryBuffers.h" -#ifdef __GPUCC__ +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -88,7 +90,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public 
SamplingKernelBase, public NumberOfEvents
  {
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.cc
index d06dee9100..2f93d17d69 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.cc
@@ -18,7 +18,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool
 }
 #endif /* clang-format on */

-#ifdef __GPUCC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -51,7 +51,7 @@ namespace mg5amcCpu
   {
     if( m_isOnDevice )
     {
-#ifdef __GPUCC__
+#ifdef MGONGPUCPP_GPUIMPL
       if( !m_rnarray.isOnDevice() )
         throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" );
 #else
@@ -129,7 +129,7 @@ namespace mg5amcCpu
     /*
     printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() );
     fptype* data = m_rnarray.data();
-#ifdef __GPUCC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( m_rnarray.isOnDevice() )
     {
       data = new fptype[m_rnarray.size()]();
@@ -138,7 +138,7 @@ namespace mg5amcCpu
 #endif
     for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ )
       printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3] );
-#ifdef __GPUCC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( m_rnarray.isOnDevice() ) delete[] data;
 #endif
     */
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h
index 36855b730c..f949c9393a 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h
@@ -3,14 +3,16 @@

 #include "mgOnGpuConfig.h"

-// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __GPUCC__ is not defined
+#include "GpuAbstraction.h"
+
+// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined
 #ifndef MGONGPU_HAS_NO_CURAND
 #include "curand.h"
 #endif

 #include "MemoryBuffers.h"

-#ifdef __GPUCC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc
index eba81d91c1..d7c659729f 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc
@@ -1,6 +1,6 @@
 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "GpuRuntime.h"
+#include "GpuAbstraction.h"

 extern "C"
 {
@@ -17,7 +17,7 @@
    * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
    * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -41,7 +41,7 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL GpuRuntime::setUp(); #endif // Create a process object, read parm card and set parameters @@ -64,8 +64,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __GPUCC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -95,7 +95,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc index 8f6f8e3414..fbfc2fd944 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc @@ -2,12 +2,13 @@ #include "Bridge.h" #include "MemoryBuffers.h" +#include "GpuAbstraction.h" #include "RamboSamplingKernels.h" #include "RandomNumberKernels.h" //-------------------------------------------------------------------------- -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -34,7 +35,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -99,7 +100,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc index 78c0b18a9c..aa744d08f8 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ #include "mgOnGpuConfig.h" #include "CPPProcess.h" -#include "gpu_abstraction.h" // for GPU abstraction, checkCuda is run on macros defined here +#include "GpuAbstraction.h" // for CUDA/HIP runtime, also includes GPU abstraction #include "MadgraphTest.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" @@ -11,7 +11,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -28,7 +28,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __GPUCC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -110,7 +110,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public 
CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -245,7 +245,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc index bb8a464ce5..e20b487a79 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc @@ -6,12 +6,14 @@ #include "epoch_process_id.h" +#include "GpuAbstraction.h" + #include #include #include -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc index 9ea9aad41d..726d0df829 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc @@ -1,6 +1,7 @@ #include "mgOnGpuConfig.h" #include "CPPProcess.h" +#include "GpuAbstraction.h" #include "HelAmps_sm.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" @@ -15,7 +16,7 @@ #include #include #include -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -37,7 +38,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -224,7 +225,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __GPUCC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; From 78a8119f543db0811bd6a84e8acbaad63a96ff2f Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 5 Jun 2023 16:15:39 +0200 Subject: [PATCH 249/509] Added GPU abstraction in src directory as well --- .../cudacpp/ee_mumu.mad/src/GpuAbstraction.h | 1 + epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h | 2 +- .../cudacpp/ee_mumu.mad/src/Parameters_sm.h | 6 ++--- .../cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 22 ++++++++++--------- .../cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h | 16 +++++++------- .../cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h | 8 +++---- .../cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h | 16 ++++++++------ epochX/cudacpp/ee_mumu.mad/src/rambo.h | 6 ++--- 8 files changed, 41 insertions(+), 36 deletions(-) create mode 120000 epochX/cudacpp/ee_mumu.mad/src/GpuAbstraction.h diff --git a/epochX/cudacpp/ee_mumu.mad/src/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/src/GpuAbstraction.h new file mode 120000 index 0000000000..4955c9171e --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/src/GpuAbstraction.h @@ -0,0 +1 @@ +../SubProcesses/GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h index c1bde8a1cf..abf42ff744 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h +++ 
b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h @@ -19,7 +19,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h index 6267925958..cf534d617a 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h @@ -203,7 +203,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -227,7 +227,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -245,7 +245,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 058bc2e635..759d9eff31 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -1,3 +1,5 @@ +#include "GpuAbstraction.h" // Includes required macros for GPU abstraction + #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,7 +12,7 @@ // Choose if curand is supported for generating random numbers // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #undef MGONGPU_HAS_NO_CURAND #else //#undef MGONGPU_HAS_NO_CURAND // default @@ -48,20 +50,20 @@ ////#define MGONGPU_HARDCODE_PARAM 1 // Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif // Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) #endif // Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 #endif @@ -80,14 +82,14 @@ #endif // SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL #endif #endif // SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif @@ -142,7 +144,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -153,7 +155,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA implementation has no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -185,7 +187,7 @@ using mgOnGpu::fptype2; // Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined MGONGPUCPP_GPUIMPL && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -198,7 +200,7 @@ using mgOnGpu::fptype2; #endif /* clang-format on */ // Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h index caff927311..fdd9a0d046 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h @@ -14,7 +14,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -196,7 +196,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -276,7 +276,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -312,11 +312,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -531,11 +531,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + 
stdcomplex //------------------------------ // C++ - using std::complex @@ -579,7 +579,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h index b278275f80..b9d623ba21 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h @@ -8,7 +8,7 @@ //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -52,11 +52,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -80,7 +80,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h index 0dd4c69bd4..514437e5b0 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h @@ -4,6 +4,8 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuFptypes.h" +#include "GpuAbstraction.h" // Includes required macros for GPU abstraction + #include //========================================================================== @@ -103,7 +105,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -124,7 +126,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -739,11 +741,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -781,12 +783,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? 
a : b ); } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -807,7 +809,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/ee_mumu.mad/src/rambo.h b/epochX/cudacpp/ee_mumu.mad/src/rambo.h index 929204feff..422daf5653 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/rambo.h +++ b/epochX/cudacpp/ee_mumu.mad/src/rambo.h @@ -7,7 +7,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -72,7 +72,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -155,7 +155,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) From 098219a92854d7c4d341aafdc48a8744fa6ac083 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Tue, 6 Jun 2023 16:37:27 +0200 Subject: [PATCH 250/509] Added some WIP changes to compile with HIP --- .../ee_mumu.mad/SubProcesses/cudacpp.mk | 128 ++++++++++-------- 1 file changed, 69 insertions(+), 59 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 2155495366..bd68b7aa37 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -62,68 +62,76 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). 
- # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CULIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CULIBFLAGS= -endif +ifeq ($(COMPILER),cuda) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "CUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of nvcc + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + NVCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. 
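As a quick aside on the compute capabilities quoted in this comment block (70 for V100, 60 for P100, 80 for A100): they can also be queried at run time. A small standalone CUDA probe, not part of the patch, shown only to illustrate what the -gencode targets correspond to:

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
  cudaDeviceProp prop;
  if( cudaGetDeviceProperties( &prop, 0 ) != cudaSuccess ) return 1; // no visible CUDA device
  // prop.major and prop.minor form the "sm_XY" value matched by -gencode arch=compute_XY (e.g. 7.0 on a V100)
  printf( "device 0: %s, compute capability %d.%d\n", prop.name, prop.major, prop.minor );
  return 0;
}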
+ # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CULIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! + CUOPTFLAGS = -lineinfo + CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override NVCC= + override USE_NVTX= + override CULIBFLAGS= + endif + + # Set the host C++ compiler for nvcc via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + CUFLAGS += -allow-unsupported-compiler + endif + +else ifeq ($(variable),hip) + #=== Configure the HIP compiler -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler + NVCC=/usr/bin/hipcc endif + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -224,7 +232,9 @@ ifeq ($(HRDCOD),) endif # Set the default RNDGEN (random number generator) choice -ifeq ($(NVCC),) +ifeq ($(findstring hip,$(NVCC)),hip) + override RNDGEN = hasNoCurand +else ifeq ($(NVCC),) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand From 11e392d3ba7786e485b436e85137e3d77a46825a Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Tue, 6 Jun 2023 16:57:40 +0200 Subject: [PATCH 251/509] Dont know what happened here --- .../SubProcesses/MemoryAccessMomenta.h | 272 ++++++++++++++++++ 1 file changed, 272 insertions(+) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h index e69de29bb2..0ac4faa3c7 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h @@ -0,0 +1,272 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
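Before the body of the restored header continues below, note that it keys its CPU/GPU behaviour on the same MGONGPUCPP_GPUIMPL switch used throughout this series. A minimal standalone sketch of that dual-namespace pattern (the CPPProcess body here is a placeholder, not the real class):

// When GpuAbstraction.h defines MGONGPUCPP_GPUIMPL (nvcc or hipcc build), the GPU namespace is used;
// in a plain C++ build the same source lands in the CPU namespace instead.
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  struct CPPProcess { int npar = 4; }; // placeholder body for illustration only
}

int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  using namespace mg5amcGpu;
#else
  using namespace mg5amcCpu;
#endif
  CPPProcess process; // resolves to the CPU or GPU flavour depending on the compiler
  return process.npar == 4 ? 0 : 1;
}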
+ +#ifndef MemoryAccessMomenta_H +#define MemoryAccessMomenta_H 1 + +#include "mgOnGpuConfig.h" + +#include "CPPProcess.h" +#include "MemoryAccessHelpers.h" +#include "MemoryAccessVectors.h" + +#ifdef __CUDACC__ +using mg5amcGpu::CPPProcess; +#else +using mg5amcCpu::CPPProcess; +#endif + +//---------------------------------------------------------------------------- + +// A class describing the internal layout of memory buffers for momenta +// This implementation uses an AOSOA[npagM][npar][np4][neppM] where nevt=npagM*neppM +// [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] +class MemoryAccessMomentaBase //_AOSOAv1 +{ +public: + + // Number of Events Per Page in the momenta AOSOA memory buffer layout + // (these are all best kept as a compile-time constants: see issue #23) +#ifdef __CUDACC__ /* clang-format off */ + // ----------------------------------------------------------------------------------------------- + // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline + // --- This is relevant to ensure coalesced access to momenta in global memory + // --- Note that neppR is hardcoded and may differ from neppM and neppV on some platforms + // ----------------------------------------------------------------------------------------------- + //static constexpr int neppM = 64/sizeof(fptype); // 2x 32-byte GPU cache lines (512 bits): 8 (DOUBLE) or 16 (FLOAT) + static constexpr int neppM = 32/sizeof(fptype); // (DEFAULT) 32-byte GPU cache line (256 bits): 4 (DOUBLE) or 8 (FLOAT) + //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** (slower: 1.03E9 instead of 1.11E9 in eemumu) +#else + // ----------------------------------------------------------------------------------------------- + // --- CPUs: neppM is best set equal to the number of fptype's (neppV) in a vector register + // --- This is relevant to ensure faster access to momenta from C++ memory cache lines + // --- However, neppM is now decoupled from neppV (issue #176) and can be separately hardcoded + // --- In practice, neppR, neppM and neppV could now (in principle) all be different + // ----------------------------------------------------------------------------------------------- +#ifdef MGONGPU_CPPSIMD + static constexpr int neppM = MGONGPU_CPPSIMD; // (DEFAULT) neppM=neppV for optimal performance + //static constexpr int neppM = 64/sizeof(fptype); // maximum CPU vector width (512 bits): 8 (DOUBLE) or 16 (FLOAT) + //static constexpr int neppM = 32/sizeof(fptype); // lower CPU vector width (256 bits): 4 (DOUBLE) or 8 (FLOAT) + //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** (slower: 4.66E6 instead of 5.09E9 in eemumu) + //static constexpr int neppM = MGONGPU_CPPSIMD*2; // FOR TESTS +#else + static constexpr int neppM = 1; // (DEFAULT) neppM=neppV for optimal performance (NB: this is equivalent to AOS) +#endif +#endif /* clang-format on */ + + // SANITY CHECK: check that neppM is a power of two + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + +private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + // The number of components of a 4-momentum + static constexpr int np4 = CPPProcess::np4; + + // The number of particles in this physics process + static constexpr int npar = CPPProcess::npar; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory 
access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagM = ievt / neppM; // #event "M-page" + const int ieppM = ievt % neppM; // #event in the current event M-page + constexpr int ip4 = 0; + constexpr int ipar = 0; + return &( buffer[ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM] ); // AOSOA[ipagM][ipar][ip4][ieppM] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... args" to "const int ip4, const int ipar" and rename "Field" as "Ip4Ipar"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int ip4, + const int ipar ) + { + constexpr int ipagM = 0; + constexpr int ieppM = 0; + return buffer[ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM]; // AOSOA[ipagM][ipar][ip4][ieppM] + } +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on explicit event numbers +// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations +class MemoryAccessMomenta : public MemoryAccessMomentaBase +{ +public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ipar, const int ipar ) <===] + static constexpr auto decodeRecordIp4Ipar = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ipar, const int ipar ) <===] + static constexpr auto decodeRecordIp4IparConst = + MemoryAccessHelper::template decodeRecordConst; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessIp4Ipar( fptype* buffer, const ievt, const int ipar, const int ipar ) <===] + static constexpr auto ieventAccessIp4Ipar = + MemoryAccessHelper::template ieventAccessField; + + // Locate a field (output) in a memory buffer (input) 
from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIp4IparConst( const fptype* buffer, const ievt, const int ipar, const int ipar ) <===] + // DEFAULT VERSION + static constexpr auto ieventAccessIp4IparConst = + MemoryAccessHelper::template ieventAccessFieldConst; + + /* + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIp4IparConst( const fptype* buffer, const ievt, const int ipar, const int ipar ) <===] + // DEBUG VERSION WITH PRINTOUTS + static __host__ __device__ inline const fptype& + ieventAccessIp4IparConst( const fptype* buffer, + const int ievt, + const int ip4, + const int ipar ) + { + const fptype& out = MemoryAccessHelper::template ieventAccessFieldConst( buffer, ievt, ip4, ipar ); + printf( "ipar=%2d ip4=%2d ievt=%8d out=%8.3f\n", ipar, ip4, ievt, out ); + return out; + } + */ +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template +class KernelAccessMomenta +{ +public: + + // Expose selected functions from MemoryAccessMomenta + static constexpr auto ieventAccessRecordConst = MemoryAccessMomenta::ieventAccessRecordConst; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const, SCALAR) ===> fptype& kernelAccessIp4Ipar( fptype* buffer, const int ipar, const int ipar ) <===] + static constexpr auto kernelAccessIp4Ipar = + KernelAccessHelper::template kernelAccessField; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const fptype& kernelAccessIp4IparConst( const fptype* buffer, const int ipar, const int ipar ) <===] + // DEFAULT VERSION + static constexpr auto kernelAccessIp4IparConst_s = + KernelAccessHelper::template kernelAccessFieldConst; + + /* + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const fptype& kernelAccessIp4IparConst( const fptype* buffer, const int ipar, const int ipar ) <===] + // DEBUG VERSION WITH PRINTOUTS + static __host__ __device__ inline const fptype& + kernelAccessIp4IparConst_s( const fptype* buffer, + const int ip4, + const int ipar ) + { + const fptype& out = KernelAccessHelper::template kernelAccessFieldConst( buffer, ip4, ipar ); + printf( "ipar=%2d ip4=%2d ievt='kernel' out=%8.3f\n", ipar, ip4, out ); + return out; + } + */ + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> fptype_sv kernelAccessIp4IparConst( const fptype* buffer, const int ipar, const int ipar ) <===] + // FIXME? Eventually return by const reference and support aligned arrays only? + // FIXME? 
Currently return by value to support also unaligned and arbitrary arrays + static __host__ __device__ inline fptype_sv + kernelAccessIp4IparConst( const fptype* buffer, + const int ip4, + const int ipar ) + { + const fptype& out = kernelAccessIp4IparConst_s( buffer, ip4, ipar ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + constexpr int neppM = MemoryAccessMomentaBase::neppM; + constexpr bool useContiguousEventsIfPossible = true; // DEFAULT + //constexpr bool useContiguousEventsIfPossible = false; // FOR PERFORMANCE TESTS (treat as arbitrary array even if it is an AOSOA) + // Use c++17 "if constexpr": compile-time branching + if constexpr( useContiguousEventsIfPossible && ( neppM >= neppV ) && ( neppM % neppV == 0 ) ) + { + //constexpr bool skipAlignmentCheck = true; // FASTEST (SEGFAULTS IF MISALIGNED ACCESS, NEEDS A SANITY CHECK ELSEWHERE!) + constexpr bool skipAlignmentCheck = false; // DEFAULT: A BIT SLOWER BUT SAFER [ALLOWS MISALIGNED ACCESS] + if constexpr( skipAlignmentCheck ) + { + //static bool first=true; if( first ){ std::cout << "WARNING! assume aligned AOSOA, skip check" << std::endl; first=false; } // SLOWER (5.06E6) + // FASTEST? (5.09E6 in eemumu 512y) + // This assumes alignment for momenta1d without checking - causes segmentation fault in reinterpret_cast if not aligned! + return mg5amcCpu::fptypevFromAlignedArray( out ); // use reinterpret_cast + } + else if( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ) + { + //static bool first=true; if( first ){ std::cout << "WARNING! aligned AOSOA, reinterpret cast" << std::endl; first=false; } // SLOWER (5.00E6) + // DEFAULT! A tiny bit (<1%) slower because of the alignment check (5.07E6 in eemumu 512y) + // This explicitly checks buffer alignment to avoid segmentation faults in reinterpret_cast + return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast + } + else + { + //static bool first=true; if( first ){ std::cout << "WARNING! AOSOA but no reinterpret cast" << std::endl; first=false; } // SLOWER (4.93E6) + // A bit (1%) slower (5.05E6 in eemumu 512y) + // This does not require buffer alignment, but it requires AOSOA with neppM>=neppV and neppM%neppV==0 + return mg5amcCpu::fptypevFromUnalignedArray( out ); // SIMD bulk load of neppV, do not use reinterpret_cast (fewer SIMD operations) + } + } + else + { + //static bool first=true; if( first ){ std::cout << "WARNING! arbitrary array" << std::endl; first=false; } // SLOWER (5.08E6) + // ?!Used to be much slower, now a tiny bit faster for AOSOA?! (5.11E6 for AOSOA, 4.64E6 for AOS in eemumu 512y) + // This does not even require AOSOA with neppM>=neppV and neppM%neppV==0 (e.g. can be used with AOS neppM==1) + constexpr int ievt0 = 0; // just make it explicit in the code that buffer refers to a given ievt0 and decoderIeppV fetches event ievt0+ieppV + auto decoderIeppv = [buffer, ip4, ipar]( int ieppV ) + -> const fptype& + { return MemoryAccessMomenta::ieventAccessIp4IparConst( buffer, ievt0 + ieppV, ip4, ipar ); }; + return mg5amcCpu::fptypevFromArbitraryArray( decoderIeppv ); // iterate over ieppV in neppV (no SIMD) + } +#endif + } + + // Is this a HostAccess or DeviceAccess class? 
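As a worked example of the AOSOA[npagM][npar][np4][neppM] decoding implemented above, here is a self-contained sketch (not part of the patch; neppM=4 is an assumed value for illustration, while npar=4 and np4=4 match this 2-to-2 process):

#include <cstdio>

int main()
{
  constexpr int npar = 4, np4 = 4, neppM = 4; // neppM is assumed here; the real value is platform dependent
  const int ievt = 10, ip4 = 1, ipar = 2;     // component ip4 of the 4-momentum of particle ipar, in event ievt
  const int ipagM = ievt / neppM;             // the "M-page" holding this event (as in ieventAccessRecord)
  const int ieppM = ievt % neppM;             // the event's slot inside that page
  // the combined index arithmetic of ieventAccessRecord and decodeRecord
  const int iflat = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM;
  printf( "AOSOA[%d][%d][%d][%d] -> flat index %d\n", ipagM, ipar, ip4, ieppM, iflat ); // prints flat index 166
  return 0;
}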
+ // [this is only needed for a warning printout in rambo.h for nparf==1 #358] + static __host__ __device__ inline constexpr bool + isOnDevice() + { + return onDevice; + } +}; + +//---------------------------------------------------------------------------- + +typedef KernelAccessMomenta HostAccessMomenta; +typedef KernelAccessMomenta DeviceAccessMomenta; + +//---------------------------------------------------------------------------- + +#endif // MemoryAccessMomenta_H From e8779ebca4eca876419c4e9be483c5af3f200c04 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 7 Jun 2023 10:10:34 +0200 Subject: [PATCH 252/509] Cleanup of sync with master --- epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 22e2f2eeaa..170a3e10d4 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -1,11 +1,7 @@ -<<<<<<< HEAD -#include "GpuAbstraction.h" // Includes required macros for GPU abstraction -======= // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. ->>>>>>> master #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 From e5f107089619947205200448e623c7077614f429 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 7 Jun 2023 10:15:05 +0200 Subject: [PATCH 253/509] More cleanup from sync with master --- .../ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 68ee3be577..f2b8ee972b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -457,13 +457,8 @@ namespace mg5amcCpu { 1, -1, 1, 1 }, { 1, -1, -1, -1 }, { 1, -1, -1, 1 } }; -<<<<<<< HEAD -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cHel, tHel, ncomb * mgOnGpu::npar * sizeof( short ) ); -======= #ifdef __CUDACC__ checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); ->>>>>>> master #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif From a7da6ef3ebc42d3d1d99133699881178374762db Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 7 Jun 2023 10:17:37 +0200 Subject: [PATCH 254/509] Added first round of fixes from sync with master --- epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h | 4 ++-- .../ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc | 3 ++- epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h | 2 +- .../cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h | 2 +- .../ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h | 2 +- .../ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc | 6 +++--- 6 files changed, 10 insertions(+), 9 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index 1bb389fb1b..bb8a7c48f8 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( 
np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -288,7 +288,7 @@ namespace mg5amcCpu } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower gpuLaunchKernel(dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..f17b9c0ad7 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -4,12 +4,13 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h index 7db75dae69..8b47cb4a44 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h index 0ac4faa3c7..d81f83b912 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..949a42066d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index f2b8ee972b..bf6fc2d76a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -17,7 +17,7 @@ #include "mgOnGpuConfig.h" #include "HelAmps_sm.h" -#include "GpuRuntime.h" // for GPU abstraction, checkCuda is 
run on macros defined here
+#include "GpuRuntime.h" // for GPU abstraction, checkGpu is run on macros defined here
 #include "MemoryAccessAmplitudes.h"
 #include "MemoryAccessCouplings.h"
@@ -457,8 +457,8 @@ namespace mg5amcCpu
       { 1, -1, 1, 1 },
       { 1, -1, -1, -1 },
       { 1, -1, -1, 1 } };
-#ifdef __CUDACC__
-    checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) );
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) );
 #else
     memcpy( cHel, tHel, ncomb * npar * sizeof( short ) );
 #endif

From 5459bbce211714120c0bc1d709f5d013928545f3 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Wed, 7 Jun 2023 10:20:47 +0200
Subject: [PATCH 255/509] Added back include for abstraction in mgOnGpuConfig.h

---
 epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h
index 170a3e10d4..e693d99a8c 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h
@@ -3,6 +3,8 @@
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 
+#include "GpuAbstraction.h"
+
 #ifndef MGONGPUCONFIG_H
 #define MGONGPUCONFIG_H 1
 
From abbc9affcfb53659814161a752683a3435b5b3ed Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Wed, 7 Jun 2023 10:54:17 +0200
Subject: [PATCH 256/509] Added some fixes

---
 .../cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 10 +++++++++-
 .../ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h    |  2 +-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h
index 67652fae6c..bd164293b8 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h
@@ -3,6 +3,14 @@
 #include 

 #ifdef __CUDACC__
+  #define MGONGPUCPP_CUDACC 1
+#endif
+
+#ifdef __HIPCC__
+  #define MGONGPUCPP_HIPCC 1
+#endif
+
+#ifdef MGONGPUCPP_CUDACC

   // Defines correct compiler
   #define MGONGPUCPP_GPUIMPL __CUDACC__
@@ -34,7 +42,7 @@

 //--------------------------------------------------------------------------

-#elif defined(__HIPCC__)
+#elif defined MGONGPUCPP_HIPCC

   // Defines correct compiler
   #define MGONGPUCPP_GPUIMPL __HIPCC__

diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h
index d81f83b912..f797f85ca5 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h
@@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1
 
   // Number of Events Per Page in the momenta AOSOA memory buffer layout
   // (these are all best kept as a compile-time constants: see issue #23)
-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
   // -----------------------------------------------------------------------------------------------
   // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline
   // --- This is relevant to ensure coalesced access to momenta in global memory

From abbc9affcfb53659814161a752683a3435b5b3ed Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Wed, 7 Jun 2023 11:10:32 +0200
Subject: [PATCH 257/509] Made a change to the 
cudaccpp.mk file --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index efe8e0ffd6..ff0e243188 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -89,7 +89,9 @@ endif #------------------------------------------------------------------------------- -ifeq ($(COMPILER),cuda) +CUDA_COMPILER := $(shell which nvcc > /dev/null 2>&1; echo $$?) + +ifeq ($(CUDA_COMPILER),1) #=== Configure the CUDA compiler # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) @@ -153,7 +155,7 @@ endif CUFLAGS += -allow-unsupported-compiler endif -else ifeq ($(variable),hip) +else #=== Configure the HIP compiler NVCC=/usr/bin/hipcc From 58174f6e8b6bb4432a89d4749a8bf6d1747cf44d Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 7 Jun 2023 11:16:25 +0200 Subject: [PATCH 258/509] Made small fix to makefile --- .../ee_mumu.mad/SubProcesses/cudacpp.mk | 78 +++++++++---------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index ff0e243188..5cc9c9f3b3 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -91,7 +91,7 @@ endif CUDA_COMPILER := $(shell which nvcc > /dev/null 2>&1; echo $$?) -ifeq ($(CUDA_COMPILER),1) +ifeq ($(CUDA_COMPILER),0) #=== Configure the CUDA compiler # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) @@ -107,44 +107,44 @@ ifeq ($(CUDA_COMPILER),1) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif + # Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + NVCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+ CUOPTFLAGS = -lineinfo + CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override NVCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif # Set the host C++ compiler for nvcc via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) From a46b3f94bfc2c9ca57dec2090ce7472d7dbc577d Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 14 Jun 2023 16:29:38 +0200 Subject: [PATCH 259/509] Added compilation for HIP in makefile --- .../ee_mumu.mad/SubProcesses/cudacpp.mk | 132 ++++++++++++------ 1 file changed, 87 insertions(+), 45 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 5cc9c9f3b3..99db19a1dc 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -101,17 +101,17 @@ ifeq ($(CUDA_COMPILER),0) override CUDA_HOME=disabled endif - # If CUDA_HOME is not set, try to set it from the location of nvcc + # If CUDA_HOME is not set, try to set it from the location of GPUCC ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif - # Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-GPUCC/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). # Embed device code for 70, and PTX for 70+. 
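The CUDA/HIP split that this patch builds up in the makefile mirrors the preprocessor-side detection added to GpuAbstraction.h in patch 256 above. A condensed sketch of that detection follows; the gpuMemcpyToSymbol mapping is an assumption about how the full header presumably continues, since only its opening lines appear in this series:

// nvcc predefines __CUDACC__ and hipcc predefines __HIPCC__, so the header can tell the two apart
#ifdef __CUDACC__
#define MGONGPUCPP_CUDACC 1
#endif
#ifdef __HIPCC__
#define MGONGPUCPP_HIPCC 1
#endif

#ifdef MGONGPUCPP_CUDACC
#define MGONGPUCPP_GPUIMPL __CUDACC__
#define gpuMemcpyToSymbol( dst, src, bytes ) checkGpu( cudaMemcpyToSymbol( dst, src, bytes ) ) // assumed mapping
#elif defined MGONGPUCPP_HIPCC
#define MGONGPUCPP_GPUIMPL __HIPCC__
#define gpuMemcpyToSymbol( dst, src, bytes ) checkGpu( hipMemcpyToSymbol( dst, src, bytes ) ) // assumed mapping
#endif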
@@ -127,7 +127,7 @@ ifeq ($(CUDA_COMPILER),0) CUOPTFLAGS = -lineinfo CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) @@ -136,17 +136,17 @@ ifeq ($(CUDA_COMPILER),0) ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) else ifneq ($(origin REQUIRE_CUDA),undefined) # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) else # No cuda. Switch cuda compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= endif - # Set the host C++ compiler for nvcc via "-ccbin " + # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) @@ -158,7 +158,49 @@ ifeq ($(CUDA_COMPILER),0) else #=== Configure the HIP compiler - NVCC=/usr/bin/hipcc + # If HIP_HOME is not set, try to set it from the location of GPUCC + ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") + endif + + HIP_HOME=/opt/rocm-5.4.3/ + + # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists + ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + # Should maybe find something equivelant to this in HIP + #USE_NVTX ?=-DUSE_NVTX + + #MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = gfx90a + CUINC = -I$(HIP_HOME)/include/ + CURANDLIBFLAGS = -L$(HIP_HOME)/lib64/ # NB: -lcuda is not needed here! 
+ CUOPTFLAGS = -lineinfo + CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + endif @@ -173,9 +215,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif @@ -209,10 +251,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -263,7 +305,7 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -431,7 +473,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -462,14 +504,14 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ endif # Generic target and build rules: objects from C++ compilation @@ -482,7 +524,7 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math endif endif @@ -499,9 +541,9 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) CUFLAGS += -Xcompiler -Wno-deprecated-builtins endif endif @@ -510,7 +552,7 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) +###ifneq ($(GPUCC),) ###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -538,7 +580,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -550,11 +592,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -571,16 +613,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile 
with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -606,17 +648,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -628,7 +670,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -641,7 +683,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) 
$(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -653,12 +695,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -682,14 +724,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -792,9 +834,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -813,7 +855,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] 
-ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck From ce8a20cf4f20bc13504f2a73c1f8dbbd2ce71b93 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 14 Jun 2023 16:33:23 +0200 Subject: [PATCH 260/509] Removed typo --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 99db19a1dc..980c34777e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -168,7 +168,7 @@ else # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc + GPUCC = $(HIP_HOME)/bin/hipcc # Should maybe find something equivelant to this in HIP #USE_NVTX ?=-DUSE_NVTX From 7830e206e771f2f9f581f0461559a49c4ba0f790 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 14 Jun 2023 16:37:58 +0200 Subject: [PATCH 261/509] Tweaked some HIP compiler flags --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 980c34777e..a7d55a4f92 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -164,7 +164,7 @@ else $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif - HIP_HOME=/opt/rocm-5.4.3/ + HIP_HOME=/opt/rocm-5.4.3 # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) @@ -176,10 +176,10 @@ else ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - CUARCHFLAGS = gfx90a + CUARCHFLAGS = --genco --offload-arch=gfx90a CUINC = -I$(HIP_HOME)/include/ CURANDLIBFLAGS = -L$(HIP_HOME)/lib64/ # NB: -lcuda is not needed here! 
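
For context on the CUARCHFLAGS change above: hipcc has no -gencode/compute-capability machinery; --offload-arch names the AMD GPU ISA directly, and gfx90a targets the CDNA2 MI200 series (MI210/MI250/MI250X). An illustrative standalone compile with the flags being assembled here (hypothetical file name, assuming a ROCm install):

    # illustrative hipcc invocation, not a line from the makefile itself
    hipcc -O3 -std=c++17 -I. -I/opt/rocm-5.4.3/include/ --offload-arch=gfx90a -fPIC -c gCPPProcess.cc -o gCPPProcess.o
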
- CUOPTFLAGS = -lineinfo + #CUOPTFLAGS = -lineinfo CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(CUARCHFLAGS) -use_fast_math ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) From 822816d4f289ac6e200b0c3f95aaf61b066049de Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 14 Jun 2023 16:45:36 +0200 Subject: [PATCH 262/509] Changed __HIPCC__ to __HCC__ in GPU abstraction --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index bd164293b8..7a9109a71c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -6,7 +6,7 @@ #define MGONGPUCPP_CUDACC 1 #endif -#ifdef __HIPCC__ +#ifdef __HCC__ #define MGONGPUCPP_HIPCC 1 #endif From 0307eba22f823dc86946d27d753b8beac44f17df Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 14 Jun 2023 17:02:16 +0200 Subject: [PATCH 263/509] Changed typo in file --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 7a9109a71c..ac0d2b4d47 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -45,7 +45,7 @@ #elif defined MGONGPUCPP_HIPCC // Defines correct compiler - #define MGONGPUCPP_GPUIMPL __HIPCC__ + #define MGONGPUCPP_GPUIMPL __HCC__ //-------------------------------------------------------------------------- From 1ceace1e136c49b5ed8fa047fd53a4d6d07adc5b Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 14 Jun 2023 17:32:27 +0200 Subject: [PATCH 264/509] Testing some defines --- epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h index 866b7640f6..2fa7335404 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPUCPP_CUDACC #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef MGONGPUCPP_CUDACC // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX From 1043b458ed95da48b1becb406311752d21e6c41a Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 14 Jun 2023 17:35:01 +0200 Subject: [PATCH 265/509] Added define to ignore error for testing --- epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h | 1 + 1 file changed, 1 insertion(+) diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h index 2fa7335404..c3b2f3fb57 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h @@ -19,6 +19,7 @@ #include // Complex type in cuda: 
thrust or cucomplex or cxsmpl +#define THRUST_IGNORE_CUB_VERSION_CHECK #ifdef MGONGPUCPP_CUDACC #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push From 28afd0bcc8472226868b67652eccd8df32d0fd5e Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 14 Jun 2023 18:16:00 +0200 Subject: [PATCH 266/509] Testing some things in makefile --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index a7d55a4f92..d2d83d7985 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -161,10 +161,11 @@ else # If HIP_HOME is not set, try to set it from the location of GPUCC ifndef HIP_HOME HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") + #$(warning HIP_HOME was not set: using "$(HIP_HOME)") endif HIP_HOME=/opt/rocm-5.4.3 + $(warning HIP_HOME was not set: using "$(HIP_HOME)") # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) @@ -511,8 +512,9 @@ $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) From 4486ffd1d302a9ae276133757eef8aaa9d3e5c1c Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 10:57:33 +0200 Subject: [PATCH 267/509] Testing removing rpath --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index d2d83d7985..c438690ea2 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -461,7 +461,8 @@ else override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CULIBFLAGSRPATH2 = + #-Xlinker -rpath,'$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) From bc1c26dddc9489c854e782c20c255b9f2df34a10 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 11:28:45 +0200 Subject: [PATCH 268/509] Added specified target in makefile --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index c438690ea2..c41ff60358 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -177,7 +177,7 @@ else ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU 
support #533 ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - CUARCHFLAGS = --genco --offload-arch=gfx90a + CUARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a CUINC = -I$(HIP_HOME)/include/ CURANDLIBFLAGS = -L$(HIP_HOME)/lib64/ # NB: -lcuda is not needed here! #CUOPTFLAGS = -lineinfo From 66566f4af8275b700e71591376caa81b417a9d08 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 11:36:22 +0200 Subject: [PATCH 269/509] Added check for compiling without curand when compiling with HIP --- .../ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc | 1 - epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 8 +++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc index 219fae8dda..9d0bbe84d7 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -5,7 +5,6 @@ #include "CommonRandomNumbers.h" #include "GpuRuntime.h" -#include "CudaRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index e693d99a8c..4388c4df2a 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -17,9 +17,11 @@ // Choose if curand is supported for generating random numbers // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND -#ifdef MGONGPUCPP_GPUIMPL -#undef MGONGPU_HAS_NO_CURAND -#else +// Added support for HIP compilation by defining MGONGPU_HAS_NO_CURAND +#ifdef MGONGPUCPP_CUDACC +#undef MGONGPU_HAS_NO_CURAND +#else defined MGONGPUCPP_HIPCC +#define MGONGPU_HAS_NO_CURAND //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 #endif From 8106e420692a8039804b5870d77747baf079306c Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 11:40:20 +0200 Subject: [PATCH 270/509] Removed rpath for testing --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index c41ff60358..1495f79f50 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -458,7 +458,8 @@ ifeq ($(UNAME_S),Darwin) else # RPATH to cuda/cpp libs when linking executables override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CULIBFLAGSRPATH = + # -Xlinker -rpath,$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' override CULIBFLAGSRPATH2 = From d698edaa11ed6ae11d7bf2976309222d84864f3c Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 14:13:12 +0200 Subject: [PATCH 271/509] Added some testing in abstraction --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index ac0d2b4d47..b3f53d3b19 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -8,6 +8,7 @@ #ifdef __HCC__ #define MGONGPUCPP_HIPCC 1 + #warning HCC Defined! #endif #ifdef MGONGPUCPP_CUDACC @@ -46,6 +47,7 @@ // Defines correct compiler #define MGONGPUCPP_GPUIMPL __HCC__ + #warning MGONGPUCPP_GPUIMPL defined to __HCC__! 
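
The #warning probes above stay silent because hipcc never defines __HCC__: that macro belonged to AMD's retired HCC compiler, while the HIP-Clang toolchain predefines __HIPCC__ (the spelling this series converges on a few commits later). A minimal sketch of compiler detection that works under both toolchains:

    // sketch only: nvcc predefines __CUDACC__, hipcc predefines __HIPCC__
    #if defined(__CUDACC__)
    #define MGONGPUCPP_CUDACC 1
    #elif defined(__HIPCC__)
    #define MGONGPUCPP_HIPCC 1
    #endif
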
//-------------------------------------------------------------------------- From a10140f4c70db9a86be3905ea1f173982b6785b7 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 14:20:30 +0200 Subject: [PATCH 272/509] More testing --- epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 4388c4df2a..ffbdf87e45 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -20,7 +20,7 @@ // Added support for HIP compilation by defining MGONGPU_HAS_NO_CURAND #ifdef MGONGPUCPP_CUDACC #undef MGONGPU_HAS_NO_CURAND -#else defined MGONGPUCPP_HIPCC +#else if defined MGONGPUCPP_HIPCC #define MGONGPU_HAS_NO_CURAND //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 From ebc413adc62c82b0c93cd49d221b174aa0808a54 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 14:21:45 +0200 Subject: [PATCH 273/509] More testing --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index b3f53d3b19..e788c758da 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -2,10 +2,12 @@ #include +#warning Before __CUDACC__ definition! #ifdef __CUDACC__ #define MGONGPUCPP_CUDACC 1 #endif +#warning Before __HCC__ definition! #ifdef __HCC__ #define MGONGPUCPP_HIPCC 1 #warning HCC Defined! From 33b69710d14cec282652a7ce8f5c462290bde9f4 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 14:27:56 +0200 Subject: [PATCH 274/509] Testing --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 1 + 1 file changed, 1 insertion(+) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index e788c758da..d9fbd61c9b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -1,6 +1,7 @@ // GpuAbstraction.h #include +#include "hip_runtime.h" #warning Before __CUDACC__ definition! #ifdef __CUDACC__ From a68b25c1afad6b43d6d039325d79d5e5a3f257d7 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 14:30:12 +0200 Subject: [PATCH 275/509] Testing 2 --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index d9fbd61c9b..88b47c3130 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -1,7 +1,7 @@ // GpuAbstraction.h #include -#include "hip_runtime.h" +#include #warning Before __CUDACC__ definition! 
#ifdef __CUDACC__ From fcd7a42ec96ea8329e416c99a66b6bf2c5c93462 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 14:31:31 +0200 Subject: [PATCH 276/509] Testing 3 --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 88b47c3130..707f6c886b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -1,7 +1,7 @@ // GpuAbstraction.h #include -#include +#include #warning Before __CUDACC__ definition! #ifdef __CUDACC__ From 6c0b4700ed8f288cb0d554d27cb579ad6517a160 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 14:33:19 +0200 Subject: [PATCH 277/509] Testing 4 --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 707f6c886b..8e7ddef7d2 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -3,13 +3,12 @@ #include #include -#warning Before __CUDACC__ definition! #ifdef __CUDACC__ #define MGONGPUCPP_CUDACC 1 #endif #warning Before __HCC__ definition! -#ifdef __HCC__ +#ifdef __HIP_PLATFORM_HCC___ #define MGONGPUCPP_HIPCC 1 #warning HCC Defined! #endif From bfc278273f26e5b492b48f169d030a9533df1395 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 14:49:46 +0200 Subject: [PATCH 278/509] Testing 6 --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 8e7ddef7d2..e4c8b38dae 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -8,7 +8,7 @@ #endif #warning Before __HCC__ definition! -#ifdef __HIP_PLATFORM_HCC___ +#ifdef __HIPCC__ #define MGONGPUCPP_HIPCC 1 #warning HCC Defined! #endif From fef4e8d9497a62e72a7bb4fb4e0f6edb623fd289 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 14:55:21 +0200 Subject: [PATCH 279/509] Removing lines for testing --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index e4c8b38dae..2a04020157 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -1,16 +1,13 @@ // GpuAbstraction.h #include -#include #ifdef __CUDACC__ #define MGONGPUCPP_CUDACC 1 #endif -#warning Before __HCC__ definition! #ifdef __HIPCC__ #define MGONGPUCPP_HIPCC 1 - #warning HCC Defined! 
#endif #ifdef MGONGPUCPP_CUDACC From 0fcea2bf7b44177505f1ae88bbc4fe59469fa03f Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 15:00:38 +0200 Subject: [PATCH 280/509] Added small changes for HIP compilation --- epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h | 4 ++-- epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index bb8a7c48f8..1ff661c20a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -284,11 +284,11 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower gpuLaunchKernel(dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h index c3b2f3fb57..e7797cc33d 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#define THRUST_IGNORE_CUB_VERSION_CHECK +// #define THRUST_IGNORE_CUB_VERSION_CHECK #ifdef MGONGPUCPP_CUDACC #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push @@ -33,7 +33,7 @@ #endif #else // Complex type in c++: std::complex or cxsmpl -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPUCPP_HIPCC #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL From 7905854c2d5e5f80c457df588f2e8f1a7d666a94 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 15:04:39 +0200 Subject: [PATCH 281/509] Removed more testing lines --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 2a04020157..a914d5661e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -46,7 +46,6 @@ // Defines correct compiler #define MGONGPUCPP_GPUIMPL __HCC__ - #warning MGONGPUCPP_GPUIMPL defined to __HCC__! //-------------------------------------------------------------------------- @@ -71,9 +70,9 @@ #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel(kernel, blocks, threads, sharedMemSize, ...) \ - hipLaunchKernelGGL(kernel, blocks, threads, __VA_ARGS__); + hipLaunchKernelGGL(kernel, blocks, threads, __VA_ARGS__) #define gpuLaunchKernelSharedMem(kernel, blocks, threads, ...) 
\
-    hipLaunchKernelGGL(kernel, blocks, threads, sharedMemSize, __VA_ARGS__);
+    hipLaunchKernelGGL(kernel, blocks, threads, sharedMemSize, __VA_ARGS__)

 #endif
\ No newline at end of file

From a97d8dfd4834e24a3f785e337c1b4c41854f9547 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Thu, 15 Jun 2023 15:05:44 +0200
Subject: [PATCH 282/509] Removed typo

---
 epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h
index ffbdf87e45..67a9e01db3 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h
@@ -20,7 +20,7 @@
 // Added support for HIP compilation by defining MGONGPU_HAS_NO_CURAND
 #ifdef MGONGPUCPP_CUDACC
 #undef MGONGPU_HAS_NO_CURAND
-#else if defined MGONGPUCPP_HIPCC
+#elif defined MGONGPUCPP_HIPCC
 #define MGONGPU_HAS_NO_CURAND
 //#undef MGONGPU_HAS_NO_CURAND // default
 ////#define MGONGPU_HAS_NO_CURAND 1
 #endif

From 95bd9ccfd988f33db1a38f92f888b80baa6b952a Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Thu, 15 Jun 2023 15:27:29 +0200
Subject: [PATCH 283/509] Added back hip_runtime.h in abstraction

---
 epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h
index a914d5661e..e5c6f31573 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h
@@ -7,6 +7,7 @@
 #endif

 #ifdef __HIPCC__
+  #include "hip_runtime.h"
   #define MGONGPUCPP_HIPCC 1
 #endif

From eb19aeea827527f5724ba6f54cec3b4dc0b9b45a Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Thu, 15 Jun 2023 15:28:02 +0200
Subject: [PATCH 284/509] Fixed typo

---
 epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h
index e5c6f31573..e8956307ef 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h
@@ -7,7 +7,7 @@
 #endif

 #ifdef __HIPCC__
-  #include "hip_runtime.h"
+  #include "hip/hip_runtime.h"
   #define MGONGPUCPP_HIPCC 1
 #endif

From 5783902bebdf4251ff08a511b8a5034068456b2c Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Thu, 15 Jun 2023 15:33:26 +0200
Subject: [PATCH 285/509] Fixed an ifdef that should be CUDA only

---
 epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h
index 67a9e01db3..99e8d3a11a 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h
@@ -63,14 +63,14 @@
 #endif

 // Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE)
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef MGONGPUCPP_CUDACC
 #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float)
 //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float)
 //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
 #endif

 // Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef MGONGPUCPP_CUDACC
 #undef MGONGPU_NSIGHT_DEBUG // default
 //#define
MGONGPU_NSIGHT_DEBUG 1
 #endif

From 7289a94049368a58d4f04b5fd9add914611342b7 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Thu, 15 Jun 2023 15:44:38 +0200
Subject: [PATCH 286/509] Added fixes for complex number ifdefs

---
 epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h  | 3 ++-
 epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h
index 99e8d3a11a..210d04ac36 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h
@@ -66,7 +66,8 @@
 #ifdef MGONGPUCPP_CUDACC
 #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float)
 //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float)
-//#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
+#elif defined MGONGPUCPP_HIPCC
+#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
 #endif

 // Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h
index e7797cc33d..eb395001f1 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h
@@ -20,7 +20,7 @@
 // Complex type in cuda: thrust or cucomplex or cxsmpl
 // #define THRUST_IGNORE_CUB_VERSION_CHECK
-#ifdef MGONGPUCPP_CUDACC
+#ifdef MGONGPUCPP_GPUIMPL
 #if defined MGONGPU_CUCXTYPE_THRUST
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661)
@@ -33,7 +33,7 @@
 #endif
 #else
 // Complex type in c++: std::complex or cxsmpl
-#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPUCPP_HIPCC
+#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX
 #include <complex>
 #elif not defined MGONGPU_CPPCXTYPE_CXSMPL
 #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL
@@ -236,7 +236,7 @@ using mgOnGpu::cxtype;
 // COMPLEX TYPES: (PLATFORM-SPECIFIC) FUNCTIONS AND OPERATORS
 //==========================================================================

-#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL or MGONGPUCPP_HIPCC

 //------------------------------
 // CUDA or C++ - using cxsmpl

From 665d37ca6dc9741e8051dc4259d2a0cf47bd87d3 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Thu, 15 Jun 2023 15:55:57 +0200
Subject: [PATCH 287/509] Testing in abstraction header

---
 epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h
index e8956307ef..8a5ba82b4d 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h
@@ -70,10 +70,13 @@
   #define gpuDeviceSynchronize hipDeviceSynchronize
   #define gpuDeviceReset hipDeviceReset

-  #define gpuLaunchKernel(kernel, blocks, threads, sharedMemSize, ...) \
+  #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__)
+  #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__)
+
+  //#define gpuLaunchKernel(kernel, blocks, threads, sharedMemSize, ...)
\ hipLaunchKernelGGL(kernel, blocks, threads, __VA_ARGS__) - #define gpuLaunchKernelSharedMem(kernel, blocks, threads, ...) \ + //#define gpuLaunchKernelSharedMem(kernel, blocks, threads, ...) \ hipLaunchKernelGGL(kernel, blocks, threads, sharedMemSize, __VA_ARGS__) #endif \ No newline at end of file From 22b1142878811d14d6447d4ac2814150536fc1e1 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 15:58:39 +0200 Subject: [PATCH 288/509] Added some fixes --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 6 ++---- epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 8a5ba82b4d..aa64587b6f 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -73,10 +73,8 @@ #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<>> (__VA_ARGS__) #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<>>(__VA_ARGS__) - //#define gpuLaunchKernel(kernel, blocks, threads, sharedMemSize, ...) \ - hipLaunchKernelGGL(kernel, blocks, threads, __VA_ARGS__) + //#define gpuLaunchKernel(kernel, blocks, threads, sharedMemSize, ...) hipLaunchKernelGGL(kernel, blocks, threads, __VA_ARGS__) - //#define gpuLaunchKernelSharedMem(kernel, blocks, threads, ...) \ - hipLaunchKernelGGL(kernel, blocks, threads, sharedMemSize, __VA_ARGS__) + //#define gpuLaunchKernelSharedMem(kernel, blocks, threads, ...) hipLaunchKernelGGL(kernel, blocks, threads, sharedMemSize, __VA_ARGS__) #endif \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 210d04ac36..c18e7c3dfa 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -21,7 +21,7 @@ #ifdef MGONGPUCPP_CUDACC #undef MGONGPU_HAS_NO_CURAND #elif defined MGONGPUCPP_HIPCC -#define MGONGPU_HAS_NO_CURAND +#define MGONGPU_HAS_NO_CURAND 1 //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 #endif From dc8290958cfbbdf89d87e51d702e0048324b9a2e Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 16:01:58 +0200 Subject: [PATCH 289/509] Added some more fixes in curand --- .../ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc index 9d0bbe84d7..e78605042f 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -10,7 +10,7 @@ #include -#ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ +#ifndef MGONGPU_HAS_NO_CURAND and MGONGPUCPP_HIPCC /* clang-format off */ #include "curand.h" #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) @@ -23,7 +23,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPUCPP_CUDACC namespace mg5amcGpu #else namespace mg5amcCpu From e77172d588d40f7c8ebdc31a9845ab08089fd777 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 16:03:08 +0200 Subject: 
[PATCH 290/509] Added some testing lines --- .../ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc index e78605042f..c6a6f75f2e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -10,7 +10,11 @@ #include -#ifndef MGONGPU_HAS_NO_CURAND and MGONGPUCPP_HIPCC /* clang-format off */ +#ifdef MGONGPU_HAS_NO_CURAND +#warning MGONGPU_HAS_NO_CURAND is DEFINED! +#endif + +#ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ #include "curand.h" #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) From 9884b82b7d33974197deb775cfd04a463d553589 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 16:07:58 +0200 Subject: [PATCH 291/509] Added some testing lines --- .../ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc index c6a6f75f2e..90b5e63e65 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -14,6 +14,7 @@ #warning MGONGPU_HAS_NO_CURAND is DEFINED! #endif +#ifdef MGONGPUCPP_CUDACC #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ #include "curand.h" #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } @@ -41,7 +42,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPUCPP_CUDACC if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -136,4 +137,5 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #endif +#endif } From 7eb0a02e6fc0b78b3b2e1319f2887a78641da59d Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 16:08:44 +0200 Subject: [PATCH 292/509] Fixed syntax error --- .../ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc index 90b5e63e65..f5667bb588 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -137,5 +137,5 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #endif -#endif } +#endif \ No newline at end of file From e1fd37ef9e7a2e3b25f22f919fa15b87744fa942 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 16:11:04 +0200 Subject: [PATCH 293/509] Added some syntax fixes --- .../cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc index b00a9515af..c2c3f53b73 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc +++ 
b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc @@ -103,7 +103,7 @@ main( int argc, char** argv ) CurandHost = 1, CurandDevice = 2 }; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPUCPP_CUDACPP RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand From 44aa73d1f4fd07ac73166ec2b4b6d6f9cbff8a9c Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 16:16:46 +0200 Subject: [PATCH 294/509] Added some more syntax changes --- .../ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc index c2c3f53b73..f339ad96a6 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc @@ -146,7 +146,7 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPUCPP_CUDACC rndgen = RandomNumberMode::CurandDevice; #else throw std::runtime_error( "CurandDevice is not supported on CPUs" ); @@ -395,7 +395,7 @@ main( int argc, char** argv ) const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPUCPP_CUDACC else { const bool onDevice = true; From 0a29cff6cba86a1aca234257e708c245f8f921e6 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 16:19:59 +0200 Subject: [PATCH 295/509] Added some more testing stuff --- .../cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc index f339ad96a6..6496be22b1 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc @@ -390,6 +390,7 @@ main( int argc, char** argv ) prnk.reset( new CommonRandomNumberKernel( hstRndmom ) ); } #ifndef MGONGPU_HAS_NO_CURAND + #warning THIS SHOULD NOT TRIGGER!!! else if( rndgen == RandomNumberMode::CurandHost ) { const bool onDevice = false; From dd08b7b6046998ec337af0f6c4fbea8691affba7 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 16:33:05 +0200 Subject: [PATCH 296/509] Added some testing lines --- epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index c18e7c3dfa..7b3f4ac428 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -19,9 +19,12 @@ // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND // Added support for HIP compilation by defining MGONGPU_HAS_NO_CURAND #ifdef MGONGPUCPP_CUDACC +#warning CUDACC IS DEFINED! #undef MGONGPU_HAS_NO_CURAND #elif defined MGONGPUCPP_HIPCC +#warning HIPCC IS DEFINED! #define MGONGPU_HAS_NO_CURAND 1 +#warning CURANC IFDEF IS DEFINED! 
//#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 #endif From 6accaea0265be9fcf71f175d0980c441896989c5 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 16:44:01 +0200 Subject: [PATCH 297/509] Testing 8 --- epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 7b3f4ac428..5de6c02c5b 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -24,7 +24,7 @@ #elif defined MGONGPUCPP_HIPCC #warning HIPCC IS DEFINED! #define MGONGPU_HAS_NO_CURAND 1 -#warning CURANC IFDEF IS DEFINED! +#warning MGONGPU_HAS_NO_CURAND = #MGONGPU_HAS_NO_CURAND //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 #endif From 01319ec8a2a1d73f02a560c1603b21627c588a74 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 16:46:41 +0200 Subject: [PATCH 298/509] Testing 9 --- epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 5de6c02c5b..9bec8a58d4 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -24,7 +24,8 @@ #elif defined MGONGPUCPP_HIPCC #warning HIPCC IS DEFINED! #define MGONGPU_HAS_NO_CURAND 1 -#warning MGONGPU_HAS_NO_CURAND = #MGONGPU_HAS_NO_CURAND +#define STRINGIFY(x) #x +#pragma message "MGONGPU_HAS_NO_CURAND =" STRINGIFY(MGONGPU_HAS_NO_CURAND) //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 #endif From d48c4aba45375e11059e51e1989720ba2d4f7f5e Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 15 Jun 2023 16:53:34 +0200 Subject: [PATCH 299/509] Testing 10 --- .../cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc index 6496be22b1..28a2639219 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc @@ -106,8 +106,10 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_CUDACPP RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND + #warning Using CurandHost RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else + #warning Using CommonRandom RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) 
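
A remark on the STRINGIFY probe from patch 298 above: with a single-level #define STRINGIFY(x) #x, the argument is stringized before macro expansion, so the pragma prints the literal text "MGONGPU_HAS_NO_CURAND" rather than its value. Showing the expanded value takes the usual two-level helper, sketched here with hypothetical names:

    // expand first, then stringize: prints MGONGPU_HAS_NO_CURAND = 1
    #define MGONGPU_STR2(x) #x
    #define MGONGPU_STR(x) MGONGPU_STR2(x)
    #pragma message "MGONGPU_HAS_NO_CURAND = " MGONGPU_STR(MGONGPU_HAS_NO_CURAND)
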
From 20932b8a2321b2551a78f423ba65406589f5b8ff Mon Sep 17 00:00:00 2001
From: Jorgen Teig
Date: Fri, 16 Jun 2023 16:05:15 +0200
Subject: [PATCH 300/509] Removed compilation warnings and fixed the makefile
 for HIP compilation

---
 .../SubProcesses/CurandRandomNumberKernel.cc  |  8 +-
 .../ee_mumu.mad/SubProcesses/GpuAbstraction.h |  6 +-
 .../P1_epem_mupmum/.make_test.lock            |  0
 .../SubProcesses/P1_epem_mupmum/check_sa.cc   |  4 +-
 .../ee_mumu.mad/SubProcesses/cudacpp.mk       | 91 ++++++++++---------
 .../cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h   |  8 +-
 6 files changed, 54 insertions(+), 63 deletions(-)
 create mode 100644 epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/.make_test.lock

diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc
index f5667bb588..82dbcbabc4 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc
@@ -10,11 +10,6 @@
 #include

-#ifdef MGONGPU_HAS_NO_CURAND
-#warning MGONGPU_HAS_NO_CURAND is DEFINED!
-#endif
-
-#ifdef MGONGPUCPP_CUDACC
 #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */
 #include "curand.h"
 #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); }
@@ -137,5 +132,4 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------

 #endif
-}
-#endif
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h
index aa64587b6f..f930874922 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h
@@ -64,7 +64,7 @@
   #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( hipMemcpyToSymbol(type1, type2, size) )

   #define gpuFree(ptr) checkGpu( hipFree(ptr) )
-  #define gpuFreeHost(ptr) checkGpu( hipFreeHost(ptr) )
+  #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) )

   #define gpuSetDevice hipSetDevice
   #define gpuDeviceSynchronize hipDeviceSynchronize
@@ -73,8 +73,4 @@
   #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__)
   #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__)

-  //#define gpuLaunchKernel(kernel, blocks, threads, sharedMemSize, ...) hipLaunchKernelGGL(kernel, blocks, threads, __VA_ARGS__)
-
-  //#define gpuLaunchKernelSharedMem(kernel, blocks, threads, ...)
hipLaunchKernelGGL(kernel, blocks, threads, sharedMemSize, __VA_ARGS__) - #endif \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/.make_test.lock b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/.make_test.lock new file mode 100644 index 0000000000..e69de29bb2 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc index 28a2639219..b8934356d5 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc @@ -106,10 +106,8 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_CUDACPP RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND - #warning Using CurandHost RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - #warning Using CommonRandom RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) @@ -392,7 +390,7 @@ main( int argc, char** argv ) prnk.reset( new CommonRandomNumberKernel( hstRndmom ) ); } #ifndef MGONGPU_HAS_NO_CURAND - #warning THIS SHOULD NOT TRIGGER!!! + else if( rndgen == RandomNumberMode::CurandHost ) { const bool onDevice = false; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 1495f79f50..3a41d1eb51 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -90,12 +90,13 @@ endif #------------------------------------------------------------------------------- CUDA_COMPILER := $(shell which nvcc > /dev/null 2>&1; echo $$?) +HIP_COMPILER := $(shell which hipcc > /dev/null 2>&1; echo $$?) ifeq ($(CUDA_COMPILER),0) #=== Configure the CUDA compiler # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "CUFLAGS += -ccbin " below + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled @@ -125,15 +126,15 @@ ifeq ($(CUDA_COMPILER),0) CUINC = -I$(CUDA_HOME)/include/ CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
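# (Aside, not part of the hunk: the CUDA_COMPILER/HIP_COMPILER probes added
# earlier in this file work because `echo $$?` captures the exit status of
# `which`, so the variable holds the string "0" exactly when the tool is on
# PATH. A minimal sketch of the idiom, with an illustrative variable name:
#   HAVE_NVCC := $(shell which nvcc > /dev/null 2>&1; echo $$?)
#   ifeq ($(HAVE_NVCC),0)
#     GPUCC = nvcc
#   endif
# Note the doubled $$? so that make passes a literal $? through to the shell.)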
CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -Xcompiler -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) else ifneq ($(origin REQUIRE_CUDA),undefined) # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) @@ -148,48 +149,45 @@ ifeq ($(CUDA_COMPILER),0) # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) - CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - CUFLAGS += -allow-unsupported-compiler + GPUFLAGS += -allow-unsupported-compiler endif -else +else ifeq ($(HIP_COMPILER),0) #=== Configure the HIP compiler # If HIP_HOME is not set, try to set it from the location of GPUCC ifndef HIP_HOME HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null)) - #$(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif - - HIP_HOME=/opt/rocm-5.4.3 $(warning HIP_HOME was not set: using "$(HIP_HOME)") + endif # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) GPUCC = $(HIP_HOME)/bin/hipcc + + CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -DHIP_PLATFORM=amd # Should maybe find something equivelant to this in HIP #USE_NVTX ?=-DUSE_NVTX + + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ - #MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - CUINC = -I$(HIP_HOME)/include/ - CURANDLIBFLAGS = -L$(HIP_HOME)/lib64/ # NB: -lcuda is not needed here! - #CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + #CURANDLIBFLAGS = -L$(HIP_HOME)/lib64/ # NB: -lcuda is not needed here! 
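# (Aside: --offload-arch=gfx90a above pins the HIP build to an AMD MI200-class
# GPU. A hedged sketch for deriving the target from the local device instead,
# assuming ROCm's rocminfo tool is installed; the grep pattern is an
# assumption, not part of the patch:
#   HIPARCH ?= $(shell rocminfo 2>/dev/null | grep -m1 -o 'gfx[0-9a-f]*')
#   HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=$(HIPARCH)
# )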
+ # Not using CURAND in HIP + + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) else ifneq ($(origin REQUIRE_CUDA),undefined) # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) @@ -242,7 +240,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -308,6 +306,8 @@ endif ifeq ($(RNDGEN),) ifeq ($(GPUCC),) override RNDGEN = hasNoCurand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand endif @@ -381,13 +381,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -396,7 +396,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -405,7 +405,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -510,11 +510,11 @@ $(BUILDDIR)/.build.$(TAG): ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h 
../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) -c $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) -c $< -o $@ endif # -x cu in line above @@ -528,8 +528,15 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math + +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + override RNDGEN = hasNoCurand ifneq ($(GPUCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math + endif endif endif @@ -548,7 +555,7 @@ endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) ifneq ($(GPUCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +GPUFLAGS += -Xcompiler -Wno-deprecated-builtins endif endif @@ -557,7 +564,7 @@ endif ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 9bec8a58d4..6a1e473f55 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -3,11 +3,11 @@ // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. -#include "GpuAbstraction.h" - #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 +#include "GpuAbstraction.h" + // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 @@ -19,13 +19,9 @@ // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND // Added support for HIP compilation by defining MGONGPU_HAS_NO_CURAND #ifdef MGONGPUCPP_CUDACC -#warning CUDACC IS DEFINED! #undef MGONGPU_HAS_NO_CURAND #elif defined MGONGPUCPP_HIPCC -#warning HIPCC IS DEFINED! 
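// (Aside: the observable effect of MGONGPU_HAS_NO_CURAND is the generator
// selection in check_sa.cc earlier in this series, roughly:
//   #ifndef MGONGPU_HAS_NO_CURAND
//     RandomNumberMode rndgen = RandomNumberMode::CurandHost;   // CPU + curand
//   #else
//     RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // no curand
//   #endif
// so HIP builds, which force-define the macro just below, always fall back to
// the common C++ generator.)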
#define MGONGPU_HAS_NO_CURAND 1 -#define STRINGIFY(x) #x -#pragma message "MGONGPU_HAS_NO_CURAND =" STRINGIFY(MGONGPU_HAS_NO_CURAND) //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 #endif From a36fed573ae9c595a4c4147d8fe17c6918b9aa00 Mon Sep 17 00:00:00 2001 From: Jorgen Teig Date: Mon, 19 Jun 2023 14:33:24 +0200 Subject: [PATCH 301/509] Made alot of changes to the Makefile, no ROCm-capable device is detected problems --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 3a41d1eb51..4d91f5d22d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -92,7 +92,7 @@ endif CUDA_COMPILER := $(shell which nvcc > /dev/null 2>&1; echo $$?) HIP_COMPILER := $(shell which hipcc > /dev/null 2>&1; echo $$?) -ifeq ($(CUDA_COMPILER),0) +ifeq ($(CUDA_COMPILER),test) #=== Configure the CUDA compiler # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) @@ -126,7 +126,7 @@ ifeq ($(CUDA_COMPILER),0) CUINC = -I$(CUDA_HOME)/include/ CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -Xcompiler -fPIC + = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -Xcompiler -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h @@ -179,7 +179,7 @@ else ifeq ($(HIP_COMPILER),0) #CURANDLIBFLAGS = -L$(HIP_HOME)/lib64/ # NB: -lcuda is not needed here! # Not using CURAND in HIP - GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -fPIC + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -fPIC -DMGONGPU_HAS_NO_CURAND -DHIP_PLATFORM=amd ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h From 9272a85ee6431a1615173524027cc1802f0d59e0 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 19 Jun 2023 14:53:35 +0200 Subject: [PATCH 302/509] Revert to commit and accept incoming --- .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../ee_mumu.mad/SubProcesses/GpuAbstraction.h | 6 +- .../P1_epem_mupmum/.make_test.lock | 0 .../SubProcesses/P1_epem_mupmum/check_sa.cc | 4 +- .../ee_mumu.mad/SubProcesses/cudacpp.mk | 91 +++++++++---------- .../cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 8 +- 6 files changed, 63 insertions(+), 54 deletions(-) delete mode 100644 epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/.make_test.lock diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc index 82dbcbabc4..f5667bb588 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -10,6 +10,11 @@ #include +#ifdef MGONGPU_HAS_NO_CURAND +#warning MGONGPU_HAS_NO_CURAND is DEFINED! 
+#endif + +#ifdef MGONGPUCPP_CUDACC #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ #include "curand.h" #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } @@ -132,4 +137,5 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #endif -} \ No newline at end of file +} +#endif \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index f930874922..aa64587b6f 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -64,7 +64,7 @@ #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( hipMemcpyToSymbol(type1, type2, size) ) #define gpuFree(ptr) checkGpu( hipFree(ptr) ) - #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) ) + #define gpuFreeHost(ptr) checkGpu( hipFreeHost(ptr) ) #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize @@ -73,4 +73,8 @@ #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<>> (__VA_ARGS__) #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<>>(__VA_ARGS__) + //#define gpuLaunchKernel(kernel, blocks, threads, sharedMemSize, ...) hipLaunchKernelGGL(kernel, blocks, threads, __VA_ARGS__) + + //#define gpuLaunchKernelSharedMem(kernel, blocks, threads, ...) hipLaunchKernelGGL(kernel, blocks, threads, sharedMemSize, __VA_ARGS__) + #endif \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/.make_test.lock b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/.make_test.lock deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc index b8934356d5..28a2639219 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc @@ -106,8 +106,10 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_CUDACPP RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND + #warning Using CurandHost RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else + #warning Using CommonRandom RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) @@ -390,7 +392,7 @@ main( int argc, char** argv ) prnk.reset( new CommonRandomNumberKernel( hstRndmom ) ); } #ifndef MGONGPU_HAS_NO_CURAND - + #warning THIS SHOULD NOT TRIGGER!!! else if( rndgen == RandomNumberMode::CurandHost ) { const bool onDevice = false; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 4d91f5d22d..95d52e0c64 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
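# (Aside: the caveat on the OPTFLAGS line below exists because these flags are
# reused on the GPU compile line, where -Ofast/-ffast-math would silently
# change numerics. Fast-math is instead tightened per object further down in
# this same makefile via target-specific variables, e.g.
#   $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
# which restores strict floating point only for the cross-section kernels
# object.)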
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -90,13 +90,12 @@ endif #------------------------------------------------------------------------------- CUDA_COMPILER := $(shell which nvcc > /dev/null 2>&1; echo $$?) -HIP_COMPILER := $(shell which hipcc > /dev/null 2>&1; echo $$?) ifeq ($(CUDA_COMPILER),test) #=== Configure the CUDA compiler # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + # This is because it is impossible to pass this to "CUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled @@ -126,15 +125,15 @@ ifeq ($(CUDA_COMPILER),test) CUINC = -I$(CUDA_HOME)/include/ CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! CUOPTFLAGS = -lineinfo - = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -Xcompiler -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) else ifneq ($(origin REQUIRE_CUDA),undefined) # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) @@ -149,45 +148,48 @@ ifeq ($(CUDA_COMPILER),test) # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) - GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler + CUFLAGS += -allow-unsupported-compiler endif -else ifeq ($(HIP_COMPILER),0) +else #=== Configure the HIP compiler # If HIP_HOME is not set, try to set it from the location of GPUCC ifndef HIP_HOME HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") + #$(warning HIP_HOME was not set: using "$(HIP_HOME)") endif + HIP_HOME=/opt/rocm-5.4.3 + $(warning HIP_HOME was not set: using "$(HIP_HOME)") + # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) GPUCC = $(HIP_HOME)/bin/hipcc - - CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -DHIP_PLATFORM=amd # Should maybe find something equivelant to this in HIP #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - #CURANDLIBFLAGS = -L$(HIP_HOME)/lib64/ # NB: -lcuda is not needed here! - # Not using CURAND in HIP - GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -fPIC -DMGONGPU_HAS_NO_CURAND -DHIP_PLATFORM=amd - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + #MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + CUINC = -I$(HIP_HOME)/include/ + CURANDLIBFLAGS = -L$(HIP_HOME)/lib64/ # NB: -lcuda is not needed here! 
+ #CUOPTFLAGS = -lineinfo + CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) else ifneq ($(origin REQUIRE_CUDA),undefined) # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) @@ -240,7 +242,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) - GPUFLAGS+= -Xcompiler -mno-float128 + CUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -306,8 +308,6 @@ endif ifeq ($(RNDGEN),) ifeq ($(GPUCC),) override RNDGEN = hasNoCurand - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand endif @@ -381,13 +381,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -396,7 +396,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + CUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -405,7 +405,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + CUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -510,11 +510,11 @@ $(BUILDDIR)/.build.$(TAG): ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu 
*.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ endif # -x cu in line above @@ -528,15 +528,8 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math - -else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - override RNDGEN = hasNoCurand ifneq ($(GPUCC),) - ifeq ($(findstring nvcc,$(GPUCC)),nvcc) - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math - endif +$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math endif endif @@ -555,7 +548,7 @@ endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) ifneq ($(GPUCC),) -GPUFLAGS += -Xcompiler -Wno-deprecated-builtins +CUFLAGS += -Xcompiler -Wno-deprecated-builtins endif endif @@ -564,7 +557,7 @@ endif ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option +###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 6a1e473f55..9bec8a58d4 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -3,11 +3,11 @@ // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +#include "GpuAbstraction.h" + #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 -#include "GpuAbstraction.h" - // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 @@ -19,9 +19,13 @@ // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND // Added support for HIP compilation by defining MGONGPU_HAS_NO_CURAND #ifdef MGONGPUCPP_CUDACC +#warning CUDACC IS DEFINED! #undef MGONGPU_HAS_NO_CURAND #elif defined MGONGPUCPP_HIPCC +#warning HIPCC IS DEFINED! 
#define MGONGPU_HAS_NO_CURAND 1 +#define STRINGIFY(x) #x +#pragma message "MGONGPU_HAS_NO_CURAND =" STRINGIFY(MGONGPU_HAS_NO_CURAND) //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 #endif From 9a6cf66c14515d101a1d4e0022ce1bd4bf360ba6 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 19 Jun 2023 15:05:31 +0200 Subject: [PATCH 303/509] Added some QOL improvements to make script for correct output --- .../ee_mumu.mad/SubProcesses/cudacpp.mk | 76 +++++++++---------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 95d52e0c64..bc6576a850 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -90,12 +90,13 @@ endif #------------------------------------------------------------------------------- CUDA_COMPILER := $(shell which nvcc > /dev/null 2>&1; echo $$?) +HIP_COMPILER := $(shell which hipcc > /dev/null 2>&1; echo $$?) ifeq ($(CUDA_COMPILER),test) #=== Configure the CUDA compiler # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "CUFLAGS += -ccbin " below + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled @@ -125,15 +126,15 @@ ifeq ($(CUDA_COMPILER),test) CUINC = -I$(CUDA_HOME)/include/ CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
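# (Aside: -lineinfo on the next line embeds source-line tables so device
# profilers can attribute time back to source without a full -G debug build.
# A hedged usage sketch, where the binary name and the grid/thread/iteration
# arguments are illustrative only:
#   ncu --set detailed ./gcheck.exe 16384 32 12
# )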
CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) else ifneq ($(origin REQUIRE_CUDA),undefined) # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) @@ -148,14 +149,14 @@ ifeq ($(CUDA_COMPILER),test) # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) - CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - CUFLAGS += -allow-unsupported-compiler + GPUFLAGS += -allow-unsupported-compiler endif -else +else ifeq ($(HIP_COMPILER),0) #=== Configure the HIP compiler # If HIP_HOME is not set, try to set it from the location of GPUCC @@ -173,23 +174,20 @@ else # Should maybe find something equivelant to this in HIP #USE_NVTX ?=-DUSE_NVTX - #MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - CUINC = -I$(HIP_HOME)/include/ - CURANDLIBFLAGS = -L$(HIP_HOME)/lib64/ # NB: -lcuda is not needed here! 
- #CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -DHIP_PLATFORM=amd + + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) else ifneq ($(origin REQUIRE_CUDA),undefined) # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) @@ -242,7 +240,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -308,6 +306,8 @@ endif ifeq ($(RNDGEN),) ifeq ($(GPUCC),) override RNDGEN = hasNoCurand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand endif @@ -381,13 +381,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -396,7 +396,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -405,7 +405,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -510,11 +510,11 @@ $(BUILDDIR)/.build.$(TAG): ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) -Xcompiler -fPIC -c $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) -Xcompiler -fPIC -c $< -o $@ endif # -x cu in line above @@ -529,7 +529,7 @@ ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math ifneq ($(GPUCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math endif endif @@ -548,7 +548,7 @@ endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) ifneq ($(GPUCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +GPUFLAGS += -Xcompiler -Wno-deprecated-builtins endif endif @@ -557,7 +557,7 @@ endif ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif From ae7d68d4c258ba2178072e8fd6292a9b08b878d0 Mon Sep 17 00:00:00 2001 From: Jooorgen Date: Mon, 19 Jun 2023 15:28:54 +0200 Subject: [PATCH 304/509] Removed #warnings --- .../ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc | 4 ---- .../ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc | 3 --- epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 4 ---- 3 files changed, 11 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc index f5667bb588..a420fde1df 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -10,10 +10,6 @@ #include -#ifdef MGONGPU_HAS_NO_CURAND -#warning MGONGPU_HAS_NO_CURAND is DEFINED! -#endif - #ifdef MGONGPUCPP_CUDACC #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ #include "curand.h" diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc index 28a2639219..f339ad96a6 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc @@ -106,10 +106,8 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_CUDACPP RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND - #warning Using CurandHost RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - #warning Using CommonRandom RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) @@ -392,7 +390,6 @@ main( int argc, char** argv ) prnk.reset( new CommonRandomNumberKernel( hstRndmom ) ); } #ifndef MGONGPU_HAS_NO_CURAND - #warning THIS SHOULD NOT TRIGGER!!! else if( rndgen == RandomNumberMode::CurandHost ) { const bool onDevice = false; diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 9bec8a58d4..c18e7c3dfa 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -19,13 +19,9 @@ // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND // Added support for HIP compilation by defining MGONGPU_HAS_NO_CURAND #ifdef MGONGPUCPP_CUDACC -#warning CUDACC IS DEFINED! #undef MGONGPU_HAS_NO_CURAND #elif defined MGONGPUCPP_HIPCC -#warning HIPCC IS DEFINED! 
#define MGONGPU_HAS_NO_CURAND 1 -#define STRINGIFY(x) #x -#pragma message "MGONGPU_HAS_NO_CURAND =" STRINGIFY(MGONGPU_HAS_NO_CURAND) //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 #endif From e185405992fa06800ea51f337613ddc53bb4ed5b Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Tue, 20 Jun 2023 19:15:06 +0200 Subject: [PATCH 305/509] Added include guards for the gpu abstraction header --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index aa64587b6f..7d77db62dd 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -1,4 +1,5 @@ -// GpuAbstraction.h +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 #include @@ -77,4 +78,6 @@ //#define gpuLaunchKernelSharedMem(kernel, blocks, threads, ...) hipLaunchKernelGGL(kernel, blocks, threads, sharedMemSize, __VA_ARGS__) -#endif \ No newline at end of file +#endif + +#endif // MG5AMC_GPUABSTRACTION_H \ No newline at end of file From 5431d84c33b58dff6e39b88cca56530a36518c09 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 21 Jun 2023 16:20:54 +0200 Subject: [PATCH 306/509] Added some redundant features in Makefile and added ToDo list in GpuAbstraction header --- .../cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 11 +++++++---- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 9 ++++----- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 7d77db62dd..1c7cb333a2 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -3,6 +3,13 @@ #include +/* + ToDo: + * Fix rpath in makefile when compiling with HIP + * Fix warnings with improper hip function return code handling +*/ + + #ifdef __CUDACC__ #define MGONGPUCPP_CUDACC 1 #endif @@ -74,10 +81,6 @@ #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<>> (__VA_ARGS__) #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<>>(__VA_ARGS__) - //#define gpuLaunchKernel(kernel, blocks, threads, sharedMemSize, ...) hipLaunchKernelGGL(kernel, blocks, threads, __VA_ARGS__) - - //#define gpuLaunchKernelSharedMem(kernel, blocks, threads, ...) 
hipLaunchKernelGGL(kernel, blocks, threads, sharedMemSize, __VA_ARGS__) - #endif #endif // MG5AMC_GPUABSTRACTION_H \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index bc6576a850..4483176b5d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -171,15 +171,14 @@ else ifeq ($(HIP_COMPILER),0) # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) GPUCC = $(HIP_HOME)/bin/hipcc + # Should maybe find something equivelant to this in HIP #USE_NVTX ?=-DUSE_NVTX - CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -DHIP_PLATFORM=amd - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a HIPINC = -I$(HIP_HOME)/include/ - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -use_fast_math + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -use_fast_math -DHIP_PLATFORM=amd ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h @@ -192,8 +191,8 @@ else ifeq ($(HIP_COMPILER),0) # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + # No hip. Switch hip compilation off and go to common random numbers in C++ + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) override GPUCC= override USE_NVTX= override CUINC= From 2e17a0d4245d2abf41d6bbbe3847ffff61d07a20 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 21 Jun 2023 17:16:55 +0200 Subject: [PATCH 307/509] Testing adding stuff to CXXFLAGS --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 2 ++ 1 file changed, 2 insertions(+) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 4483176b5d..599d7d1c22 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -177,6 +177,8 @@ else ifeq ($(HIP_COMPILER),0) HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a HIPINC = -I$(HIP_HOME)/include/ + + CXXFLAGS += -DHIP_PLATFORM=amd GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -use_fast_math -DHIP_PLATFORM=amd ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow From cd90274571fec166884248c98c4db7856283ca5f Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 21 Jun 2023 17:43:02 +0200 Subject: [PATCH 308/509] Fixed warnings in GPU abstraction header and removed unused compiler flags in HIP --- .../cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 4 ++-- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 11 +++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 1c7cb333a2..b782ffbe79 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -72,11 +72,11 @@ #define 
gpuMemcpyToSymbol(type1, type2, size) checkGpu( hipMemcpyToSymbol(type1, type2, size) ) #define gpuFree(ptr) checkGpu( hipFree(ptr) ) - #define gpuFreeHost(ptr) checkGpu( hipFreeHost(ptr) ) + #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) ) #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize - #define gpuDeviceReset hipDeviceReset + #define gpuDeviceReset checkGpu( hipDeviceReset ) #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<>> (__VA_ARGS__) #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<>>(__VA_ARGS__) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 599d7d1c22..16d72b273d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -126,7 +126,7 @@ ifeq ($(CUDA_COMPILER),test) CUINC = -I$(CUDA_HOME)/include/ CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math -Xcompiler -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h @@ -178,9 +178,8 @@ else ifeq ($(HIP_COMPILER),0) HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a HIPINC = -I$(HIP_HOME)/include/ - CXXFLAGS += -DHIP_PLATFORM=amd - - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -use_fast_math -DHIP_PLATFORM=amd + # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h @@ -530,7 +529,7 @@ ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math ifneq ($(GPUCC),) -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -549,7 +548,7 @@ endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) ifneq ($(GPUCC),) -GPUFLAGS += -Xcompiler -Wno-deprecated-builtins +GPUFLAGS += -Wno-deprecated-builtins endif endif From 5dc09d070e13aeb147980b8ed1e0a2d9a9811d9a Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 21 Jun 2023 17:46:57 +0200 Subject: [PATCH 309/509] Fixed syntax error for errorhandling in GpuAbstraction.h --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index b782ffbe79..525bae86e5 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -76,7 +76,7 @@ #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize - #define gpuDeviceReset checkGpu( hipDeviceReset ) + #define gpuDeviceReset checkGpu( hipDeviceReset() ) #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<>> (__VA_ARGS__) #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<>>(__VA_ARGS__) From f196de6962769faee6de335495325b65690b978b Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 21 Jun 2023 17:50:09 +0200 Subject: [PATCH 310/509] Did some refactoring to keep the abstraction consistent --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 8 ++++---- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 525bae86e5..3295608f15 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -42,9 +42,9 @@ #define gpuFree(ptr) checkGpu( cudaFree(ptr) ) #define gpuFreeHost(ptr) checkGpu( cudaFreeHost(ptr) ) - #define gpuSetDevice cudaSetDevice + #define gpuSetDevice checkGpu( cudaSetDevice ) #define gpuDeviceSynchronize cudaDeviceSynchronize - #define gpuDeviceReset cudaDeviceReset + #define gpuDeviceReset checkGpu( cudaDeviceReset ) #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<>> (__VA_ARGS__) #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<>>(__VA_ARGS__) @@ -74,9 +74,9 @@ #define gpuFree(ptr) checkGpu( hipFree(ptr) ) #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) ) - #define gpuSetDevice hipSetDevice + #define gpuSetDevice checkGpu( hipSetDevice ) #define gpuDeviceSynchronize hipDeviceSynchronize - #define gpuDeviceReset checkGpu( hipDeviceReset() ) + #define gpuDeviceReset checkGpu( hipDeviceReset ) #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<>> (__VA_ARGS__) #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) 
kernel<<>>(__VA_ARGS__) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h index a53a2c2d8d..9ddd0ec703 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h @@ -59,7 +59,7 @@ namespace mg5amcGpu // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; - checkGpu( gpuSetDevice( 0 ) ); // SLOW! + gpuSetDevice( 0 ); // SLOW! } // Tear down CUDA application (call cudaDeviceReset) @@ -69,7 +69,7 @@ namespace mg5amcGpu static void tearDown( const bool debug = true ) { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; - checkGpu( gpuDeviceReset() ); + gpuDeviceReset(); } }; } From 199860b7ca013c02b891409405b714cb89e314f6 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 21 Jun 2023 17:52:18 +0200 Subject: [PATCH 311/509] Fixed some syntax errors --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 3295608f15..e0240b3ec5 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -42,9 +42,9 @@ #define gpuFree(ptr) checkGpu( cudaFree(ptr) ) #define gpuFreeHost(ptr) checkGpu( cudaFreeHost(ptr) ) - #define gpuSetDevice checkGpu( cudaSetDevice ) + #define gpuSetDevice checkGpu( cudaSetDevice( int ) ) #define gpuDeviceSynchronize cudaDeviceSynchronize - #define gpuDeviceReset checkGpu( cudaDeviceReset ) + #define gpuDeviceReset checkGpu( cudaDeviceReset() ) #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<>> (__VA_ARGS__) #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<>>(__VA_ARGS__) @@ -74,9 +74,9 @@ #define gpuFree(ptr) checkGpu( hipFree(ptr) ) #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) ) - #define gpuSetDevice checkGpu( hipSetDevice ) + #define gpuSetDevice checkGpu( hipSetDevice( int ) ) #define gpuDeviceSynchronize hipDeviceSynchronize - #define gpuDeviceReset checkGpu( hipDeviceReset ) + #define gpuDeviceReset checkGpu( hipDeviceReset() ) #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<>> (__VA_ARGS__) #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) 
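The "checkGpu( cudaSetDevice( int ) )" spelling above puts a type name where an argument belongs, which is not valid C++ either. The arrangement that the next patch reverts to keeps the alias bare and applies the check at the call site, where real arguments exist; a minimal sketch of that working pattern (stubbed API, illustrative names):

    #include <cassert>

    typedef int gpuError_t;
    const gpuError_t gpuSuccess = 0;
    gpuError_t cudaSetDevice( int /*dev*/ ) { return gpuSuccess; } // illustrative stub

    #define gpuSetDevice cudaSetDevice // bare alias: usable with any argument list
    #define checkGpu( code ) assert( ( code ) == gpuSuccess )

    int main()
    {
      checkGpu( gpuSetDevice( 0 ) ); // expands to assert( ( cudaSetDevice( 0 ) ) == gpuSuccess )
      return 0;
    }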
From a74b1aa805050d5fb9bfdb0a39855ee75544e9a6 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 21 Jun 2023 17:56:28 +0200 Subject: [PATCH 312/509] Reverted some changes to working state with warning --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 8 ++++---- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index e0240b3ec5..782cb96e8c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -42,9 +42,9 @@ #define gpuFree(ptr) checkGpu( cudaFree(ptr) ) #define gpuFreeHost(ptr) checkGpu( cudaFreeHost(ptr) ) - #define gpuSetDevice checkGpu( cudaSetDevice( int ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize - #define gpuDeviceReset checkGpu( cudaDeviceReset() ) + #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__) #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__) @@ -74,9 +74,9 @@ #define gpuFree(ptr) checkGpu( hipFree(ptr) ) #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) ) - #define gpuSetDevice checkGpu( hipSetDevice( int ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize - #define gpuDeviceReset checkGpu( hipDeviceReset() ) + #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__) #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h index 9ddd0ec703..caa301ef24 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h @@ -59,7 +59,7 @@ namespace mg5amcGpu // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; - gpuSetDevice( 0 ); // SLOW! + checkGpu ( gpuSetDevice( 0 ) ); // SLOW! } // Tear down CUDA application (call cudaDeviceReset) @@ -69,7 +69,7 @@ namespace mg5amcGpu static void tearDown( const bool debug = true ) { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; - gpuDeviceReset(); + checkGpu( gpuDeviceReset() ); } }; }
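For context on the GpuRuntime.h hunks above: the surrounding class (reproduced in full for gg_ttgg near the end of this series) calls setUp from its constructor and tearDown from its destructor, so device setup and reset are tied to the lifetime of one object in main. A stripped-down RAII sketch of that pattern (class name and messages are illustrative):

    #include <iostream>

    struct GpuRuntimeSketch
    {
      GpuRuntimeSketch() { std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; }   // setUp
      ~GpuRuntimeSketch() { std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; } // tearDown
    };

    int main()
    {
      GpuRuntimeSketch runtime; // instantiate at the beginning of main
      // ... GPU work happens while the runtime object is alive ...
      return 0;                 // the destructor runs last and books the device reset
    }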
From f29ee2be03124036dbbe050b16135ed4e22fdc75 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 21 Jun 2023 17:57:45 +0200 Subject: [PATCH 313/509] Remove redundant options for compilation that are injected elsewhere --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 16d72b273d..e6812d7395 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -510,11 +510,11 @@ $(BUILDDIR)/.build.$(TAG): ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) -c $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) -c $< -o $@ endif # -x cu in line above From 33f69d88998f5f8f49f07b657bb9e810fadf992b Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 21 Jun 2023 18:20:48 +0200 Subject: [PATCH 314/509] Remove hardcoded variable --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 1 - 1 file changed, 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index e6812d7395..79ff88a17a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -165,7 +165,6 @@ else ifeq ($(HIP_COMPILER),0) #$(warning HIP_HOME was not set: using "$(HIP_HOME)") endif - HIP_HOME=/opt/rocm-5.4.3 $(warning HIP_HOME was not set: using "$(HIP_HOME)") # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists From 8a1e5fcb3f532d9a81332763c796081fac67e785 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 21 Jun 2023 18:40:31 +0200 Subject: [PATCH 315/509] Removed more cuda specific code and added comments --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 79ff88a17a..96a6e8d15d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -177,19 +177,19 @@ else ifeq ($(HIP_COMPILER),0) HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a HIPINC = -I$(HIP_HOME)/include/ - # -DHIP_FAST_MATH equivalent to -use_fast_math in HIP + # -DHIP_FAST_MATH equivalent to -use_fast_math in HIP + # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + GPUFLAGS += -std=c++17 # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else ifneq ($(origin REQUIRE_HIP),undefined) + # If REQUIRE_HIP is set but no hip is found, stop here (e.g.
for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) else # No hip. Switch hip compilation off and go to common random numbers in C++ $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) From 3d7892cbe6934f174a3905d153cfa53f192cc2f3 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 21 Jun 2023 18:55:46 +0200 Subject: [PATCH 316/509] Fix for including cuda in test compilation when compiling in HIP --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 96a6e8d15d..32946ff490 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -734,7 +734,11 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) + else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + endif endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 From 5c8ed771d5fe3cc685bc0c3518cb0a00679bd493 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 26 Jun 2023 10:19:22 +0200 Subject: [PATCH 317/509] Removed testing variable and added better detection of compiler paths --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 32946ff490..d88dd3f5d9 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -89,10 +89,10 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER := $(shell which nvcc > /dev/null 2>&1; echo $$?) -HIP_COMPILER := $(shell which hipcc > /dev/null 2>&1; echo $$?) 
+CUDA_COMPILER := $(shell compiler=`which nvcc` && while [[ -L $file ]]; do file=$(readlink "$file"); done && echo "$file") +HIP_COMPILER := $(shell compiler=`which hipcc` && while [[ -L $file ]]; do file=$(readlink "$file"); done && echo "$file") -ifeq ($(CUDA_COMPILER),test) +ifeq ($(CUDA_COMPILER),0) #=== Configure the CUDA compiler # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) From c678f3228577627b8aa1318efd2959f2a7e1a605 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 26 Jun 2023 10:32:01 +0200 Subject: [PATCH 318/509] Fixed Makefile syntax so compiler detection works --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index d88dd3f5d9..914065f1b8 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -89,8 +89,8 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER := $(shell compiler=`which nvcc` && while [[ -L $file ]]; do file=$(readlink "$file"); done && echo "$file") -HIP_COMPILER := $(shell compiler=`which hipcc` && while [[ -L $file ]]; do file=$(readlink "$file"); done && echo "$file") +CUDA_COMPILER := $(shell compiler="`which nvcc`" && while [ -L "$$compiler" ]; do file=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER := $(shell compiler="`which hipcc`" && while [ -L "$$compiler" ]; do file=`readlink "$$compiler"`; done && echo "$$compiler") ifeq ($(CUDA_COMPILER),0) #=== Configure the CUDA compiler From b6fb06197b311a768b3bde14820e0eb1a910fa2f Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 26 Jun 2023 10:36:36 +0200 Subject: [PATCH 319/509] Fixed typo --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 914065f1b8..d729da1e2e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -89,8 +89,8 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER := $(shell compiler="`which nvcc`" && while [ -L "$$compiler" ]; do file=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER := $(shell compiler="`which hipcc`" && while [ -L "$$compiler" ]; do file=`readlink "$$compiler"`; done && echo "$$compiler") +CUDA_COMPILER := $(shell compiler="`which nvcc`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER := $(shell compiler="`which hipcc`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") ifeq ($(CUDA_COMPILER),0) #=== Configure the CUDA compiler From a6ef4c59476f9873c21f6bb0991eeeb7f4cc787c Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 26 Jun 2023 11:34:14 +0200 Subject: [PATCH 320/509] Did minor changes to makefile and added flags that were missing in HIP compilation --- .../ee_mumu.mad/SubProcesses/cudacpp.mk | 34 +++++++++++++------ 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index d729da1e2e..d025ba8a6b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ 
b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -89,10 +89,10 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER := $(shell compiler="`which nvcc`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER := $(shell compiler="`which hipcc`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -ifeq ($(CUDA_COMPILER),0) +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) #=== Configure the CUDA compiler # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) @@ -102,7 +102,7 @@ ifeq ($(CUDA_COMPILER),0) override CUDA_HOME=disabled endif - # If CUDA_HOME is not set, try to set it from the location of GPUCC + # If CUDA_HOME is not set, try to set it from the location of NVCC ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") @@ -156,16 +156,21 @@ ifeq ($(CUDA_COMPILER),0) GPUFLAGS += -allow-unsupported-compiler endif -else ifeq ($(HIP_COMPILER),0) - #=== Configure the HIP compiler +else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) + #=== Configure the HIP compiler - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null)) - #$(warning HIP_HOME was not set: using "$(HIP_HOME)") + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled endif + # If HIP_HOME is not set, try to set it from the location of GPUCC + ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") + endif # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) @@ -199,6 +204,15 @@ else ifeq ($(HIP_COMPILER),0) override CURANDLIBFLAGS= endif + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + endif From bdfc44b23cbbb030bab8aa743e218468b8f2de92 Mon Sep 17 00:00:00 2001 From: Jorgen Date: Mon, 26 Jun 2023 11:57:58 +0200 Subject: [PATCH 321/509] Removed unused option in HIP compilation --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 4 ---- 1 file changed, 4 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index d025ba8a6b..d728fc8bfb 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -204,10 +204,6 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) override CURANDLIBFLAGS= endif - # Set the host C++ compiler for GPUCC via "-ccbin " - # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) - GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) GPUFLAGS += -allow-unsupported-compiler From 2ffbf6b8ef437da461a6519c5bc137166c7922a0 Mon Sep 17 00:00:00 2001 From: Jorgen Date: Mon, 26 Jun 2023 12:16:25 +0200 Subject: [PATCH 322/509] Readding -rpath --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index d728fc8bfb..fcc6fc46a5 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -467,12 +467,10 @@ ifeq ($(UNAME_S),Darwin) else # RPATH to cuda/cpp libs when linking executables override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = - # -Xlinker -rpath,$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = - #-Xlinker -rpath,'$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) From cd3ced91e974e06614661621a84951681d845aa6 Mon Sep 17 00:00:00 2001 From: Jorgen Date: Mon, 26 Jun 2023 13:50:47 +0200 Subject: [PATCH 323/509] Changed syntax in Makefile --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index fcc6fc46a5..fea74dcb08 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -467,10 +467,10 @@ ifeq ($(UNAME_S),Darwin) else # RPATH to cuda/cpp libs when linking executables override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = 
-Xlinker -rpath,'$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) From 2c1571413ae6d7d5f27276a655924fdf2b39f738 Mon Sep 17 00:00:00 2001 From: Jorgen Date: Mon, 26 Jun 2023 15:39:56 +0200 Subject: [PATCH 324/509] Added some fixes to makefile --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index fea74dcb08..51d5b06466 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -89,8 +89,8 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) #=== Configure the CUDA compiler From 8db62bae71aa2064e39ff7a66e417952baf33a0b Mon Sep 17 00:00:00 2001 From: Jorgen Date: Mon, 26 Jun 2023 15:49:46 +0200 Subject: [PATCH 325/509] Reworked linking to cuda in tests --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 51d5b06466..3c07afe023 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -735,6 +735,10 @@ $(testmain): LIBFLAGS += -lgomp endif endif +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + LIBFLAGS += -lcuda +endif + ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) @@ -742,11 +746,7 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - ifeq ($(findstring hipcc,$(GPUCC)),hipcc) $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) - else - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda - endif endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 From 6035cc97d8c6655fceb01102f8d8a4942467e5c8 Mon Sep 17 00:00:00 2001 From: Jorgen Date: Mon, 26 Jun 2023 16:20:37 +0200 Subject: [PATCH 326/509] Added some fixes for CUDA compilation --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 8 ++++++-- 1 file changed, 6 
insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 3c07afe023..41c76b9221 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -315,6 +315,7 @@ endif ifeq ($(RNDGEN),) ifeq ($(GPUCC),) override RNDGEN = hasNoCurand + # Edge case for HIP compilation else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) @@ -532,11 +533,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) +# Added edge case for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(GPUCC),) -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif
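Background for the -fno-fast-math exception above (see #117): under fast-math the compiler may assume NaNs and infinities never occur and fold away exactly the checks that flag abnormal matrix elements. A small host-side illustration; compile it once with and once without -ffast-math on a gcc/clang-style compiler (file name and values are arbitrary):

    #include <cmath>
    #include <cstdio>

    int main()
    {
      volatile double zero = 0.;   // volatile so the division is not folded at compile time
      const double me = 0. / zero; // an "abnormal" matrix element: NaN
      if( std::isnan( me ) )
        printf( "abnormal ME detected\n" ); // under -ffast-math this branch may never fire
      else
        printf( "ME looks normal\n" );
      return 0;
    }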
From 708340aa756fdf69f7927b0aee90201fb7b7a5bf Mon Sep 17 00:00:00 2001 From: Jorgen Date: Mon, 26 Jun 2023 18:07:49 +0200 Subject: [PATCH 327/509] Fixed some missing flags in compilation --- .../cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 41c76b9221..3d13d7bea1 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -126,7 +126,7 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) CUINC = -I$(CUDA_HOME)/include/ CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math -Xcompiler -fPIC + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h @@ -135,6 +135,11 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x -cu + else ifneq ($(origin REQUIRE_CUDA),undefined) # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) @@ -192,6 +197,11 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -fPIC -c + + CCBUILDRULEFLAGS = -fPIC -c -x -cu + else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) @@ -518,11 +528,11 @@ $(BUILDDIR)/.build.$(TAG): ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif # -x cu in line above From d763b13d0477d4a2de8459cf19f7d118024260ec Mon Sep 17 00:00:00 2001 From: Jorgen Date: Mon, 26 Jun 2023 18:09:45 +0200 Subject: [PATCH 328/509] Fixed typo --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 3d13d7bea1..c0ecbf0fd2 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -138,7 +138,7 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x -cu + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu else ifneq ($(origin REQUIRE_CUDA),undefined) # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) @@ -200,7 +200,7 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c -x -cu + CCBUILDRULEFLAGS = -fPIC -c -x cu else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443)
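A note on the "-x cu" flag fixed above: the %_cu.o rule feeds .cc sources to the GPU compiler, so nvcc needs "-x cu" to treat them as CUDA, while hipcc infers the language itself. The same translation unit is thus built twice, and it selects its CPU or GPU personality through the preprocessor guard used throughout these headers. A reduced sketch of that single-source, two-object scheme (the guard and namespace names match the ones used elsewhere in this series; the function is illustrative):

    #include <cstdio>

    #ifdef MGONGPUCPP_GPUIMPL
    namespace mg5amcGpu
    #else
    namespace mg5amcCpu
    #endif
    {
      inline const char* backend()
      {
    #ifdef MGONGPUCPP_GPUIMPL
        return "GPU object (built by GPUCC, e.g. with -x cu)";
    #else
        return "CPU object (built by CXX)";
    #endif
      }
    }

    int main()
    {
    #ifdef MGONGPUCPP_GPUIMPL
      printf( "%s\n", mg5amcGpu::backend() );
    #else
      printf( "%s\n", mg5amcCpu::backend() );
    #endif
      return 0;
    }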
From f46c8d505e9f8355cb28a9835964e5f3e14adad2 Mon Sep 17 00:00:00 2001 From: Jorgen Date: Mon, 26 Jun 2023 18:27:16 +0200 Subject: [PATCH 329/509] Removed CUDA specific instructions in HIP compilation --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index c0ecbf0fd2..1df66a8743 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -200,7 +200,7 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c -x cu + CCBUILDRULEFLAGS = -fPIC -c else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443) From 3aef7d7aff4ffa78309df40e6a4d9feae360dab7 Mon Sep 17 00:00:00 2001 From: Jorgen Date: Mon, 26 Jun 2023 18:53:07 +0200 Subject: [PATCH 330/509] Removed unnecessary flag in wrong spot in compilation --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 1df66a8743..1b00bca28b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -112,7 +112,7 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-GPUCC/index.html + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). # Embed device code for 70, and PTX for 70+. @@ -137,9 +137,10 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + else ifneq ($(origin REQUIRE_CUDA),undefined) # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) @@ -199,7 +200,6 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c else ifneq ($(origin REQUIRE_HIP),undefined) @@ -749,10 +749,6 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(findstring nvcc,$(GPUCC)),nvcc) - LIBFLAGS += -lcuda -endif - ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) @@ -760,7 +756,7 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215
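On the CUDATESTFLAGS change above: -lcuda pulls in the CUDA driver library (libcuda.so), which is distinct from the runtime library that nvcc links implicitly, so it only makes sense for nvcc builds and was breaking hipcc links. A minimal sketch of a dependency that needs it, assuming the requirement is on a driver-API symbol such as cuInit (the real reason in the test code may differ):

    #include <cuda.h>
    #include <cstdio>

    int main()
    {
      const CUresult res = cuInit( 0 ); // resolved from libcuda.so, hence the -lcuda
      printf( "cuInit returned %d\n", (int)res );
      return 0;
    }

Built as "nvcc driver_check.cu -lcuda" this links; without -lcuda it fails with an undefined reference to cuInit.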
From d09bec0107b1f0e4bd0fef9f8c3ef9d9c5fa3248 Mon Sep 17 00:00:00 2001 From: Jorgen Date: Mon, 26 Jun 2023 18:56:23 +0200 Subject: [PATCH 331/509] Reverted some changes for testing --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 1b00bca28b..1f691504a6 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -478,10 +478,10 @@ ifeq ($(UNAME_S),Darwin) else # RPATH to cuda/cpp libs when linking executables override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) From cdff55d3cb83aa2e09c283cf9fbefee3901fb116 Mon Sep 17 00:00:00 2001 From: Jorgen Date: Tue, 27 Jun 2023 14:23:06 +0200 Subject: [PATCH 332/509] Removed CUDACC ifdef in Curand*.cc --- .../ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc index a420fde1df..82dbcbabc4 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -10,7 +10,6 @@ #include -#ifdef MGONGPUCPP_CUDACC #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ #include "curand.h" #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } @@ -133,5 +132,4 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #endif -} -#endif \ No newline at end of file +} \ No newline at end of file
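On patch 332 above: the curand host API is plain C, so given curand.h and -lcurand it can be called from any host compiler, which is why the file no longer needs a CUDA-compiler guard. A standalone sketch using the standard host generator (seed, generator type and sizes here are arbitrary):

    #include "curand.h"
    #include <cstdio>

    int main()
    {
      curandGenerator_t gen;
      curandCreateGeneratorHost( &gen, CURAND_RNG_PSEUDO_MRG32K3A ); // host-side generator
      curandSetPseudoRandomGeneratorSeed( gen, 20211220ULL );
      double rnd[8];
      curandGenerateUniformDouble( gen, rnd, 8 ); // fills host memory directly
      printf( "first random: %f\n", rnd[0] );
      curandDestroyGenerator( gen );
      return 0;
    }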
From 6fa25abd6373828f2600d540279ccfc75f8230d6 Mon Sep 17 00:00:00 2001 From: Jorgen Date: Tue, 27 Jun 2023 14:27:04 +0200 Subject: [PATCH 333/509] "-rpath,$variable" is not supported by LLVM's linker; using "=" instead --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 1f691504a6..ff63aa4a43 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -477,11 +477,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) From 8fbc97f0b5c330635ad043ff34678f08be9c785f Mon Sep 17 00:00:00 2001 From: Jorgen Date: Tue, 27 Jun 2023 14:32:28 +0200 Subject: [PATCH 334/509] Added error checks in gpu functions --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 782cb96e8c..391a5ea4ac 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -59,8 +59,8 @@ //-------------------------------------------------------------------------- #define gpuError_t hipError_t - #define gpuPeekAtLastError hipPeekAtLastError - #define gpuGetErrorString hipGetErrorString + #define gpuPeekAtLastError checkGpu( hipPeekAtLastError ) + #define gpuGetErrorString checkGpu( hipGetErrorString ) #define gpuSuccess hipSuccess #define gpuMallocHost(ptr, size) checkGpu( hipHostMalloc(ptr, size) ) // HostMalloc better @@ -75,8 +75,8 @@ #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) ) #define gpuSetDevice hipSetDevice - #define gpuDeviceSynchronize hipDeviceSynchronize - #define gpuDeviceReset hipDeviceReset + #define gpuDeviceSynchronize checkGpu( hipDeviceSynchronize ) + #define gpuDeviceReset checkGpu(hipDeviceReset) #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__) #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__) From b2d6aa91367c28e1ea014bda1ec9324c8a12a52a Mon Sep 17 00:00:00 2001 From: Jorgen Date: Tue, 27 Jun 2023 14:36:58 +0200 Subject: [PATCH 335/509] Testing syntax changes --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 391a5ea4ac..be8c4b15bc 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -59,8 +59,8 @@ //-------------------------------------------------------------------------- #define gpuError_t hipError_t - #define gpuPeekAtLastError checkGpu( hipPeekAtLastError ) - #define gpuGetErrorString checkGpu( hipGetErrorString ) + #define gpuPeekAtLastError checkGpu( hipPeekAtLastError() ) + #define gpuGetErrorString checkGpu( hipGetErrorString() ) #define gpuSuccess hipSuccess #define gpuMallocHost(ptr, size) checkGpu( hipHostMalloc(ptr, size) ) // HostMalloc better @@ -75,8 +75,8 @@ #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) ) #define gpuSetDevice hipSetDevice - #define gpuDeviceSynchronize checkGpu( hipDeviceSynchronize ) - #define gpuDeviceReset checkGpu(hipDeviceReset) + #define gpuDeviceSynchronize checkGpu( hipDeviceSynchronize() ) + #define gpuDeviceReset checkGpu( hipDeviceReset() ) #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__) #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__)
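The wrapping attempted in the two patches above runs into a type constraint that the following patches circle around: checkGpu() can only wrap an expression that yields a hipError_t, and hipGetErrorString() returns a string (while gpuError_t aliases a type, not a call), so only the error-returning entries can meaningfully be wrapped. A stub sketch of the constraint (illustrative signatures):

    #include <cassert>

    typedef int gpuError_t;
    const gpuError_t gpuSuccess = 0;
    gpuError_t hipPeekAtLastError() { return gpuSuccess; }       // illustrative stub
    const char* hipGetErrorString( gpuError_t ) { return "ok"; } // illustrative stub

    inline void assertGpu( gpuError_t code ) { assert( code == gpuSuccess ); }
    #define checkGpu( code ) assertGpu( code )

    int main()
    {
      checkGpu( hipPeekAtLastError() );              // ok: the call returns an error code
      //checkGpu( hipGetErrorString( gpuSuccess ) ); // would not compile: const char* is not a gpuError_t
      const char* msg = hipGetErrorString( gpuSuccess ); // the string is a value to print, not to check
      (void)msg;
      return 0;
    }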
From f7508be23d02fbaa1e1e4f8f31ac637812f2bb72 Mon Sep 17 00:00:00 2001 From: Jorgen Date: Tue, 27 Jun 2023 14:41:51 +0200 Subject: [PATCH 336/509] Fixed syntax --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index be8c4b15bc..c18729d30e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -59,8 +59,8 @@ //-------------------------------------------------------------------------- #define gpuError_t hipError_t - #define gpuPeekAtLastError checkGpu( hipPeekAtLastError() ) - #define gpuGetErrorString checkGpu( hipGetErrorString() ) + #define gpuPeekAtLastError() checkGpu( hipPeekAtLastError() ) + #define gpuGetErrorString() checkGpu( hipGetErrorString() ) #define gpuSuccess hipSuccess #define gpuMallocHost(ptr, size) checkGpu( hipHostMalloc(ptr, size) ) // HostMalloc better @@ -75,8 +75,8 @@ #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) ) #define gpuSetDevice hipSetDevice - #define gpuDeviceSynchronize checkGpu( hipDeviceSynchronize() ) - #define gpuDeviceReset checkGpu( hipDeviceReset() ) + #define gpuDeviceSynchronize() checkGpu( hipDeviceSynchronize() ) + #define gpuDeviceReset() checkGpu( hipDeviceReset() ) #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__) #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__) From 7e54dd756fe56ad810acfc5be75d05c81f682406 Mon Sep 17 00:00:00 2001 From: Jorgen Date: Tue, 27 Jun 2023 14:44:08 +0200 Subject: [PATCH 337/509] Second try at fixing syntax --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index c18729d30e..07a9fe1f62 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -59,8 +59,8 @@ //-------------------------------------------------------------------------- #define gpuError_t hipError_t - #define gpuPeekAtLastError() checkGpu( hipPeekAtLastError() ) - #define gpuGetErrorString() checkGpu( hipGetErrorString() ) + #define gpuPeekAtLastError() checkGpu( hipPeekAtLastError ) + #define gpuGetErrorString() checkGpu( hipGetErrorString ) #define gpuSuccess hipSuccess #define gpuMallocHost(ptr, size) checkGpu( hipHostMalloc(ptr, size) ) // HostMalloc better @@ -75,8 +75,8 @@ #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) ) #define gpuSetDevice hipSetDevice - #define gpuDeviceSynchronize() checkGpu( hipDeviceSynchronize() ) - #define gpuDeviceReset() checkGpu( hipDeviceReset() ) + #define gpuDeviceSynchronize() checkGpu( hipDeviceSynchronize ) + #define gpuDeviceReset() checkGpu( hipDeviceReset ) #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__) #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...)
kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__) From d09bec0107b1f0e4bd0fef9f8c3ef9d9c5fa3248 Mon Sep 17 00:00:00 2001 From: Jorgen Date: Tue, 27 Jun 2023 14:49:10 +0200 Subject: [PATCH 338/509] Removed errors, but warnings persist --- epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 07a9fe1f62..782cb96e8c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -59,8 +59,8 @@ //-------------------------------------------------------------------------- #define gpuError_t hipError_t - #define gpuPeekAtLastError() checkGpu( hipPeekAtLastError ) - #define gpuGetErrorString() checkGpu( hipGetErrorString ) + #define gpuPeekAtLastError hipPeekAtLastError + #define gpuGetErrorString hipGetErrorString #define gpuSuccess hipSuccess #define gpuMallocHost(ptr, size) checkGpu( hipHostMalloc(ptr, size) ) // HostMalloc better @@ -75,8 +75,8 @@ #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) ) #define gpuSetDevice hipSetDevice - #define gpuDeviceSynchronize() checkGpu( hipDeviceSynchronize ) - #define gpuDeviceReset() checkGpu( hipDeviceReset ) + #define gpuDeviceSynchronize hipDeviceSynchronize + #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__) #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__) From d3e51275da8f233fdfe43d3c79ae0fbe70c2e614 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 29 Jun 2023 17:07:25 +0200 Subject: [PATCH 339/509] Ported GPU abstraction to gg_ttgg --- .../cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h | 24 +- .../gg_ttgg.mad/SubProcesses/BridgeKernels.cc | 7 +- .../gg_ttgg.mad/SubProcesses/BridgeKernels.h | 7 +- .../SubProcesses/CommonRandomNumberKernel.cc | 3 +- .../SubProcesses/CrossSectionKernels.cc | 5 +- .../SubProcesses/CrossSectionKernels.h | 4 +- .../gg_ttgg.mad/SubProcesses/CudaRuntime.h | 4 +- .../SubProcesses/CurandRandomNumberKernel.cc | 10 +- .../SubProcesses/EventStatistics.h | 2 +- .../gg_ttgg.mad/SubProcesses/GpuAbstraction.h | 86 +++++ .../gg_ttgg.mad/SubProcesses/GpuRuntime.h | 80 +++++ .../gg_ttgg.mad/SubProcesses/MadgraphTest.h | 6 +- .../SubProcesses/MatrixElementKernels.cc | 6 +- .../SubProcesses/MatrixElementKernels.h | 6 +- .../SubProcesses/MemoryAccessHelpers.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 4 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 2 +- .../SubProcesses/MemoryAccessVectors.h | 2 +- .../gg_ttgg.mad/SubProcesses/MemoryBuffers.h | 49 ++- .../SubProcesses/P1_gg_ttxgg/CPPProcess.cc | 49 ++- .../SubProcesses/P1_gg_ttxgg/CPPProcess.h | 8 +- .../SubProcesses/P1_gg_ttxgg/GpuAbstraction.h | 1 + .../SubProcesses/P1_gg_ttxgg/GpuRuntime.h | 1 + .../SubProcesses/P1_gg_ttxgg/check_sa.cc | 74 ++--- .../SubProcesses/RamboSamplingKernels.cc | 14 +- .../SubProcesses/RamboSamplingKernels.h | 4 +- .../SubProcesses/RandomNumberKernels.h | 4 +- .../gg_ttgg.mad/SubProcesses/cudacpp.mk | 295 +++++++++++------- .../gg_ttgg.mad/SubProcesses/fbridge.cc | 10 +- .../gg_ttgg.mad/SubProcesses/fsampler.cc | 6 +- .../gg_ttgg.mad/SubProcesses/runTest.cc | 8 +- .../gg_ttgg.mad/SubProcesses/testmisc.cc | 2 +- .../gg_ttgg.mad/SubProcesses/testxxx.cc | 6 +- epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h | 2 +- .../cudacpp/gg_ttgg.mad/src/Parameters_sm.h | 6 +-
.../cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h | 26 +- .../cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h | 16 +- .../cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h | 8 +- .../cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h | 15 +- epochX/cudacpp/gg_ttgg.mad/src/rambo.h | 6 +- 40 files changed, 561 insertions(+), 309 deletions(-) create mode 100644 epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h create mode 120000 epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuRuntime.h diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index 4cafe0c997..8c543a7356 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc index cef4cb3c71..f844178cbb 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc @@ -5,6 +5,7 @@ #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -14,7 +15,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..4f0a560d4b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h @@ -9,10 +9,11 @@ #include "mgOnGpuConfig.h" #include "Bridge.h" +#include "GpuAbstraction.h" #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +50,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +90,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..f17b9c0ad7 100644 --- 
a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -4,12 +4,13 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..36ca2a94d4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc @@ -5,6 +5,7 @@ #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..ff2350a14d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h index 64ce52f4b3..df0c3f3df8 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h @@ -15,7 +15,7 @@ //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ #define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) { @@ -29,7 +29,7 @@ inline void assertCuda( cudaError_t code, const char* file, int line, bool abort //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { // Instantiate a CudaRuntime at the beginnining of the application's main to diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..c3d5510131 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -3,7 
+3,7 @@ // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h index 48b51e0a49..e7d7f3b3c3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..782cb96e8c --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,86 @@ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +/* + ToDo: + * Fix rpath in makefile when compiling with HIP + * Fix warnings with improper hip function return code handling +*/ + + +#ifdef __CUDACC__ + #define MGONGPUCPP_CUDACC 1 +#endif + +#ifdef __HIPCC__ + #include "hip/hip_runtime.h" + #define MGONGPUCPP_HIPCC 1 +#endif + +#ifdef MGONGPUCPP_CUDACC + + // Defines correct compiler + #define MGONGPUCPP_GPUIMPL __CUDACC__ + + //-------------------------------------------------------------------------- + + #define gpuError_t cudaError_t + #define gpuPeekAtLastError cudaPeekAtLastError + #define gpuGetErrorString cudaGetErrorString + #define gpuSuccess cudaSuccess + + #define gpuMallocHost(ptr, size) checkGpu( cudaMallocHost(ptr, size) ) + #define gpuMalloc(ptr, size) checkGpu( cudaMalloc(ptr, size) ) + + #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( cudaMemcpy(dstData, srcData, srcBytes, func) ) + #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice + #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost + #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( cudaMemcpyToSymbol(type1, type2, size) ) + + #define gpuFree(ptr) checkGpu( cudaFree(ptr) ) + #define gpuFreeHost(ptr) checkGpu( cudaFreeHost(ptr) ) + + #define gpuSetDevice cudaSetDevice + #define gpuDeviceSynchronize cudaDeviceSynchronize + #define gpuDeviceReset cudaDeviceReset + + #define gpuLaunchKernel( kernel, blocks, threads, ...) 
kernel<<<blocks, threads>>>( __VA_ARGS__ ) + #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__) + +//-------------------------------------------------------------------------- + +#elif defined MGONGPUCPP_HIPCC + + // Defines correct compiler + #define MGONGPUCPP_GPUIMPL __HCC__ + + //-------------------------------------------------------------------------- + + #define gpuError_t hipError_t + #define gpuPeekAtLastError hipPeekAtLastError + #define gpuGetErrorString hipGetErrorString + #define gpuSuccess hipSuccess + + #define gpuMallocHost(ptr, size) checkGpu( hipHostMalloc(ptr, size) ) // HostMalloc better + #define gpuMalloc(ptr, size) checkGpu( hipMalloc(ptr, size) ) + + #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( hipMemcpy(dstData, srcData, srcBytes, func) ) + #define gpuMemcpyHostToDevice hipMemcpyHostToDevice + #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost + #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( hipMemcpyToSymbol(type1, type2, size) ) + + #define gpuFree(ptr) checkGpu( hipFree(ptr) ) + #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) ) + + #define gpuSetDevice hipSetDevice + #define gpuDeviceSynchronize hipDeviceSynchronize + #define gpuDeviceReset hipDeviceReset + + #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__) + #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__) + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..caa301ef24 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h @@ -0,0 +1,80 @@ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include <cassert> + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a CudaRuntime at the beginning of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes!
*** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu ( gpuSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h index fd7734ce42..5920d08bf7 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
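
Taken together, the two new headers define a thin portability layer: GpuAbstraction.h maps a single gpu* vocabulary onto either the CUDA or the HIP runtime, and GpuRuntime.h supplies the checkGpu/assertGpu error check plus an RAII guard around device setup and reset. A minimal sketch of the intended calling pattern, to be compiled with nvcc or hipcc (the kernel and buffer here are illustrative, not taken from the patch):

  // Illustrative use of the gpu* macro layer (hypothetical kernel).
  #include "GpuAbstraction.h"
  #include "GpuRuntime.h"

  __global__ void scaleByTwo( double* data, int n )
  {
    const int i = blockDim.x * blockIdx.x + threadIdx.x; // one element per thread
    if( i < n ) data[i] *= 2;
  }

  int main()
  {
    mg5amcGpu::GpuRuntime gpurt; // RAII: gpuSetDevice(0) now, gpuDeviceReset() at exit
    constexpr int n = 256;
    double* d = nullptr;
    gpuMalloc( (void**)&d, n * sizeof( double ) ); // cudaMalloc or hipMalloc, already wrapped in checkGpu
    gpuLaunchKernel( scaleByTwo, 1, n, d, n );     // expands to scaleByTwo<<<1, n>>>( d, n )
    checkGpu( gpuDeviceSynchronize() );            // bare aliases still need an explicit checkGpu
    gpuFree( d );
    return 0;
  }
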
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index 30257195b6..9191e138ec 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -6,7 +6,7 @@ #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..4477a385ed 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..67306c3922 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h index 0ac4faa3c7..f797f85ca5 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a 
compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..949a42066d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..a9ae26b6dc 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..1da79d70d6 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h @@ -11,12 +11,11 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +86,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +118,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -139,7 +138,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -159,7 +158,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +174,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +190,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef 
MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +212,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +231,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +256,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +275,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +295,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +314,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +332,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
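
Every buffer in this header follows the same recipe: a sizePerEvent constant fixes how many elements one event needs, and the alias then resolves to a pageable HostBuffer in CPU-only builds or to a PinnedHostBuffer plus DeviceBuffer pair when MGONGPUCPP_GPUIMPL is set. A self-contained toy version of that recipe (names are hypothetical, and a std::vector stands in for the real pinned or device allocations):

  #include <cstddef>
  #include <vector>
  typedef double fptype; // the real build picks double/float via MGONGPU_FPTYPE_*

  template<typename T, std::size_t SizePerEvent>
  class ToyEventBuffer
  {
  public:
    explicit ToyEventBuffer( std::size_t nevt ) : m_data( nevt * SizePerEvent ) {}
    T* data() { return m_data.data(); }
    std::size_t size() const { return m_data.size(); }
  private:
    std::vector<T> m_data; // real classes use cudaMallocHost/hipHostMalloc or cudaMalloc/hipMalloc
  };

  constexpr std::size_t sizePerEventWeights = 1; // one sampling weight per event, as above
  typedef ToyEventBuffer<fptype, sizePerEventWeights> ToyBufferWeights;

  int main() { ToyBufferWeights w( 16 ); return w.size() == 16 ? 0 : 1; } // 16 events, 16 fptypes
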
@@ -352,7 +351,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +369,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +384,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +402,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +420,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +438,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +456,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +474,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +486,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -510,7 +509,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index ce2688fb54..e72332a582 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ 
b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,12 +186,12 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 186 // e.g. 
<> #endif @@ -200,7 +199,7 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic pop // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; @@ -2417,7 +2416,7 @@ namespace mg5amcCpu { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -2474,7 +2473,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -2533,7 +2532,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -2628,7 +2627,7 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); @@ -2671,7 +2670,7 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 #else @@ -2711,7 +2710,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -2776,12 +2775,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -2802,7 +2801,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -2931,7 +2930,7 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); #else @@ -2957,7 +2956,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -2978,7 +2977,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -2994,7 +2993,7 @@ namespace mg5amcCpu // Start sigmaKin_lines // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -3022,7 +3021,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -3226,7 +3225,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h index 
b3323a7a84..c926411529 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc index f1e75b9252..d7d40e140c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc @@ -63,7 +63,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +77,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
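
The FIXME above refers to the single-source trick used across all of these files: each translation unit is compiled twice, once by the GPU compiler with MGONGPUCPP_GPUIMPL defined and once by the plain C++ compiler, and the namespace switch keeps the two symbol sets apart so both flavours can be linked into one executable. A condensed illustration (hypothetical class body):

  #ifdef MGONGPUCPP_GPUIMPL
  namespace mg5amcGpu
  #else
  namespace mg5amcCpu
  #endif
  {
    class CPPProcess
    {
    public:
      int nevt() const { return 0; } // placeholder
    };
  }
  // Client code then selects one flavour the same way:
  #ifdef MGONGPUCPP_GPUIMPL
  using mg5amcGpu::CPPProcess;
  #else
  using mg5amcCpu::CPPProcess;
  #endif
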
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -102,7 +102,7 @@ main( int argc, char** argv ) CurandHost = 1, CurandDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand @@ -115,7 +115,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -145,7 +145,7 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rndgen = RandomNumberMode::CurandDevice; #else throw std::runtime_error( "CurandDevice is not supported on CPUs" ); @@ -165,7 +165,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +239,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,7 +263,7 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 00. 
Initialise cuda // Instantiate a CudaRuntime at the beginnining of the application's main to @@ -292,7 +292,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +300,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +308,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +316,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +333,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +342,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +351,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +359,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +367,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -394,7 +394,7 @@ main( int argc, char** argv ) const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL else { const bool onDevice = true; @@ -421,7 +421,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +432,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +440,7 @@ main( int argc, char** 
argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +482,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +514,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +559,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +588,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -729,7 +729,7 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rndgentxt += " (CUDA code)"; #else rndgentxt += " (C++ code)"; @@ -738,7 +738,7 @@ main( int argc, char** argv ) // Workflow description summary std::string wrkflwtxt; // -- CUDA or C++? -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL wrkflwtxt += "CUD:"; #else wrkflwtxt += "CPP:"; @@ -754,7 +754,7 @@ main( int argc, char** argv ) wrkflwtxt += "???+"; // no path to this statement #endif // -- CUCOMPLEX or THRUST or STD complex numbers? -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; #elif defined MGONGPU_CUCXTYPE_THRUST @@ -789,7 +789,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +845,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -864,7 +864,7 @@ main( int argc, char** argv ) #endif // Dump all configuration parameters and all results std::cout << std::string( SEP79, '*' ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" @@ -892,7 +892,7 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST @@ -906,7 +906,7 @@ main( int argc, char** argv ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +937,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,7 +1033,7 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST @@ -1048,7 +1048,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..8412ec06ed 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc @@ -5,7 +5,7 @@ #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,7 +147,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { @@ -157,7 +157,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,7 +171,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..fe63a7bb77 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu 
//-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..0c215f2583 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h @@ -8,7 +8,7 @@ #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index a0397e9ecc..ff63aa4a43 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,69 +89,139 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
- MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. 
Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of NVCC + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
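
The -lcurand flag is all that the curand-based generators shown earlier need at link time, since they drive curand through its host API. A minimal hypothetical host-side generation step of the kind CurandRandomNumberKernel wraps (assumed shape, not the patch's actual code; compile and link with -lcurand):

  #include <curand.h>
  #include <vector>
  int main()
  {
    curandGenerator_t gen;
    if( curandCreateGeneratorHost( &gen, CURAND_RNG_PSEUDO_MRG32K3A ) != CURAND_STATUS_SUCCESS ) return 1;
    curandSetPseudoRandomGeneratorSeed( gen, 20220815 ); // fixed seed for reproducibility
    std::vector<double> rnd( 4 * 1024 );                 // e.g. four random numbers per event
    curandGenerateUniformDouble( gen, rnd.data(), rnd.size() );
    curandDestroyGenerator( gen );
    return 0;
  }
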
+    CUOPTFLAGS = -lineinfo
+    GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
+    ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+    ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
+    GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h
+    # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+
+    CUBUILDRULEFLAGS = -Xcompiler -fPIC -c
+    CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu
+
+    CUDATESTFLAGS = -lcuda
+
+  else ifneq ($(origin REQUIRE_CUDA),undefined)
+    # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443)
+    $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH))
+  else
+    # No cuda. Switch cuda compilation off and go to common random numbers in C++
+    $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
+    override GPUCC=
+    override USE_NVTX=
+    override CUINC=
+    override CURANDLIBFLAGS=
+  endif
+
+  # Set the host C++ compiler for GPUCC via "-ccbin "
+  # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
+  GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+
+  # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+  ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+    GPUFLAGS += -allow-unsupported-compiler
+  endif
+
+else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc)
+  #=== Configure the HIP compiler
+
+  # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable HIP builds (issue #505)
+  # (the same single-word limitation that applies to the CUDA "-ccbin" configuration above)
+  ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
+    $(warning HIP builds are not supported for multi-word CXX "$(CXX)")
+    override HIP_HOME=disabled
+  endif
+
+  # If HIP_HOME is not set, try to set it from the location of GPUCC
+  ifndef HIP_HOME
+    HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH))
+    $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+  endif
-# Set the host C++ compiler for nvcc via "-ccbin "
-# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+  # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists
+  ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
+    GPUCC = $(HIP_HOME)/bin/hipcc
+
+    # Should maybe find something equivalent to this in HIP
+    #USE_NVTX ?=-DUSE_NVTX
+
+    HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+    HIPINC = -I$(HIP_HOME)/include/
+
+    # -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP
+    # (but only for single precision, line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+    GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+    ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+    GPUFLAGS += -std=c++17
+    # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+
+    CUBUILDRULEFLAGS = -fPIC -c
+    CCBUILDRULEFLAGS = -fPIC -c
+
+  else ifneq ($(origin REQUIRE_HIP),undefined)
+    # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443)
+    $(error No hip installation found (set HIP_HOME or make hipcc visible in PATH))
+  else
+    # No hip. Switch hip compilation off and go to common random numbers in C++
+    $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
+    override GPUCC=
+    override USE_NVTX=
+    override CUINC=
+    override CURANDLIBFLAGS=
+  endif
+
+  # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+  ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+    GPUFLAGS += -allow-unsupported-compiler
+  endif
-# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-CUFLAGS += -allow-unsupported-compiler
 endif
+
 #-------------------------------------------------------------------------------

 #=== Configure ccache for C++ and CUDA builds
@@ -163,9 +233,9 @@ endif
 #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
 #  override AR:=ccache $(AR)
 #endif
-ifneq ($(NVCC),)
-  ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
-    override NVCC:=ccache $(NVCC)
+ifneq ($(GPUCC),)
  ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
    override GPUCC:=ccache $(GPUCC)
  endif
endif

@@ -189,7 +259,7 @@ endif

# PowerPC-specific CUDA compiler flags (to be reviewed!)
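# (Note: -Xcompiler forwards the flag that follows to the host C++ compiler, so
#  "nvcc -Xcompiler -mno-float128" runs the host g++ with -mno-float128; nvcc is
#  known not to support gcc's __float128 type, hence this workaround on ppc64le.)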
ifeq ($(UNAME_P),ppc64le)
-  CUFLAGS+= -Xcompiler -mno-float128
+  GPUFLAGS+= -Xcompiler -mno-float128
endif

#-------------------------------------------------------------------------------
@@ -199,10 +269,10 @@ endif
# Set the default OMPFLAGS choice
ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
override OMPFLAGS = -fopenmp
-###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578)
+###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578)
else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),)
override OMPFLAGS = -fopenmp
-###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578)
+###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578)
else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),)
override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578)
else
@@ -253,7 +323,10 @@ endif
# Set the default RNDGEN (random number generator) choice
ifeq ($(RNDGEN),)
-  ifeq ($(NVCC),)
+  ifeq ($(GPUCC),)
+    override RNDGEN = hasNoCurand
+  # Edge case for HIP compilation (curand is not available in hipcc builds)
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
    override RNDGEN = hasNoCurand
  else ifeq ($(RNDGEN),)
    override RNDGEN = hasCurand
@@ -328,13 +401,13 @@ CXXFLAGS+= $(AVXFLAGS)
$(info FPTYPE=$(FPTYPE))
ifeq ($(FPTYPE),d)
  CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
-  CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
+  GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
else ifeq ($(FPTYPE),f)
  CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
-  CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
+  GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
else ifeq ($(FPTYPE),m)
  CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
-  CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
+  GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
else
  $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported)
endif
@@ -343,7 +416,7 @@ endif
$(info HELINL=$(HELINL))
ifeq ($(HELINL),1)
  CXXFLAGS += -DMGONGPU_INLINE_HELAMPS
-  CUFLAGS += -DMGONGPU_INLINE_HELAMPS
+  GPUFLAGS += -DMGONGPU_INLINE_HELAMPS
else ifneq ($(HELINL),0)
  $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported)
endif
@@ -352,7 +425,7 @@ endif
$(info HRDCOD=$(HRDCOD))
ifeq ($(HRDCOD),1)
  CXXFLAGS += -DMGONGPU_HARDCODE_PARAM
-  CUFLAGS += -DMGONGPU_HARDCODE_PARAM
+  GPUFLAGS += -DMGONGPU_HARDCODE_PARAM
else ifneq ($(HRDCOD),0)
  $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported)
endif
@@ -404,11 +477,11 @@ ifeq ($(UNAME_S),Darwin)
override CULIBFLAGSRPATH2 =
else
  # RPATH to cuda/cpp libs when linking executables
-  override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH)
-  override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH)
+  override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH)
+  override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH)
  # RPATH to common lib when linking cuda/cpp libs
-  override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN'
-  override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN'
+  override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN'
+  override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN'
endif

# Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac)
@@ -421,7 +494,7 @@ override RUNTIME =

cxx_main=$(BUILDDIR)/check.exe
fcxx_main=$(BUILDDIR)/fcheck.exe
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_main=$(BUILDDIR)/gcheck.exe
fcu_main=$(BUILDDIR)/fgcheck.exe
else
@@ -452,15 +525,16 @@ $(BUILDDIR)/.build.$(TAG):
	@touch $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@

$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
endif
+# (the -x cu flag used to compile .cc files as CUDA sources is now part of CCBUILDRULEFLAGS above)

# Generic target and build rules: objects from C++ compilation
# (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -469,11 +543,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@

# Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117)
+# Added edge case for HIP compilation (hipcc takes -fno-fast-math directly, without -Xcompiler)
ifeq ($(shell $(CXX) --version | grep ^nvc++),)
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
endif
endif

@@ -489,10 +566,10 @@ ifeq ($(RNDGEN),hasCurand)
$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
endif

-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592)
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592)
ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),)
-ifneq ($(NVCC),)
-CUFLAGS += -Xcompiler -Wno-deprecated-builtins
+ifneq ($(GPUCC),)
+GPUFLAGS += -Wno-deprecated-builtins
endif
endif

@@ -500,8 +577,8 @@ endif
# This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -528,7 +605,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -540,11 +617,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -561,16 +638,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
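# For CI-style builds, a specific GPU toolchain can be required through the
# REQUIRE_CUDA/REQUIRE_HIP hooks defined earlier, e.g. (usage sketch):
#   make REQUIRE_CUDA=1   # stop with an error if nvcc was not detected
#   make REQUIRE_HIP=1    # stop with an error if hipcc was not detected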
#------------------------------------------------------------------------------- @@ -596,17 +673,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -618,7 +695,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -631,7 +708,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -643,12 +720,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -672,14 +749,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
endif

# Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215
@@ -782,9 +859,9 @@ ifeq ($(USECCACHE),1)
	ccache --version | head -1
endif
	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
endif
	@echo ""
	@echo CXX=$(CXX)
@@ -803,7 +880,7 @@ endif

# Target: check (run the C++ test executable)
# [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
check: runTest cmpFcheck cmpFGcheck
else
check: runTest cmpFcheck

diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc
index f93c05b0b3..5f91f007ee 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc
@@ -5,7 +5,7 @@

 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

 extern "C"
 {
@@ -22,7 +22,7 @@ extern "C"
   * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
   * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
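   * As an illustration only (a sketch, not code from this repository), a Fortran
   * caller would then use the C bindings defined below roughly as follows, where
   * PBRIDGE is an opaque handle to the underlying C++ Bridge object:
   *   CALL FBRIDGECREATE( PBRIDGE, NEVT, NPAR, NP4 )
   *   CALL FBRIDGESEQUENCE( PBRIDGE, MOMENTA, GS, RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL )
   *   CALL FBRIDGEDELETE( PBRIDGE )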
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,7 +46,7 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL CudaRuntime::setUp(); #endif // Create a process object, read parm card and set parameters @@ -69,7 +69,7 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL CudaRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc index 2fb445372d..acffa7c19e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc index 572e28aaea..13616d771a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. 
Note that this should happen after @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc index 989aba1fdc..dcafb44ee6 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc index 4243e9fcec..b58d908756 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc @@ -20,7 +20,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h index ee2fcbbde5..f8f9fa7f9c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h @@ -26,7 +26,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h index 6551d8da81..fe7d686938 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h @@ -214,7 +214,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -242,7 +242,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -258,7 +258,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h index 881353abac..8a393a7231 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h @@ -6,6 +6,8 @@ #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 +#include "GpuRuntime.h" + // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 @@ -15,9 +17,11 @@ // Choose if curand is supported for generating random numbers // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -#ifdef __CUDACC__ +// Added support for HIP compilation by defining MGONGPU_HAS_NO_CURAND +#ifdef MGONGPUCPP_CUDACC #undef MGONGPU_HAS_NO_CURAND -#else +#elif defined MGONGPUCPP_HIPCC +#define MGONGPU_HAS_NO_CURAND 1 //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 #endif @@ -53,20 +57,20 @@ ////#define MGONGPU_HARDCODE_PARAM 1 // Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif // Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) #endif // Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 #endif @@ -85,14 +89,14 @@ #endif // SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL #endif #endif // SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif @@ -131,7 +135,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) 
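// (For illustration, a sketch only: a buffer meant for SIMD access could be declared as
//    alignas( mgOnGpu::cppAlign ) fptype buffer[neppV];
//  before reinterpret_cast'ing its address to a SIMD vector type.)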
// Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -142,7 +146,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA implementation has no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -174,7 +178,7 @@ using mgOnGpu::fptype2; // Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined MGONGPUCPP_GPUIMPL && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -187,7 +191,7 @@ using mgOnGpu::fptype2; #endif /* clang-format on */ // Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h index 0cb2f1db7e..866b7640f6 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined 
MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h index a1cde16a67..7edefa3389 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h @@ -13,7 +13,7 @@ //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h index 9d3e82b1e3..3376081012 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h @@ -6,6 +6,7 @@ #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 +#include "GpuAbstraction.h" #include "mgOnGpuCxtypes.h" #include "mgOnGpuFptypes.h" @@ -108,7 +109,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -129,7 +130,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -744,11 +745,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -786,12 +787,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? 
a : b ); } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +813,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttgg.mad/src/rambo.h b/epochX/cudacpp/gg_ttgg.mad/src/rambo.h index e02ea52496..3a331b979a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/rambo.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/rambo.h @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) From a7369a383c90f5cb7be6fb8d67b81bcb73466782 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 29 Jun 2023 17:14:25 +0200 Subject: [PATCH 340/509] Forgot to include GpuAbstraction headers in src --- epochX/cudacpp/gg_ttgg.mad/src/GpuAbstraction.h | 1 + epochX/cudacpp/gg_ttgg.mad/src/GpuRuntime.h | 1 + 2 files changed, 2 insertions(+) create mode 120000 epochX/cudacpp/gg_ttgg.mad/src/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_ttgg.mad/src/GpuRuntime.h diff --git a/epochX/cudacpp/gg_ttgg.mad/src/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/src/GpuAbstraction.h new file mode 120000 index 0000000000..4955c9171e --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/src/GpuAbstraction.h @@ -0,0 +1 @@ +../SubProcesses/GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/src/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.mad/src/GpuRuntime.h new file mode 120000 index 0000000000..ba9c735d54 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/src/GpuRuntime.h @@ -0,0 +1 @@ +../SubProcesses/GpuRuntime.h \ No newline at end of file From d123d61dd9b5723c017b3c7e59f1b1036621b978 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 29 Jun 2023 17:16:16 +0200 Subject: [PATCH 341/509] Removed CudaRuntime --- .../cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc | 4 ++-- epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc index d7d40e140c..4a249ee4df 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc @@ -266,11 +266,11 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL // --- 00. 
Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
+  // Instantiate a GpuRuntime at the beginning of the application's main to
  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
  const std::string cdinKey = "00 CudaInit";
  timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime GpuRuntime( debug );
 #endif

  // --- 0a. Initialise physics process
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc
index 5f91f007ee..343b6b8d9c 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc
@@ -47,7 +47,7 @@ extern "C"
  void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
  {
 #ifdef MGONGPUCPP_GPUIMPL
-    CudaRuntime::setUp();
+    GpuRuntime::setUp();
 #endif
    // Create a process object, read parm card and set parameters
    // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
@@ -70,7 +70,7 @@ extern "C"
    if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" );
    delete pbridge;
 #ifdef MGONGPUCPP_GPUIMPL
-    CudaRuntime::tearDown();
+    GpuRuntime::tearDown();
 #endif
  }

From 8da7f4c0a3c1c0ccb1e0e618b5e1099bb551c891 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Tue, 4 Jul 2023 08:22:13 +0200
Subject: [PATCH 342/509] Added GPU abstraction to gg_ttgg.mad

---
 .../cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h |  6 +++---
 .../gg_ttgg.mad/SubProcesses/GpuRuntime.h     |  4 ++--
 .../SubProcesses/MatrixElementKernels.cc      | 18 +++++++++---------
 .../gg_ttgg.mad/SubProcesses/MemoryBuffers.h  | 12 ++++++------
 .../SubProcesses/P1_gg_ttxgg/CPPProcess.cc    |  8 ++++----
 .../SubProcesses/RamboSamplingKernels.cc      |  4 ++--
 .../gg_ttgg.mad/SubProcesses/runTest.cc       |  2 +-
 7 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h
index 8c543a7356..a8642473e4 100755
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h
@@ -283,14 +283,14 @@ namespace mg5amcCpu
    constexpr int neppM = MemoryAccessMomenta::neppM;
    if constexpr( neppM == 1 && std::is_same_v )
    {
-      checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ) );
    }
    else
    {
-      checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ) );
      const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
      //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
-      dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+      gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
    }
    if constexpr( std::is_same_v )
    {
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h
index caa301ef24..86c9179f4c 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h
@@ -28,8 +28,8 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort =
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
  struct GpuRuntime final
  {
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc
index 9191e138ec..95998e32fa 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc
@@ -202,13 +202,13 @@ namespace mg5amcGpu
    PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
    DeviceBufferHelicityMask devIsGoodHel( ncomb );
    // ... 0d1. Compute good helicity mask on the device
-    computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
+    checkgpu( gpuPeekAtLastError() );
    // ... 0d2. Copy back good helicity mask to the host
    copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
    // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -219,19 +219,19 @@ namespace mg5amcGpu

  void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
  {
-    computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
    constexpr unsigned int sharedMemSize = 0;
 #else
    constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
-    checkCuda( cudaDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() );
+    checkGpu( gpuDeviceSynchronize() );
  }

  //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h
index 1da79d70d6..d7be435534 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h
@@ -127,11 +127,11 @@ namespace mg5amcCpu
    PinnedHostBufferBase( const size_t size )
      : BufferBase( size, false )
    {
-      checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) );
+      gpuMallocHost( &( this->m_data ), this->bytes() );
    }
    virtual ~PinnedHostBufferBase()
    {
-      checkCuda( cudaFreeHost( this->m_data ) );
+      gpuFreeHost( this->m_data );
    }
  };
 #endif
@@ -147,11 +147,11 @@ namespace mg5amcCpu
    DeviceBufferBase( const size_t size )
      : BufferBase( size, true )
    {
-      checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) );
+      gpuMalloc( &( this->m_data ), this->bytes() );
    }
    virtual ~DeviceBufferBase()
    {
-      checkCuda( cudaFree( this->m_data ) );
+      gpuFree( this->m_data );
    }
  };
 #endif
@@ -503,7 +503,7 @@ namespace mg5amcCpu
      throw std::runtime_error( sstr.str() );
    }
    // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if the host array is not a pinned host array
-    checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) );
+    gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ) );
  }
 #endif
@@ -526,7 +526,7 @@ namespace mg5amcCpu
      throw std::runtime_error( sstr.str() );
    }
    // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if the host array is not a pinned host array
-    checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) );
+    gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ) );
  }
 #endif
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc
index e72332a582..4efef467bc 100644
---
a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -2628,7 +2628,7 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, 1, 1 } }; #ifdef MGONGPUCPP_GPUIMPL - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -2671,7 +2671,7 @@ namespace mg5amcCpu const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 #ifdef MGONGPUCPP_GPUIMPL - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); @@ -2931,8 +2931,8 @@ namespace mg5amcCpu } } #ifdef MGONGPUCPP_GPUIMPL - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGgpuoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc index 8412ec06ed..8745b084d3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc @@ -151,7 +151,7 @@ namespace mg5amcCpu void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif @@ -175,7 +175,7 @@ namespace mg5amcCpu void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc index 13616d771a..05d1a08c2b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; From a6ba2d164ab6daba3fc7aa883fe762d48181482d Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Tue, 4 Jul 2023 08:38:32 +0200 Subject: [PATCH 343/509] Fixed syntax errors --- epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h | 4 ++-- epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index a8642473e4..3c96940cc1 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -283,11 +283,11 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && 
std::is_same_v ) { - gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h index d7be435534..41c6e8c0e4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h @@ -503,7 +503,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif @@ -526,7 +526,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif From c82a5a6ae72fd8066bef93b09cebaaedfb10a9e7 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Tue, 4 Jul 2023 08:43:14 +0200 Subject: [PATCH 344/509] Fixed typo --- .../cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index 4efef467bc..f143ae923f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -2932,7 +2932,7 @@ namespace mg5amcCpu } #ifdef MGONGPUCPP_GPUIMPL gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); - gpuMemcpyToSymbol( cGgpuoodHel, goodHel, ncomb * sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; From 0ef271156d1ffc9e5ad6aa6f2fd0e67d69faf825 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Tue, 4 Jul 2023 08:46:29 +0200 Subject: [PATCH 345/509] Fixed error --- .../gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index 95998e32fa..e793d6a379 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -208,7 +208,7 @@ namespace mg5amcGpu #else gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkgpu( 
gpuPeekAtLastError() ); + gpuPeekAtLastError(); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -230,8 +230,8 @@ namespace mg5amcGpu #else gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + gpuPeekAtLastError(); + gpuDeviceSynchronize(); } //-------------------------------------------------------------------------- From e7df295d95dc2a862baaac3f29cb9fe5bc0861b0 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Tue, 4 Jul 2023 09:54:49 +0200 Subject: [PATCH 346/509] Added fix for detecting compiler --- epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index ff63aa4a43..d8ec3b0ca4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -89,8 +89,8 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler" 2> /dev/null) +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler" 2> /dev/null) ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) #=== Configure the CUDA compiler From d1ba0397014f845cb00f08f94d8e7820d24d1d21 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Tue, 4 Jul 2023 11:15:02 +0200 Subject: [PATCH 347/509] Fixed Curand appering in HIP compilation --- .../gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc index c3d5510131..b5380696af 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPUCPP_CUDACC namespace mg5amcGpu #else namespace mg5amcCpu From 15ddcbfe17e1ad8f9fc20dde4b9aabe66a5435fe Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Tue, 4 Jul 2023 11:20:34 +0200 Subject: [PATCH 348/509] Fixed Curand appearing in HIP compilation 2 --- .../gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc index b5380696af..5b33207ad0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ 
b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc
@@ -36,7 +36,7 @@ namespace mg5amcCpu
   {
     if( m_isOnDevice )
     {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef MGONGPUCPP_CUDACC
       if( !m_rnarray.isOnDevice() )
         throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" );
 #else

From b6b967dd12791a023edddaf5583fa8cf3b4c0d06 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Tue, 4 Jul 2023 11:46:27 +0200
Subject: [PATCH 349/509] Fix Curand appearing in HIP compilation 3

---
 .../cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc
index 4a249ee4df..e26397b39b 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc
@@ -102,7 +102,8 @@ main( int argc, char** argv )
     CurandHost = 1,
     CurandDevice = 2
   };
-#ifdef MGONGPUCPP_GPUIMPL
+// Specifically checks for __CUDACC__ here
+#ifdef MGONGPUCPP_CUDACC
   RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU
 #elif not defined MGONGPU_HAS_NO_CURAND
   RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand

From a74ea9d15e996108b78db170fc8181c157ad60a6 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Tue, 4 Jul 2023 14:49:18 +0200
Subject: [PATCH 350/509] Removed unnecessary line in Makefile

---
 epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
index d8ec3b0ca4..ff63aa4a43 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
@@ -89,8 +89,8 @@ endif

 #-------------------------------------------------------------------------------

-CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler" 2> /dev/null)
-HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler" 2> /dev/null)
+CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler")
+HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler")

 ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc)
   #=== Configure the CUDA compiler

From f584d5b7e85cc85022e40a3390e0e02cfa2dcb61 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Tue, 4 Jul 2023 15:15:19 +0200
Subject: [PATCH 351/509] Added changes to profiling build script for HIP compilation

---
 tools/profiling/buildCUDAProcess.sh | 54 ++++++++++++++++-------------
 tools/profiling/buildSYCLProcess.sh | 16 +--------
 2 files changed, 30 insertions(+), 40 deletions(-)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index 271e04114a..8cc3fad6d4 100755
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -10,7 +10,7 @@
 #
 #
 # Bash script for compiling and executing physics processes using the MadGraph5_aMC@NLO GPU development framework
-# using CUDA
+# using CUDA/HIP
 #
 # Author: Jorgen Teig, CERN 
2023 # @@ -24,11 +24,12 @@ helpFunction() echo -e "\t-t Threads per block" echo -e "\t-i Iterations" echo -e "\t-r Branch" + echo -e "\t-c CUDA or HIP compilation" echo -e "\t-m Makefile arguments" exit 1 # Exit script after printing help } -while getopts "n:b:t:i:r:m:" opt +while getopts "n:b:t:i:r:m:a:" opt do case "$opt" in n ) MG_PROC="$OPTARG" ;; #process to target @@ -36,6 +37,7 @@ do t ) threadsPerBlock="$OPTARG" ;; i ) iterations="$OPTARG" ;; r ) branch="$OPTARG" ;; + a ) gpuCompiler="$OPTARG" ;; m ) makeArgs="$OPTARG" ;; ? ) helpFunction ;; # Print helpFunction in case parameter is non-existent esac @@ -54,43 +56,45 @@ fi # Set variables for later use -# Assumes that this is run from profiling directory in the repo -prefix=$(pwd) +# CUDA +if [[ -z "${gpuCompiler}" ]] || [[ "${gpuCompiler,,}" == "cuda" ]]; then + if [[ -z "$CUDA_HOME" ]]; then + # Check if CUDA_HOME has not been set from the outside, usefull in CI/CD + export CUDA_HOME="`which nvcc 2>/dev/null`" && while [ -L "$compiler" ]; do compiler=`readlink "$compiler"`; done && echo "$$compiler" + fi + # Sets CUDA in PATH + export PATH=$CUDA_HOME:$PATH + +# HIP +else if [[ "${gpuCompiler,,}" == "hip" ]] + if [[ -z "$HIP_HOME" ]]; then + # Check if HIP_HOME has not been set from the outside, usefull in CI/CD + export HIP_HOME="`which hipcc 2>/dev/null`" && while [ -L "$compiler" ]; do compiler=`readlink "$compiler"`; done && echo "$$compiler" + fi + # Sets HIP to PATH + export PATH=$HIP_HOME:$PATH +fi + +# Prefix for saving the JSON files in workspace folder in the tools/profiling directory +prefix=$(dirname "$0") export USEBUILDDIR=1 export NTPBMAX=1024 -#export CXX=/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/bin/g++ export CXX=`which g++` +export FC=`which gfortran` + export MG_EXE="./gcheck.exe" #GPU #export MG_EXE="./check.exe" #CPU -export CUDA_HOME=/usr/local/cuda-12.0/ -export FC=`which gfortran` + export WORKSPACE=$prefix/workspace_mg4gpu REPORT_FOLDER="${WORKSPACE}/$(date +"%y-%m-%d")_${CUDA_NAME_PREFIX}_${branch}" -# Sets CUDA in PATH -export PATH=$CUDA_HOME:$PATH - mkdir $WORKSPACE 2>/dev/null; true mkdir $REPORT_FOLDER 2>/dev/null; true -# Finds correct subprocess -case $MG_PROC in - ee_mumu ) export MG_SUBPROC="P1_Sigma_sm_epem_mupmum" ;; - ee_mumu.sa ) export MG_SUBPROC="P1_Sigma_sm_epem_mupmum" ;; - gg_tt ) export MG_SUBPROC="P1_Sigma_sm_gg_ttx" ;; - gg_tt.sa ) export MG_SUBPROC="P1_Sigma_sm_gg_ttx" ;; - gg_ttg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxg" ;; - gg_ttg.sa ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxg" ;; - gg_ttgg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxgg" ;; - gg_ttgg.sa ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxgg" ;; - gg_ttggg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxggg" ;; - gg_ttggg.sa ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxggg" ;; -esac - export MG_PROC_DIR=$prefix/../../epochX/cudacpp/$MG_PROC -export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/$MG_SUBPROC +export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/P1_Sigma_sm_* export MG5AMC_CARD_PATH=$MG_PROC_DIR/Cards # Build executable diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index 4475fb4eee..d9065666a6 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -108,20 +108,6 @@ export PATH=$CUDA_HOME:$PATH # Branch should be enviroment variable in main script and then passed down if none then it is not displayed in prefix REPORT_FOLDER="${WORKSPACE}/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX}_${branch}" -# Finds correct subprocess -case $MG_PROC in - ee_mumu ) export 
MG_SUBPROC="P1_Sigma_sm_epem_mupmum" ;;
-    ee_mumu.sa ) export MG_SUBPROC="P1_Sigma_sm_epem_mupmum" ;;
-    gg_tt ) export MG_SUBPROC="P1_Sigma_sm_gg_ttx" ;;
-    gg_tt.sa ) export MG_SUBPROC="P1_Sigma_sm_gg_ttx" ;;
-    gg_ttg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxg" ;;
-    gg_ttg.sa ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxg" ;;
-    gg_ttgg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxgg" ;;
-    gg_ttgg.sa ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxgg" ;;
-    gg_ttggg ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxggg" ;;
-    gg_ttggg.sa ) export MG_SUBPROC="P1_Sigma_sm_gg_ttxggg" ;;
-esac
-
 mkdir -p $WORKSPACE/mg4gpu/lib 2>/dev/null; true
 mkdir -p $WORKSPACE/mg4gpu/bin 2>/dev/null; true
 mkdir $REPORT_FOLDER 2>/dev/null; true
@@ -130,7 +116,7 @@ export MG4GPU_LIB=$WORKSPACE/mg4gpu/lib
 export MG4GPU_BIN=$WORKSPACE/mg4gpu/bin

 export MG_PROC_DIR=$prefix/../../epochX/sycl/$MG_PROC
-export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/$MG_SUBPROC
+export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/P1_Sigma_sm_*

 export MG_LIBS_DIR="${MG4GPU_LIB}/build_${MG_PROC}_${SYCL_NAME_PREFIX}"

From 7fd210f45924571788c5788f2aaa52486bd6b892 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Tue, 4 Jul 2023 15:16:35 +0200
Subject: [PATCH 352/509] Fixed syntax error

---
 tools/profiling/buildCUDAProcess.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index 8cc3fad6d4..e3ee21fdf4 100755
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -66,7 +66,7 @@ if [[ -z "${gpuCompiler}" ]] || [[ "${gpuCompiler,,}" == "cuda" ]]; then
     export PATH=$CUDA_HOME:$PATH

 # HIP
-else if [[ "${gpuCompiler,,}" == "hip" ]]
+elif [[ "${gpuCompiler,,}" == "hip" ]]
     if [[ -z "$HIP_HOME" ]]; then
         # Check if HIP_HOME has not been set from the outside, usefull in CI/CD
         export HIP_HOME="`which hipcc 2>/dev/null`" && while [ -L "$compiler" ]; do compiler=`readlink "$compiler"`; done && echo "$$compiler"

From bea14a4bd58febeb4d84d5254d1b5c030ba50de9 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Tue, 4 Jul 2023 15:17:55 +0200
Subject: [PATCH 353/509] Fixed syntax error 2

---
 tools/profiling/buildCUDAProcess.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index e3ee21fdf4..2bac474f10 100755
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -66,7 +66,7 @@ if [[ -z "${gpuCompiler}" ]] || [[ "${gpuCompiler,,}" == "cuda" ]]; then
     export PATH=$CUDA_HOME:$PATH

 # HIP
-elif [[ "${gpuCompiler,,}" == "hip" ]]
+elif [[ "${gpuCompiler,,}" == "hip" ]]; then
     if [[ -z "$HIP_HOME" ]]; then
         # Check if HIP_HOME has not been set from the outside, usefull in CI/CD
         export HIP_HOME="`which hipcc 2>/dev/null`" && while [ -L "$compiler" ]; do compiler=`readlink "$compiler"`; done && echo "$$compiler"

From bfd753454dad85d1d59113603e0034434c659a31 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Tue, 4 Jul 2023 15:26:07 +0200
Subject: [PATCH 354/509] Fixed path to actual process being compiled

---
 tools/profiling/buildCUDAProcess.sh | 4 ++--
 tools/profiling/buildSYCLProcess.sh | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index 2bac474f10..9548537454 100755
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -76,7 +76,7 @@ elif [[ "${gpuCompiler,,}" == "hip" ]]; then
 fi

 # Prefix for saving the JSON files in workspace folder in the tools/profiling 
directory -prefix=$(dirname "$0") +prefix="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" export USEBUILDDIR=1 export NTPBMAX=1024 @@ -94,7 +94,7 @@ mkdir $WORKSPACE 2>/dev/null; true mkdir $REPORT_FOLDER 2>/dev/null; true export MG_PROC_DIR=$prefix/../../epochX/cudacpp/$MG_PROC -export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/P1_Sigma_sm_* +export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/P1_* export MG5AMC_CARD_PATH=$MG_PROC_DIR/Cards # Build executable diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh index d9065666a6..5afafb9e0c 100755 --- a/tools/profiling/buildSYCLProcess.sh +++ b/tools/profiling/buildSYCLProcess.sh @@ -116,7 +116,7 @@ export MG4GPU_LIB=$WORKSPACE/mg4gpu/lib export MG4GPU_BIN=$WORKSPACE/mg4gpu/bin export MG_PROC_DIR=$prefix/../../epochX/sycl/$MG_PROC -export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/P1_Sigma_sm_* +export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/P1_* export MG_LIBS_DIR="${MG4GPU_LIB}/build_${MG_PROC}_${SYCL_NAME_PREFIX}" From da61f5adef979381907b63ff02b180c7e9d8c506 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Tue, 4 Jul 2023 15:42:22 +0200 Subject: [PATCH 355/509] Fix to getting CUDA_HOME --- tools/profiling/buildCUDAProcess.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh index 9548537454..9f738dcc8c 100755 --- a/tools/profiling/buildCUDAProcess.sh +++ b/tools/profiling/buildCUDAProcess.sh @@ -60,7 +60,9 @@ fi if [[ -z "${gpuCompiler}" ]] || [[ "${gpuCompiler,,}" == "cuda" ]]; then if [[ -z "$CUDA_HOME" ]]; then # Check if CUDA_HOME has not been set from the outside, usefull in CI/CD - export CUDA_HOME="`which nvcc 2>/dev/null`" && while [ -L "$compiler" ]; do compiler=`readlink "$compiler"`; done && echo "$$compiler" + COMPILER_PATH="`which nvcc 2>/dev/null`" && while [ -L "$compiler" ]; do compiler=`readlink "$compiler"`; done && echo "$$compiler" + export CUDA_HOME=$(dirname $(dirname $COMPILER_PATH)) + echo CUDA_HOME fi # Sets CUDA in PATH export PATH=$CUDA_HOME:$PATH From c957d69952e6f17a0848b89cfb52d62b3be3b58d Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 5 Jul 2023 15:49:42 +0200 Subject: [PATCH 356/509] Did some refactoring of CUDA/HIP build script --- tools/profiling/buildCUDAProcess.sh | 31 ++++++++++++++--------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh index 9f738dcc8c..b7e7947446 100755 --- a/tools/profiling/buildCUDAProcess.sh +++ b/tools/profiling/buildCUDAProcess.sh @@ -24,7 +24,6 @@ helpFunction() echo -e "\t-t Threads per block" echo -e "\t-i Iterations" echo -e "\t-r Branch" - echo -e "\t-c CUDA or HIP compilation" echo -e "\t-m Makefile arguments" exit 1 # Exit script after printing help } @@ -37,7 +36,6 @@ do t ) threadsPerBlock="$OPTARG" ;; i ) iterations="$OPTARG" ;; r ) branch="$OPTARG" ;; - a ) gpuCompiler="$OPTARG" ;; m ) makeArgs="$OPTARG" ;; ? 
) helpFunction ;; # Print helpFunction in case parameter is non-existent esac @@ -57,24 +55,25 @@ fi # Set variables for later use # CUDA -if [[ -z "${gpuCompiler}" ]] || [[ "${gpuCompiler,,}" == "cuda" ]]; then - if [[ -z "$CUDA_HOME" ]]; then - # Check if CUDA_HOME has not been set from the outside, usefull in CI/CD - COMPILER_PATH="`which nvcc 2>/dev/null`" && while [ -L "$compiler" ]; do compiler=`readlink "$compiler"`; done && echo "$$compiler" - export CUDA_HOME=$(dirname $(dirname $COMPILER_PATH)) - echo CUDA_HOME +# Check if CUDA_HOME has not been set from the outside, usefull in CI/CD +if [[ -z "$CUDA_HOME" ]]; then + export COMPILER_PATH="`which nvcc 2>/dev/null`" && while [ -L "$compiler" ]; do compiler=`readlink "$compiler"`; done && echo "$$compiler" + + if [[ "$COMPILER_PATH" ]]; then + export CUDA_HOME=$(dirname $(dirname $COMPILER_PATH)) + export PATH=$CUDA_HOME${PATH:+:${PATH}} fi - # Sets CUDA in PATH - export PATH=$CUDA_HOME:$PATH +fi # HIP -elif [[ "${gpuCompiler,,}" == "hip" ]]; then - if [[ -z "$HIP_HOME" ]]; then - # Check if HIP_HOME has not been set from the outside, usefull in CI/CD - export HIP_HOME="`which hipcc 2>/dev/null`" && while [ -L "$compiler" ]; do compiler=`readlink "$compiler"`; done && echo "$$compiler" +# Check if HIP_HOME has not been set from the outside, usefull in CI/CD +if [[ -z "$HIP_HOME" ]]; then + export COMPILER_PATH="`which hipcc 2>/dev/null`" && while [ -L "$compiler" ]; do compiler=`readlink "$compiler"`; done && echo "$$compiler" + + if [[ "$COMPILER_PATH" ]]; then + export HIP_HOME=$(dirname $(dirname $COMPILER_PATH)) + export PATH=$HIP_HOME${PATH:+:${PATH}} fi - # Sets HIP to PATH - export PATH=$HIP_HOME:$PATH fi # Prefix for saving the JSON files in workspace folder in the tools/profiling directory From ce4e87df3f026f3c73a479e283410dcc8f7c86be Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 6 Jul 2023 11:14:49 +0200 Subject: [PATCH 357/509] Last changes for profiling infrastructure for MI250X --- .github/workflows/mi250x_profiler.yml | 22 ++++++++++++++++++++++ tools/profiling/container-README.md | 7 +++++++ tools/profiling/evaluation.py | 4 ++-- tools/profiling/performanceProfiler.py | 5 +++-- tools/profiling/sendData.py | 2 +- 5 files changed, 35 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/mi250x_profiler.yml create mode 100644 tools/profiling/container-README.md diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml new file mode 100644 index 0000000000..ce38c8ec0e --- /dev/null +++ b/.github/workflows/mi250x_profiler.yml @@ -0,0 +1,22 @@ +name: MI250X Performance Profiler + +on: + schedule: + - cron: '00 00 * * *' + +jobs: + HIP_MI250X_Profiling: + name: HIP MI250X Profiling + env: + CUDA_NAME_PREFIX: sycl_AMD-Epyc-7A53_MI250X_gcc-11.3_cuda-12.0.1 + ENABLE_CI_PROFILER: 1 + + MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} + runs-on: [self-hosted, linux, mi250x] + steps: + - uses: actions/checkout@v2 + - name: Runs SYCL performanceProfiler.py script + run: cd tools/profiling/; + python3 performanceProfiler.py -l 'HIP' -b 'master' + - name: Uploads HIP JSON files to DB + run: cd tools/profiling/; python3 sendData.py --absLayer HIP --profiler 1 --branch master \ No newline at end of file diff --git a/tools/profiling/container-README.md b/tools/profiling/container-README.md new file mode 100644 index 0000000000..782186d0e9 --- /dev/null +++ b/tools/profiling/container-README.md @@ -0,0 +1,7 @@ +podman build --tag github_runner . 
&& \ +podman run --security-opt=label=disable -d=true \ +--env GITHUB_TOKEN=AFPDS6753IL4TZY3PPHNNZLEUWJHA \ +--env REPO_URL=https://github.com/Jooorgen/madgraph4gpu \ +--env GITHUB_RUNNER_TAGS=Linux,x64,a100 \ +--env RUNNER_NAME=GPURunner_itscrd-a100 \ +--name github_runner github_runner \ No newline at end of file diff --git a/tools/profiling/evaluation.py b/tools/profiling/evaluation.py index 99bcddb4ae..a1fa06490b 100755 --- a/tools/profiling/evaluation.py +++ b/tools/profiling/evaluation.py @@ -46,9 +46,9 @@ compare = True -processToCompare = 'gg_tt' +processToCompare = 'gg_ttgg.mad' -graphsToCompare = ['test_A100_SYCL_' + processToCompare , 'test_A100_CUDA_' + processToCompare] +graphsToCompare = ['test_' + processToCompare , 'test_A100_CUDA_' + processToCompare] stat = 'MECalcOnly' #stat = 'MatrixElems' diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py index 2482dda69b..8ee234309c 100644 --- a/tools/profiling/performanceProfiler.py +++ b/tools/profiling/performanceProfiler.py @@ -24,7 +24,7 @@ ABS_LAYER = "SYCL" BRANCH = "master" -# Physics processes +# Physics processes defaults MG_PROCESSES_SA = ["ee_mumu.sa", "gg_tt.sa", "gg_ttg.sa", "gg_ttgg.sa", "gg_ttggg.sa"] DOUBLE_PRECISION_CONSTANT = 2560 @@ -37,6 +37,7 @@ parser = argparse.ArgumentParser(description='A program for profiling GPUs using MadGraph.') parser.add_argument("-l", help="Choose which abstraction layer you want to use (CUDA/SYCL).", default=ABS_LAYER) +parser.add_argument("-p", help="Choose which processes you want to profile.", default=MG_PROCESSES_SA, nargs='+') parser.add_argument("-b", help="Choose which branch the madgraph4gpu repo is in.", default=BRANCH) pyArgs = parser.parse_args() @@ -44,7 +45,7 @@ # How many runs in total the program made count = 0 -for process in MG_PROCESSES_SA: +for process in args.p: for TPB in THREADS_PER_BLOCK: for BPG in BLOCKS_PER_GRID: if TPB * BPG > DOUBLE_PRECISION_CONSTANT: diff --git a/tools/profiling/sendData.py b/tools/profiling/sendData.py index 399f84318b..207d7d2e7f 100644 --- a/tools/profiling/sendData.py +++ b/tools/profiling/sendData.py @@ -31,7 +31,7 @@ secret = os.environ.get('MADGRAPH4GPU_DB_SECRET') AUTH = ['db_user', secret] PHYS_PROCESSES = ['ee_mumu', 'gg_ttggg', 'gg_ttgg', 'gg_ttg', 'gg_tt'] -ABS_LAYERS = ['SYCL', 'CUDA'] +ABS_LAYERS = ['SYCL', 'CUDA', 'HIP'] BRANCH = 'master' FIELDS = ['EvtsPerSec[MatrixElems] (3)', 'EvtsPerSec[MECalcOnly] (3)'] From eda96f84c793ad3a51173a25205b2f16c9c94bcd Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 12 Jul 2023 15:50:02 +0200 Subject: [PATCH 358/509] Readded the MI250X performance profiler workflow container setup --- .github/workflows/mi250x_profiler.yml | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index ce38c8ec0e..3841cb5c26 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -5,12 +5,34 @@ on: - cron: '00 00 * * *' jobs: + Container Setup: + runs-on: [self-hosted, linux] + steps: + - name: Generate runner token + id: generate_token + run: | + TOKEN=$(curl -XPOST -fsSL -H "Authorization: token ${{ secrets.PAT }}" -H "Accept: application/vnd.github.v3+json" "https://api.github.com/repos/${{ github.repository }}/actions/runners/registration-token" | jq -r '.token') + echo "::set-output name=token::$TOKEN" + - name: SSH and run Docker container + run: | + echo "${{ secrets.SSH_KEY }}" > id_rsa + chmod 600 id_rsa 
+ ssh -o StrictHostKeyChecking=no -i id_rsa ${{ secrets.MI250X_PROFILING_HOST }} "\ + singularity pull ghcr.io/${{ github.repository_owner }}/github_runner:latest && \ + singularity run -d --name my_container --rm \ + -e GITHUB_TOKEN=${{ steps.generate_token.outputs.token }} \ + -e REPO_URL=https://github.com/${{ github.repository }} \ + -e RUNNER_NAME=github_runner \ + -e GITHUB_RUNNER_TAGS='Linux,x64,mi250x' \ + -e RUNNER_URL=https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz \ + ghcr.io/${{ github.repository_owner }}/github_runner:latest" + HIP_MI250X_Profiling: + needs: setup name: HIP MI250X Profiling env: CUDA_NAME_PREFIX: sycl_AMD-Epyc-7A53_MI250X_gcc-11.3_cuda-12.0.1 ENABLE_CI_PROFILER: 1 - MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} runs-on: [self-hosted, linux, mi250x] steps: From 1e4eba8a18d9eb8ee491b340a667dfccc7f61000 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 13 Jul 2023 10:05:19 +0200 Subject: [PATCH 359/509] Fixed the indentation in the MI250X workflow --- .github/workflows/mi250x_profiler.yml | 43 ++++++++++++++------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index 3841cb5c26..e69f810c4f 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -6,29 +6,30 @@ on: jobs: Container Setup: - runs-on: [self-hosted, linux] - steps: - - name: Generate runner token - id: generate_token - run: | - TOKEN=$(curl -XPOST -fsSL -H "Authorization: token ${{ secrets.PAT }}" -H "Accept: application/vnd.github.v3+json" "https://api.github.com/repos/${{ github.repository }}/actions/runners/registration-token" | jq -r '.token') - echo "::set-output name=token::$TOKEN" - - name: SSH and run Docker container - run: | - echo "${{ secrets.SSH_KEY }}" > id_rsa - chmod 600 id_rsa - ssh -o StrictHostKeyChecking=no -i id_rsa ${{ secrets.MI250X_PROFILING_HOST }} "\ - singularity pull ghcr.io/${{ github.repository_owner }}/github_runner:latest && \ - singularity run -d --name my_container --rm \ - -e GITHUB_TOKEN=${{ steps.generate_token.outputs.token }} \ - -e REPO_URL=https://github.com/${{ github.repository }} \ - -e RUNNER_NAME=github_runner \ - -e GITHUB_RUNNER_TAGS='Linux,x64,mi250x' \ - -e RUNNER_URL=https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz \ - ghcr.io/${{ github.repository_owner }}/github_runner:latest" + runs-on: [self-hosted, linux] + name: Container Setup + steps: + - name: Generate runner token + id: generate_token + run: | + TOKEN=$(curl -XPOST -fsSL -H "Authorization: token ${{ secrets.PAT }}" -H "Accept: application/vnd.github.v3+json" "https://api.github.com/repos/${{ github.repository }}/actions/runners/registration-token" | jq -r '.token') + echo "::set-output name=token::$TOKEN" + - name: SSH and run Docker container + run: | + echo "${{ secrets.SSH_KEY }}" > id_rsa + chmod 600 id_rsa + ssh -o StrictHostKeyChecking=no -i id_rsa ${{ secrets.MI250X_PROFILING_HOST }} "\ + singularity pull ghcr.io/${{ github.repository_owner }}/github_runner:latest && \ + singularity run -d --name my_container --rm \ + -e GITHUB_TOKEN=${{ steps.generate_token.outputs.token }} \ + -e REPO_URL=https://github.com/${{ github.repository }} \ + -e RUNNER_NAME=github_runner \ + -e GITHUB_RUNNER_TAGS='Linux,x64,mi250x' \ + -e RUNNER_URL=https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz 
\ + ghcr.io/${{ github.repository_owner }}/github_runner:latest" HIP_MI250X_Profiling: - needs: setup + needs: Container Setup name: HIP MI250X Profiling env: CUDA_NAME_PREFIX: sycl_AMD-Epyc-7A53_MI250X_gcc-11.3_cuda-12.0.1 From b7252ab6f5930f3b14f958dc888458bc9b72b09b Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 13 Jul 2023 13:34:55 +0200 Subject: [PATCH 360/509] Added first file of implementation of rocRand --- .../SubProcesses/RocrandRandomNumberKernel.cc | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 epochX/cudacpp/gg_ttgg.mad/SubProcesses/RocrandRandomNumberKernel.cc diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RocrandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RocrandRandomNumberKernel.cc new file mode 100644 index 0000000000..fb5a8ef1c8 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RocrandRandomNumberKernel.cc @@ -0,0 +1,131 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. + +#include "GpuRuntime.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include + +#ifndef MGONGPU_HAS_NO_ROCRAND /* clang-format off */ +#include "rocrand.h" +#define checkRocRand( code ){ assertRocRand( code, __FILE__, __LINE__ ); } +inline void assertRocRand( rocrandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != ROCRAND_STATUS_SUCCESS ) + { + printf( "RocRandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == ROCRAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef MGONGPUCPP_HIPCC +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- +#ifndef MGONGPU_HAS_NO_ROCRAND + RocRandRandomNumberKernel::RocRandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef MGONGPUCPP_HIPCC + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "RocRandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "RocRandRandomNumberKernel does not support RocRandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "RocRandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + RocRandRandomNumberKernel::~RocRandRandomNumberKernel() + { + destroyGenerator(); + } + + //-------------------------------------------------------------------------- + + void RocRandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // workaround for #429 + } + //printf( "seedGenerator: seed %d\n", seed ); + checkRocRand( rocrand_set_seed( m_rnGen, seed ) ); + } + + //-------------------------------------------------------------------------- + + void RocRandRandomNumberKernel::createGenerator() + { + // [NB Timings are for GenRnGen host|device (cpp|cuda) generation of 256*32*1 events with nproc=1: rn(0) is host=0.0012s] + const rocrand_rng_type type = ROCRAND_RNG_PSEUDO_MTGP32; // 0.00082s | 0.00064s (FOR FAST TESTS) NO TESTS DONE WITH 
ROCRAND, BUT THIS IS FASTEST IN CUDA + //const rocrand_rng_type type = ROCRAND_RNG_PSEUDO_XORWOW; + //const rocrand_rng_type type = ROCRAND_RNG_PSEUDO_MRG32K3A; + //const rocrand_rng_type type = ROCRAND_RNG_PSEUDO_MT19937; + //const rocrand_rng_type type = ROCRAND_RNG_PSEUDO_PHILOX4_32_10; + if( m_isOnDevice ) + { + checkRocRand( rocrand_create_generator( &m_rnGen, type ) ); + } + else + { + checkRocRand( rocrand_create_generator_host( &m_rnGen, type ) ); + } + // No RocRAND equivalent for curandSetGeneratorOrdering + } + + //-------------------------------------------------------------------------- + + void RocRandRandomNumberKernel::destroyGenerator() + { + checkRocRand( rocrand_destroy_generator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void RocRandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkRocRand( rocrand_generate_uniform_double( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkRocRand( rocrand_generate_uniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nRocRandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkRoc( rocMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), rocMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} \ No newline at end of file From 63cf27cd0d69fa5816440c12b818d029d7d3f9f4 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 13 Jul 2023 14:06:28 +0200 Subject: [PATCH 361/509] Fix to the warnings in HIP compilation --- .../gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index e793d6a379..c2e08cdce0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -208,7 +208,7 @@ namespace mg5amcGpu #else gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - gpuPeekAtLastError(); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -230,8 +230,8 @@ namespace mg5amcGpu #else gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - gpuPeekAtLastError(); - gpuDeviceSynchronize(); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- From 6741186a50be8c16a09630a959a6327d2b4a7a8a Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 13 Jul 2023 15:15:41 +0200 Subject: [PATCH 362/509] [CODEGEN] Added GPU abstraction to CODEGEN --- .../iolibs/template_files/gpu/Bridge.h | 30 +- .../template_files/gpu/BridgeKernels.cc | 7 +- .../iolibs/template_files/gpu/BridgeKernels.h | 6 +- .../gpu/CommonRandomNumberKernel.cc | 3 +- .../template_files/gpu/CrossSectionKernels.cc | 5 +- .../template_files/gpu/CrossSectionKernels.h | 4 +- .../iolibs/template_files/gpu/CudaRuntime.h | 4 +- .../gpu/CurandRandomNumberKernel.cc | 10 +- .../template_files/gpu/EventStatistics.h | 2 +- .../template_files/gpu/GpuAbstraction.h | 79 +++++ .../iolibs/template_files/gpu/GpuRuntime.h | 80 +++++ .../iolibs/template_files/gpu/MadgraphTest.h | 6 +- .../gpu/MatrixElementKernels.cc | 24 +- .../template_files/gpu/MatrixElementKernels.h | 6 +- .../template_files/gpu/MemoryAccessHelpers.h | 2 +- .../template_files/gpu/MemoryAccessMomenta.h | 4 +- .../gpu/MemoryAccessRandomNumbers.h | 2 +- .../template_files/gpu/MemoryAccessVectors.h | 2 +- .../iolibs/template_files/gpu/MemoryBuffers.h | 61 ++-- .../gpu/RamboSamplingKernels.cc | 18 +- .../template_files/gpu/RamboSamplingKernels.h | 4 +- .../template_files/gpu/RandomNumberKernels.h | 4 +- .../iolibs/template_files/gpu/check_sa.cc | 74 ++-- .../template_files/gpu/cpp_hel_amps_h.inc | 2 +- .../iolibs/template_files/gpu/cudacpp.mk | 319 +++++++++++------- .../iolibs/template_files/gpu/fbridge.cc | 10 +- .../iolibs/template_files/gpu/fsampler.cc | 6 +- .../iolibs/template_files/gpu/mgOnGpuConfig.h | 25 +- .../template_files/gpu/mgOnGpuCxtypes.h | 16 +- .../template_files/gpu/mgOnGpuFptypes.h | 8 +- .../template_files/gpu/mgOnGpuVectors.h | 16 +- .../iolibs/template_files/gpu/process_cc.inc | 2 +- .../gpu/process_function_definitions.inc | 32 +- .../iolibs/template_files/gpu/process_h.inc | 8 +- .../template_files/gpu/process_matrix.inc | 8 +- .../gpu/process_sigmaKin_function.inc | 9 +- .../iolibs/template_files/gpu/rambo.h | 6 +- .../iolibs/template_files/gpu/runTest.cc | 10 +- .../iolibs/template_files/gpu/testmisc.cc | 2 +- .../iolibs/template_files/gpu/testxxx.cc | 6 +- .../PLUGIN/CUDACPP_SA_OUTPUT/output.py | 6 +- 41 files changed, 589 insertions(+), 339 deletions(-) create mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h create mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h index 4cafe0c997..08c7493713 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef 
MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc index cef4cb3c71..f844178cbb 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc @@ -5,6 +5,7 @@ #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -14,7 +15,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h index 15eb4bff4d..7c7feb692a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc index 985b39f576..f17b9c0ad7 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc @@ -4,12 +4,13 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc index 0b355a3c8d..36ca2a94d4 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc @@ -5,6 +5,7 @@ #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h index 7933ca4bbf..ff2350a14d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h @@ -13,7 +13,7 @@ 
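
The hunks above and below apply one mechanical change over and over: the compile-time guard that selects between the mg5amcGpu and mg5amcCpu namespaces is switched from the CUDA-specific __CUDACC__ macro to the backend-neutral MGONGPUCPP_GPUIMPL, which is meant to be set whenever a GPU implementation (CUDA or HIP) is being compiled. A minimal standalone sketch of the pattern, as shown below (backendName is a hypothetical function added here only for illustration):

    #include <cstdio>

    #ifdef MGONGPUCPP_GPUIMPL
    namespace mg5amcGpu
    #else
    namespace mg5amcCpu
    #endif
    {
      inline const char* backendName()
      {
    #ifdef MGONGPUCPP_GPUIMPL
        return "GPU (CUDA or HIP)"; // compiled by nvcc or hipcc
    #else
        return "CPU"; // compiled by a plain C++ compiler
    #endif
      }
    }

    int main()
    {
    #ifdef MGONGPUCPP_GPUIMPL
      printf( "%s\n", mg5amcGpu::backendName() );
    #else
      printf( "%s\n", mg5amcCpu::backendName() );
    #endif
      return 0;
    }

The same translation unit thus lands in one namespace or the other, which is what lets the CPU-only and GPU objects be linked into a single test executable without multiply defined symbols.
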
//============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h index 64ce52f4b3..df0c3f3df8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h @@ -15,7 +15,7 @@ //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ #define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) { @@ -29,7 +29,7 @@ inline void assertCuda( cudaError_t code, const char* file, int line, bool abort //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { // Instantiate a CudaRuntime at the beginnining of the application's main to diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc index eb56333b03..5b33207ad0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc @@ -3,7 +3,7 @@ // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
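
The checkCuda/assertCuda pair in the CudaRuntime.h hunk above is the error-handling idiom used throughout these files: every runtime call is wrapped so that a non-success status prints the failing file and line before aborting. A self-contained usage sketch, assuming only the CUDA toolkit (the buffer size is arbitrary):

    #include <cassert>
    #include <cstdio>
    #include <cuda_runtime.h>

    #define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
    inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )
    {
      if( code != cudaSuccess )
      {
        printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
        if( abort ) assert( code == cudaSuccess );
      }
    }

    int main()
    {
      double* d = nullptr;
      checkCuda( cudaSetDevice( 0 ) );                       // book-keep initialization cost explicitly
      checkCuda( cudaMalloc( &d, 1024 * sizeof( double ) ) ); // any failure aborts with file:line
      checkCuda( cudaFree( d ) );
      return 0;
    }

The CurandRandomNumberKernel.cc hunk that continues below then swaps the same guard in the generated-code template, mirroring the change already made to the hand-written file earlier in the series.
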
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_CUDACC namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_CUDACC if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h index 48b51e0a49..e7d7f3b3c3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h new file mode 100644 index 0000000000..98a0124b55 --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -0,0 +1,79 @@ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +#ifdef MGONGPUCPP_GPUIMPL + #define MGONGPUCPP_CUDACC 1 +#endif + +#ifdef __HIPCC__ + #include "hip/hip_runtime.h" + #define MGONGPUCPP_HIPCC 1 +#endif + +#ifdef MGONGPUCPP_CUDACC + + // Defines correct compiler + #define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL + + //-------------------------------------------------------------------------- + + #define gpuError_t cudaError_t + #define gpuPeekAtLastError cudaPeekAtLastError + #define gpuGetErrorString cudaGetErrorString + #define gpuSuccess cudaSuccess + + #define gpuMallocHost(ptr, size) checkGpu( cudaMallocHost(ptr, size) ) + #define gpuMalloc(ptr, size) checkGpu( cudaMalloc(ptr, size) ) + + #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( cudaMemcpy(dstData, srcData, srcBytes, func) ) + #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice + #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost + #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( cudaMemcpyToSymbol(type1, type2, size) ) + + #define gpuFree(ptr) checkGpu( cudaFree(ptr) ) + #define gpuFreeHost(ptr) checkGpu( cudaFreeHost(ptr) ) + + #define gpuSetDevice cudaSetDevice + #define gpuDeviceSynchronize cudaDeviceSynchronize + #define gpuDeviceReset cudaDeviceReset + + #define gpuLaunchKernel( kernel, blocks, threads, ...) 
kernel<<<blocks, threads>>> (__VA_ARGS__)
+  #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__)
+
+//--------------------------------------------------------------------------
+
+#elif defined MGONGPUCPP_HIPCC
+
+  // Defines correct compiler
+  #define MGONGPUCPP_GPUIMPL __HCC__
+
+  //--------------------------------------------------------------------------
+
+  #define gpuError_t hipError_t
+  #define gpuPeekAtLastError hipPeekAtLastError
+  #define gpuGetErrorString hipGetErrorString
+  #define gpuSuccess hipSuccess
+
+  #define gpuMallocHost(ptr, size) checkGpu( hipHostMalloc(ptr, size) ) // HostMalloc better
+  #define gpuMalloc(ptr, size) checkGpu( hipMalloc(ptr, size) )
+
+  #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( hipMemcpy(dstData, srcData, srcBytes, func) )
+  #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+  #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+  #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( hipMemcpyToSymbol(type1, type2, size) )
+
+  #define gpuFree(ptr) checkGpu( hipFree(ptr) )
+  #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) )
+
+  #define gpuSetDevice hipSetDevice
+  #define gpuDeviceSynchronize hipDeviceSynchronize
+  #define gpuDeviceReset hipDeviceReset
+
+  #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__)
+  #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__)
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
\ No newline at end of file
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
new file mode 100644
index 0000000000..86c9179f4c
--- /dev/null
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
@@ -0,0 +1,80 @@
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include <cassert>
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != gpuSuccess )
+  {
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+  // Instantiate a GpuRuntime at the beginnining of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! 
*** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu ( gpuSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h index fd7734ce42..5920d08bf7 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
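
GpuRuntime above is an RAII wrapper: constructing one at the top of main() invokes gpuSetDevice(0) so that CUDA/HIP initialization time is book-kept in one place, and the destructor calls gpuDeviceReset(), which leak-checking tools need to see before exit. A hedged sketch of the intended call site (the event-loop work in the middle is elided):

    #include "GpuRuntime.h"

    int main()
    {
    #ifdef MGONGPUCPP_GPUIMPL
      mg5amcGpu::GpuRuntime gpuRuntime; // ctor: setUp -> gpuSetDevice( 0 )
    #endif
      // ... allocate device buffers, launch kernels, copy back results ...
      return 0; // dtor: tearDown -> gpuDeviceReset()
    }
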
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index 30257195b6..dd3eee4ea3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -6,7 +6,7 @@ #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedmem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedmem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h index 23e84757a2..4477a385ed 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h index c82a6c7635..67306c3922 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x 
+ threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h index 0ac4faa3c7..f797f85ca5 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h index e2988d39f3..949a42066d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h index e9b197368e..a9ae26b6dc 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h index 48306a9d41..d6ba45dcad 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h @@ -11,12 +11,11 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" #include "Parameters_%(model_name)s.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +86,7 @@ namespace mg5amcCpu 
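The kernel-launch hunks above and the MemoryBuffers.h hunks below rely on gpu*-prefixed wrappers (gpuMallocHost, gpuMemcpy, gpuLaunchKernel, ...) in place of direct cuda* calls and <<<...>>> chevrons. The abstraction header that defines them is not part of this excerpt; the following is only an illustrative sketch, assuming the wrappers map one-to-one onto the CUDA and HIP runtime APIs and fold in the checkGpu error check:

// Hypothetical GpuAbstraction sketch (assumed for illustration, not the actual header)
#ifdef __CUDACC__
#define gpuMalloc( ptr, size )      checkGpu( cudaMalloc( ptr, size ) )
#define gpuMallocHost( ptr, size )  checkGpu( cudaMallocHost( ptr, size ) )
#define gpuFree( ptr )              checkGpu( cudaFree( ptr ) )
#define gpuFreeHost( ptr )          checkGpu( cudaFreeHost( ptr ) )
#define gpuMemcpy( dst, src, n, k ) checkGpu( cudaMemcpy( dst, src, n, k ) )
#define gpuMemcpyHostToDevice       cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost       cudaMemcpyDeviceToHost
// A plain function-call syntax replaces the CUDA-only <<<...>>> launch chevrons
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#define gpuLaunchKernelSharedmem( kernel, blocks, threads, shmem, ... ) \
  kernel<<<( blocks ), ( threads ), ( shmem )>>>( __VA_ARGS__ )
#else // HIP
#include "hip/hip_runtime.h"
#define gpuMalloc( ptr, size )      checkGpu( hipMalloc( ptr, size ) )
#define gpuMemcpy( dst, src, n, k ) checkGpu( hipMemcpy( dst, src, n, k ) )
#define gpuMemcpyHostToDevice       hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost       hipMemcpyDeviceToHost
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif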
//-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +118,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +127,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +147,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +174,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +190,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +212,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +231,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +256,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +275,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ 
+#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +295,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +314,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +332,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +351,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +369,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +384,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +402,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +420,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +438,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +456,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ 
-475,7 +474,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +486,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +503,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +526,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc index da68aa9255..8745b084d3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc @@ -5,7 +5,7 @@ #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void 
getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h index 184089efd7..fe63a7bb77 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h index 188a72c2c9..0c215f2583 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h @@ -8,7 +8,7 @@ #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index f1e75b9252..d7d40e140c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -63,7 +63,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +77,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ 
(FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -102,7 +102,7 @@ main( int argc, char** argv ) CurandHost = 1, CurandDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand @@ -115,7 +115,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -145,7 +145,7 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rndgen = RandomNumberMode::CurandDevice; #else throw std::runtime_error( "CurandDevice is not supported on CPUs" ); @@ -165,7 +165,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +239,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,7 +263,7 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 00. 
Initialise cuda // Instantiate a CudaRuntime at the beginning of the application's main to @@ -292,7 +292,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +300,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +308,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +316,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +333,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +342,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +351,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +359,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +367,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -394,7 +394,7 @@ main( int argc, char** argv ) const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL else { const bool onDevice = true; @@ -421,7 +421,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +432,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +440,7 @@ main( int argc, char**
argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +482,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +514,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +559,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +588,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -729,7 +729,7 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rndgentxt += " (CUDA code)"; #else rndgentxt += " (C++ code)"; @@ -738,7 +738,7 @@ main( int argc, char** argv ) // Workflow description summary std::string wrkflwtxt; // -- CUDA or C++? -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL wrkflwtxt += "CUD:"; #else wrkflwtxt += "CPP:"; @@ -754,7 +754,7 @@ main( int argc, char** argv ) wrkflwtxt += "???+"; // no path to this statement #endif // -- CUCOMPLEX or THRUST or STD complex numbers? -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; #elif defined MGONGPU_CUCXTYPE_THRUST @@ -789,7 +789,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +845,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -864,7 +864,7 @@ main( int argc, char** argv ) #endif // Dump all configuration parameters and all results std::cout << std::string( SEP79, '*' ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" @@ -892,7 +892,7 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST @@ -906,7 +906,7 @@ main( int argc, char** argv ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +937,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,7 +1033,7 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST @@ -1048,7 +1048,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc index 862f4b1976..6a592e8da8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc @@ -24,7 +24,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index c2937d59d2..ff63aa4a43 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,69 +89,139 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %%bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of NVCC + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
+ MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! + CUOPTFLAGS = -lineinfo + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + + CUDATESTFLAGS = -lcuda + + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + +else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) + #=== Configure the HIP compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable HIP builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning HIP builds are not supported for multi-word CXX "$(CXX)") + override HIP_HOME=disabled + endif + + # If HIP_HOME is not set, try to set it from the location of GPUCC + ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") + endif -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists + ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + GPUCC = $(HIP_HOME)/bin/hipcc + + # Should maybe find something equivalent to this in HIP + #USE_NVTX ?=-DUSE_NVTX + + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + + # -DHIP_FAST_MATH equivalent to -use_fast_math in HIP + # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + + else ifneq ($(origin REQUIRE_HIP),undefined) + # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + else + # No hip.
Switch hip compilation off and go to common random numbers in C++ + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -163,9 +233,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif @@ -175,11 +245,11 @@ endif # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) - CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3%% both for none and sse4 + CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3% both for none and sse4 # Throughput references without the extra flags below: none=1.41-1.42E6, sse4=2.15-2.19E6 ###CXXFLAGS+= -DNO_WARN_X86_INTRINSICS # no change ###CXXFLAGS+= -fpeel-loops # no change - ###CXXFLAGS+= -funroll-loops # gains ~1%% for none, loses ~1%% for sse4 + ###CXXFLAGS+= -funroll-loops # gains ~1% for none, loses ~1% for sse4 ###CXXFLAGS+= -ftree-vectorize # no change ###CXXFLAGS+= -flto # would increase to none=4.08-4.12E6, sse4=4.99-5.03E6! else @@ -189,7 +259,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -199,10 +269,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -253,7 +323,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edge case for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -328,13 +401,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -343,7 +416,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -352,7 +425,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -404,11 +477,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -421,7 +494,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),)
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -452,28 +525,32 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) -$(BUILDDIR)/%%.o : %%.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +ifneq ($(GPUCC),) +$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ -$(BUILDDIR)/%%_cu.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# (the -x cu flag from the old rule above is now carried by CCBUILDRULEFLAGS in the CUDA branch) # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%%.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) +# Added edge case for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -489,10 +566,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -500,8 +577,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -528,7 +605,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -540,17 +617,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- # Target (and build rules): Fortran include files -###$(INCDIR)/%%.inc : ../%%.inc +###$(INCDIR)/%.inc : ../%.inc ### @if [ ! 
-d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### \cp $< $@ @@ -561,27 +638,27 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(cu_main): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%%.o : %%.f *.inc +$(BUILDDIR)/%.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%%.o : %%.f *.inc +###$(BUILDDIR)/%.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ @@ -596,17 +673,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -618,7 +695,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -631,7 +708,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -643,14 +720,14 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(testmain): 
$(BUILDDIR)/runTest_cu.o $(testmain): cu_objects_exe += $(BUILDDIR)/runTest_cu.o @@ -672,14 +749,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -782,9 +859,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -803,7 +880,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck @@ -834,14 +911,14 @@ cmpFcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: cmpFGcheck (compare ME results from the CUDA and Fortran with CUDA MEs standalone executables, with a small number of events) cmpFGcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) memcheck: all.$(TAG) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc index f93c05b0b3..5f91f007ee 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc @@ -5,7 +5,7 @@ #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,7 +46,7 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL CudaRuntime::setUp(); #endif // Create a process object, read parm card and set parameters @@ -69,7 +69,7 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL CudaRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc index 2fb445372d..acffa7c19e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 8b283c1d36..55307d3674 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -6,6 +6,8 @@ #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 +#include "GpuRuntime.h" // Includes the GPU abstraction + // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) %(mgongpu_supports_multichannel)s @@ -15,9 +17,10 @@ // Choose if curand is supported for generating random numbers // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_CUDACC #undef MGONGPU_HAS_NO_CURAND -#else +#elif defined MGONGPUCPP_HIPCC +#define MGONGPU_HAS_NO_CURAND 1 //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 #endif @@ -53,20 +56,20 @@ ////#define MGONGPU_HARDCODE_PARAM 1 // Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif // Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) #endif // Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 #endif @@ -85,14 +88,14 @@ #endif // SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL #endif #endif // SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif @@ -131,7 +134,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -142,7 +145,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA implementation has no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -174,7 +177,7 @@ using mgOnGpu::fptype2; // Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined MGONGPUCPP_GPUIMPL && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -187,7 +190,7 @@ using mgOnGpu::fptype2; #endif /* clang-format on */ // Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h index 0cb2f1db7e..866b7640f6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX 
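The hunks above all apply one mechanical rule: every guard on __CUDACC__ becomes a guard on MGONGPUCPP_GPUIMPL. The reason is that __CUDACC__ is defined only by nvcc, so a HIP build would otherwise fall through to the C++ branch of every conditional, while the plugin-owned macro is meant to be set for any GPU back end. A minimal sketch of the intended selection logic, assuming the MGONGPUCPP_* names used in this patch series (the header body below is illustrative, not the exact GpuAbstraction.h content at this commit):

#ifdef __CUDACC__ // defined by nvcc only
#define MGONGPUCPP_CUDACC 1
#define MGONGPUCPP_GPUIMPL 1
#elif defined __HIPCC__ // defined by hipcc only
#include "hip/hip_runtime.h"
#define MGONGPUCPP_HIPCC 1
#define MGONGPUCPP_GPUIMPL 1
#endif

#ifdef MGONGPUCPP_GPUIMPL // any GPU build (CUDA or HIP)
namespace mg5amcGpu
#else // plain C++ build
namespace mg5amcCpu
#endif
{
  // common physics code, compiled once per back end
}

With this arrangement the same .cc file can be compiled three times, by nvcc, by hipcc, or by a plain C++ compiler, which is exactly what the GPUCC build rules in cudacpp.mk above prepare for.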
//========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h index a1cde16a67..7edefa3389 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h @@ -13,7 +13,7 @@ //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h index 9d3e82b1e3..ee906f450d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h @@ -9,6 +9,8 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuFptypes.h" +#include "GpuAbstraction.h" + #include //========================================================================== @@ -108,7 +110,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -129,7 +131,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -744,11 +746,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -786,12 +788,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? 
a : b ); } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +814,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc index 778e210468..9dceb45708 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -14,7 +14,7 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" %(hel_amps_h)s #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 4ac95a4b34..2a473552fa 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -10,7 +10,9 @@ // Class member functions for calculating the matrix elements for %(process_lines)s -#ifdef __CUDACC__ +#include "GpuRuntime.h" + +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -44,7 +46,7 @@ namespace mg5amcCpu %(cipdhrdcod)s %(cipchrdcod)s #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL %(cipddevice)s %(cipcdevice)s #else @@ -54,7 +56,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -80,8 +82,8 @@ namespace mg5amcCpu // Helicities for the process [NB do keep 'static' for this constexpr array, see issue #283] // *** NB There is no automatic check yet that these are in the same order as Fortran! #569 *** %(all_helicities)s -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -117,7 +119,7 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory %(cipdassign)s %(cipcassign)s -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL %(cipd2tipdSym)s %(cipc2tipcSym)s #else @@ -150,7 +152,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -215,12 +217,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -241,7 +243,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -370,9 +372,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -396,7 +398,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -417,7 +419,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { %(den_factors)s }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc index 893f7f3215..8a9de336f2 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc @@ -23,7 +23,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -32,7 +32,7 @@ namespace mg5amcCpu %(process_class_definitions)s //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -45,7 +45,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( 
const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -75,7 +75,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc index 1e473edcf8..241c50a9d1 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -7,6 +7,8 @@ ! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== +#include "GpuAbstraction.h" + // *** COLOR CHOICE BELOW *** // Store the leading color flows for choice of color if( jamp2_sv ) // disable color choice if nullptr @@ -17,7 +19,7 @@ // (This method used to be called %(process_class_name)s::matrix_%(proc_name)s(%(matrix_args)s)?) %(color_matrix_lines)s -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -74,7 +76,7 @@ #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -133,7 +135,7 @@ MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%%6d ihel=%%2d me_running=%%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index 9fcd58196b..59c1623c5a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -6,9 +6,12 @@ ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. ! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
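The extra ", const int nevt" parameter guarded by #ifndef MGONGPUCPP_GPUIMPL in the sigmaKin and computeDependentCouplings signatures above encodes the single-source kernel pattern used throughout these templates: in a GPU build each thread derives its event index from the grid, while the C++ build receives an explicit event count and loops. A minimal sketch under the same guards (the scaleMEs function, the fptype alias and the factor 4 are illustrative; __global__ really is #defined empty for C++ in the mgOnGpuConfig.h hunk shown earlier):

#ifndef MGONGPUCPP_GPUIMPL
#define __global__ // empty declaration specifier in the C++ build
#endif
typedef double fptype; // illustrative stand-in for the mgOnGpuConfig.h fptype

__global__ void
scaleMEs( fptype* allMEs // in/out: matrix elements[nevt]
#ifndef MGONGPUCPP_GPUIMPL
          , const int nevt // input: #events (on GPU, nevt == gpublocks * gputhreads)
#endif
)
{
#ifdef MGONGPUCPP_GPUIMPL
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
  allMEs[ievt] /= 4; // e.g. divide by a helicity/color denominator
#else
  for( int ievt = 0; ievt < nevt; ievt++ ) // C++: explicit loop over all events
    allMEs[ievt] /= 4;
#endif
}

Omitting nevt from the GPU signature is deliberate: the launch configuration already carries the event count, so Fortran and C++ callers stay identical.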
!========================================================================== + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -36,7 +39,7 @@ // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -240,7 +243,7 @@ // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h index e02ea52496..3a331b979a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. 
) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc index 572e28aaea..05d1a08c2b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc index 989aba1fdc..dcafb44ee6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc index 06ad3a4052..eba58eea70 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc @@ -20,7 +20,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt %% neppM == 0 ); // nevt must be a multiple of neppM assert( nevt %% neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), 
testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index b1b14336cd..8ff13f6967 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -86,9 +86,9 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): 'CMake': [s+'CMake/Compilers.txt', s+'CMake/Platforms.txt', s+'CMake/Macros.txt'], 'src': [s+'gpu/rambo.h', s+'read_slha.h', s+'read_slha.cc', s+'gpu/mgOnGpuFptypes.h', s+'gpu/mgOnGpuCxtypes.h', s+'gpu/mgOnGpuVectors.h', - s+'CMake/src/CMakeLists.txt'], + s+'CMake/src/CMakeLists.txt', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h'], 'SubProcesses': [s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', - s+'gpu/ompnumthreads.h', s+'gpu/CudaRuntime.h', + s+'gpu/ompnumthreads.h', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h', s+'gpu/MemoryAccessHelpers.h', s+'gpu/MemoryAccessVectors.h', s+'gpu/MemoryAccessMatrixElements.h', s+'gpu/MemoryAccessMomenta.h', s+'gpu/MemoryAccessRandomNumbers.h', s+'gpu/MemoryAccessWeights.h', @@ -109,7 +109,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): s+'CMake/SubProcesses/CMakeLists.txt'], 'test': [s+'gpu/cudacpp_test.mk']} to_link_in_P = ['nvtx.h', 'timer.h', 'timermap.h', - 'ompnumthreads.h', 'CudaRuntime.h', + 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', From b5932deb521a79d1c8141dfa9c84db6f1c9086d6 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 13 Jul 2023 15:55:32 +0200 Subject: [PATCH 363/509] [jthip] change % to %% in CODEGEN cudacpp.mk --- .../iolibs/template_files/gpu/cudacpp.mk | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index ff63aa4a43..4cd31ea7c3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -104,7 +104,7 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) # If CUDA_HOME is not set, try to set it from the location of NVCC ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %%bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif @@ -174,7 +174,7 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) # If HIP_HOME is not set, try to set it from the location of GPUCC ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + HIP_HOME = $(patsubst %%bin/hipcc,%%,$(HIP_COMPILER_PATH)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif @@ -245,11 +245,11 @@ endif # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) - CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3% both for none and sse4 + CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3%% both for none and sse4 # Throughput references without the extra flags below: none=1.41-1.42E6, 
sse4=2.15-2.19E6 ###CXXFLAGS+= -DNO_WARN_X86_INTRINSICS # no change ###CXXFLAGS+= -fpeel-loops # no change - ###CXXFLAGS+= -funroll-loops # gains ~1% for none, loses ~1% for sse4 + ###CXXFLAGS+= -funroll-loops # gains ~1%% for none, loses ~1%% for sse4 ###CXXFLAGS+= -ftree-vectorize # no change ###CXXFLAGS+= -flto # would increase to none=4.08-4.12E6, sse4=4.99-5.03E6! else @@ -526,11 +526,11 @@ $(BUILDDIR)/.build.$(TAG): # Generic target and build rules: objects from CUDA compilation ifneq ($(GPUCC),) -$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%%.o : %%.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ -$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%%_cu.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif @@ -538,7 +538,7 @@ endif # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%%.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ @@ -627,7 +627,7 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): Fortran include files -###$(INCDIR)/%.inc : ../%.inc +###$(INCDIR)/%%.inc : ../%%.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### \cp $< $@ @@ -643,7 +643,7 @@ ifneq ($(shell $(CXX) --version | grep ^Intel),) $(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') $(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(cu_main): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o @@ -653,12 +653,12 @@ endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%%.o : %%.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%%.o : %%.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ @@ -727,7 +727,7 @@ ifneq ($(shell $(CXX) --version | grep ^Intel),) $(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') $(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(testmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(testmain): $(BUILDDIR)/runTest_cu.o $(testmain): cu_objects_exe += $(BUILDDIR)/runTest_cu.o @@ -911,14 +911,14 @@ cmpFcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: cmpFGcheck (compare ME results from the CUDA and Fortran with CUDA MEs standalone executables, with a small number of events) cmpFGcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! 
Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) memcheck: all.$(TAG) From cceba282c2a16e29b612a0451d6bd0ae7dd85549 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 13 Jul 2023 16:24:12 +0200 Subject: [PATCH 364/509] [jthip] in ggttgg.mad use the GpuAbstraction.h from codegen (copy as is) --- .../cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h index 782cb96e8c..98a0124b55 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h @@ -3,14 +3,7 @@ #include -/* - ToDo: - * Fix rpath in makefile when compiling with HIP - * Fix warnings with improper hip function return code handling -*/ - - -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define MGONGPUCPP_CUDACC 1 #endif @@ -22,7 +15,7 @@ #ifdef MGONGPUCPP_CUDACC // Defines correct compiler - #define MGONGPUCPP_GPUIMPL __CUDACC__ + #define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL //-------------------------------------------------------------------------- From f6ca346527f7170d90be0e41c363d1aed331a318 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 13 Jul 2023 16:25:15 +0200 Subject: [PATCH 365/509] [jthip] clang-format GpuAbstraction.h both in CODEGEN and in ggttgg.mad --- .../template_files/gpu/GpuAbstraction.h | 86 +++++++++---------- .../gg_ttgg.mad/SubProcesses/GpuAbstraction.h | 86 +++++++++---------- 2 files changed, 86 insertions(+), 86 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 98a0124b55..2f000e33d1 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -4,75 +4,75 @@ #include #ifdef MGONGPUCPP_GPUIMPL - #define MGONGPUCPP_CUDACC 1 +#define MGONGPUCPP_CUDACC 1 #endif #ifdef __HIPCC__ - #include "hip/hip_runtime.h" - #define MGONGPUCPP_HIPCC 1 +#include "hip/hip_runtime.h" +#define MGONGPUCPP_HIPCC 1 #endif #ifdef MGONGPUCPP_CUDACC - // Defines correct compiler - #define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL +// Defines correct compiler +#define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL - //-------------------------------------------------------------------------- +//-------------------------------------------------------------------------- - #define gpuError_t cudaError_t - #define gpuPeekAtLastError cudaPeekAtLastError - #define gpuGetErrorString cudaGetErrorString - #define gpuSuccess cudaSuccess +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess - #define gpuMallocHost(ptr, size) checkGpu( cudaMallocHost(ptr, size) ) - #define gpuMalloc(ptr, size) 
checkGpu( cudaMalloc(ptr, size) ) +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) - #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( cudaMemcpy(dstData, srcData, srcBytes, func) ) - #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice - #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost - #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( cudaMemcpyToSymbol(type1, type2, size) ) +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) - #define gpuFree(ptr) checkGpu( cudaFree(ptr) ) - #define gpuFreeHost(ptr) checkGpu( cudaFreeHost(ptr) ) +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) - #define gpuSetDevice cudaSetDevice - #define gpuDeviceSynchronize cudaDeviceSynchronize - #define gpuDeviceReset cudaDeviceReset +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset - #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<>> (__VA_ARGS__) - #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<>>(__VA_ARGS__) +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) //-------------------------------------------------------------------------- #elif defined MGONGPUCPP_HIPCC - // Defines correct compiler - #define MGONGPUCPP_GPUIMPL __HCC__ +// Defines correct compiler +#define MGONGPUCPP_GPUIMPL __HCC__ - //-------------------------------------------------------------------------- +//-------------------------------------------------------------------------- - #define gpuError_t hipError_t - #define gpuPeekAtLastError hipPeekAtLastError - #define gpuGetErrorString hipGetErrorString - #define gpuSuccess hipSuccess +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess - #define gpuMallocHost(ptr, size) checkGpu( hipHostMalloc(ptr, size) ) // HostMalloc better - #define gpuMalloc(ptr, size) checkGpu( hipMalloc(ptr, size) ) +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) - #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( hipMemcpy(dstData, srcData, srcBytes, func) ) - #define gpuMemcpyHostToDevice hipMemcpyHostToDevice - #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost - #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( hipMemcpyToSymbol(type1, type2, size) ) +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) - #define gpuFree(ptr) checkGpu( hipFree(ptr) ) - #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) ) +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) - 
#define gpuSetDevice hipSetDevice - #define gpuDeviceSynchronize hipDeviceSynchronize - #define gpuDeviceReset hipDeviceReset +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset - #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<>> (__VA_ARGS__) - #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<>>(__VA_ARGS__) +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h index 98a0124b55..2f000e33d1 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h @@ -4,75 +4,75 @@ #include #ifdef MGONGPUCPP_GPUIMPL - #define MGONGPUCPP_CUDACC 1 +#define MGONGPUCPP_CUDACC 1 #endif #ifdef __HIPCC__ - #include "hip/hip_runtime.h" - #define MGONGPUCPP_HIPCC 1 +#include "hip/hip_runtime.h" +#define MGONGPUCPP_HIPCC 1 #endif #ifdef MGONGPUCPP_CUDACC - // Defines correct compiler - #define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL +// Defines correct compiler +#define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL - //-------------------------------------------------------------------------- +//-------------------------------------------------------------------------- - #define gpuError_t cudaError_t - #define gpuPeekAtLastError cudaPeekAtLastError - #define gpuGetErrorString cudaGetErrorString - #define gpuSuccess cudaSuccess +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess - #define gpuMallocHost(ptr, size) checkGpu( cudaMallocHost(ptr, size) ) - #define gpuMalloc(ptr, size) checkGpu( cudaMalloc(ptr, size) ) +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) - #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( cudaMemcpy(dstData, srcData, srcBytes, func) ) - #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice - #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost - #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( cudaMemcpyToSymbol(type1, type2, size) ) +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) - #define gpuFree(ptr) checkGpu( cudaFree(ptr) ) - #define gpuFreeHost(ptr) checkGpu( cudaFreeHost(ptr) ) +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) - #define gpuSetDevice cudaSetDevice - #define gpuDeviceSynchronize cudaDeviceSynchronize - #define gpuDeviceReset cudaDeviceReset +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset - #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<>> (__VA_ARGS__) - #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<>>(__VA_ARGS__) +#define gpuLaunchKernel( kernel, blocks, threads, ... 
) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) //-------------------------------------------------------------------------- #elif defined MGONGPUCPP_HIPCC - // Defines correct compiler - #define MGONGPUCPP_GPUIMPL __HCC__ +// Defines correct compiler +#define MGONGPUCPP_GPUIMPL __HCC__ - //-------------------------------------------------------------------------- +//-------------------------------------------------------------------------- - #define gpuError_t hipError_t - #define gpuPeekAtLastError hipPeekAtLastError - #define gpuGetErrorString hipGetErrorString - #define gpuSuccess hipSuccess +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess - #define gpuMallocHost(ptr, size) checkGpu( hipHostMalloc(ptr, size) ) // HostMalloc better - #define gpuMalloc(ptr, size) checkGpu( hipMalloc(ptr, size) ) +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) - #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( hipMemcpy(dstData, srcData, srcBytes, func) ) - #define gpuMemcpyHostToDevice hipMemcpyHostToDevice - #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost - #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( hipMemcpyToSymbol(type1, type2, size) ) +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) - #define gpuFree(ptr) checkGpu( hipFree(ptr) ) - #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) ) +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) - #define gpuSetDevice hipSetDevice - #define gpuDeviceSynchronize hipDeviceSynchronize - #define gpuDeviceReset hipDeviceReset +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset - #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<>> (__VA_ARGS__) - #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<>>(__VA_ARGS__) +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... 
From c33d0af944f54a412521f190e8dd399c07142a2b Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Thu, 13 Jul 2023 16:27:55 +0200
Subject: [PATCH 366/509] [jthip] clang-format GpuRuntime.h both in CODEGEN and in ggttgg.mad

---
 .../madgraph/iolibs/template_files/gpu/GpuRuntime.h  | 2 +-
 epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
index 86c9179f4c..895a662e52 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
@@ -59,7 +59,7 @@ namespace mg5amcGpu
       // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
       // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
       if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
-      checkGpu ( gpuSetDevice( 0 ) ); // SLOW!
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
     }

     // Tear down CUDA application (call cudaDeviceReset)
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h
index 86c9179f4c..895a662e52 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h
@@ -59,7 +59,7 @@ namespace mg5amcGpu
       // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
       // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
       if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
-      checkGpu ( gpuSetDevice( 0 ) ); // SLOW!
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
     }

     // Tear down CUDA application (call cudaDeviceReset)
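
The checkGpu wrapper whose spacing was just normalised is the error-checking idiom used around every gpu* runtime call in this series. Its definition follows the same assert-style pattern as the assertCuda and assertCurand helpers quoted elsewhere in these patches; roughly (a simplified sketch, not the verbatim header):

  #include <cassert>
  #include <cstdio>

  #define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }

  inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
  {
    if( code != gpuSuccess ) // gpuSuccess maps to cudaSuccess or hipSuccess via GpuAbstraction.h
    {
      printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
      if( abort ) assert( code == gpuSuccess );
    }
  }
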
From 9872c95b1b4721d0ed9fb0357c19100c7e8475f6 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Mon, 17 Jul 2023 14:47:32 +0200
Subject: [PATCH 367/509] Made the codegenerated files same as the templated files in gg_ttgg

---
 .../madgraph/iolibs/template_files/gpu/Bridge.h               | 4 ++--
 .../iolibs/template_files/gpu/MatrixElementKernels.cc         | 4 ++--
 .../madgraph/iolibs/template_files/gpu/fbridge.cc             | 4 ++--
 epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h       | 1 -
 .../cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc  | 4 ++--
 5 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
index 08c7493713..3c96940cc1 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
@@ -283,11 +283,11 @@ namespace mg5amcCpu
     constexpr int neppM = MemoryAccessMomenta::neppM;
     if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
     {
-      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice );
+      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
     }
     else
     {
-      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice );
+      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
       const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
       //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread...
this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index dd3eee4ea3..a9e20e114f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -226,9 +226,9 @@ namespace mg5amcGpu constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernelSharedmem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - gpuLaunchKernelSharedmem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif checkGpu( gpuPeekAtLastError() ); checkGpu( gpuDeviceSynchronize() ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc index 5f91f007ee..343b6b8d9c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc @@ -47,7 +47,7 @@ extern "C" void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { #ifdef MGONGPUCPP_GPUIMPL - CudaRuntime::setUp(); + GpuRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? 
@@ -70,7 +70,7 @@ extern "C" if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; #ifdef MGONGPUCPP_GPUIMPL - CudaRuntime::tearDown(); + GpuRuntime::tearDown(); #endif } diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h index 4f0a560d4b..7c7feb692a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h @@ -9,7 +9,6 @@ #include "mgOnGpuConfig.h" #include "Bridge.h" -#include "GpuAbstraction.h" #include "MatrixElementKernels.h" #include "MemoryBuffers.h" diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index c2e08cdce0..a9e20e114f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -6,7 +6,7 @@ #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -202,7 +202,7 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else From fc4ef6b3d1d6707c6964a7dd319f330bd0a81ab4 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Tue, 18 Jul 2023 14:48:51 +0200 Subject: [PATCH 368/509] Delete unneccessary file at this point --- .../SubProcesses/RocrandRandomNumberKernel.cc | 131 ------------------ 1 file changed, 131 deletions(-) delete mode 100644 epochX/cudacpp/gg_ttgg.mad/SubProcesses/RocrandRandomNumberKernel.cc diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RocrandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RocrandRandomNumberKernel.cc deleted file mode 100644 index fb5a8ef1c8..0000000000 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RocrandRandomNumberKernel.cc +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
- -#include "GpuRuntime.h" -#include "MemoryBuffers.h" -#include "RandomNumberKernels.h" - -#include - -#ifndef MGONGPU_HAS_NO_ROCRAND /* clang-format off */ -#include "rocrand.h" -#define checkRocRand( code ){ assertRocRand( code, __FILE__, __LINE__ ); } -inline void assertRocRand( rocrandStatus_t code, const char *file, int line, bool abort = true ) -{ - if ( code != ROCRAND_STATUS_SUCCESS ) - { - printf( "RocRandAssert: %s:%d code=%d\n", file, line, code ); - if ( abort ) assert( code == ROCRAND_STATUS_SUCCESS ); - } -} -#endif /* clang-format on */ - -#ifdef MGONGPUCPP_HIPCC -namespace mg5amcGpu -#else -namespace mg5amcCpu -#endif -{ - //-------------------------------------------------------------------------- -#ifndef MGONGPU_HAS_NO_ROCRAND - RocRandRandomNumberKernel::RocRandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) - : RandomNumberKernelBase( rnarray ) - , m_isOnDevice( onDevice ) - { - if( m_isOnDevice ) - { -#ifdef MGONGPUCPP_HIPCC - if( !m_rnarray.isOnDevice() ) - throw std::runtime_error( "RocRandRandomNumberKernel on device with a host random number array" ); -#else - throw std::runtime_error( "RocRandRandomNumberKernel does not support RocRandDevice on CPU host" ); -#endif - } - else - { - if( m_rnarray.isOnDevice() ) - throw std::runtime_error( "RocRandRandomNumberKernel on host with a device random number array" ); - } - createGenerator(); - } - - //-------------------------------------------------------------------------- - - RocRandRandomNumberKernel::~RocRandRandomNumberKernel() - { - destroyGenerator(); - } - - //-------------------------------------------------------------------------- - - void RocRandRandomNumberKernel::seedGenerator( const unsigned int seed ) - { - if( m_isOnDevice ) - { - destroyGenerator(); // workaround for #429 - createGenerator(); // workaround for #429 - } - //printf( "seedGenerator: seed %d\n", seed ); - checkRocRand( rocrand_set_seed( m_rnGen, seed ) ); - } - - //-------------------------------------------------------------------------- - - void RocRandRandomNumberKernel::createGenerator() - { - // [NB Timings are for GenRnGen host|device (cpp|cuda) generation of 256*32*1 events with nproc=1: rn(0) is host=0.0012s] - const rocrand_rng_type type = ROCRAND_RNG_PSEUDO_MTGP32; // 0.00082s | 0.00064s (FOR FAST TESTS) NO TESTS DONE WITH ROCRAND, BUT THIS IS FASTEST IN CUDA - //const rocrand_rng_type type = ROCRAND_RNG_PSEUDO_XORWOW; - //const rocrand_rng_type type = ROCRAND_RNG_PSEUDO_MRG32K3A; - //const rocrand_rng_type type = ROCRAND_RNG_PSEUDO_MT19937; - //const rocrand_rng_type type = ROCRAND_RNG_PSEUDO_PHILOX4_32_10; - if( m_isOnDevice ) - { - checkRocRand( rocrand_create_generator( &m_rnGen, type ) ); - } - else - { - checkRocRand( rocrand_create_generator_host( &m_rnGen, type ) ); - } - // No RocRAND equivalent for curandSetGeneratorOrdering - } - - //-------------------------------------------------------------------------- - - void RocRandRandomNumberKernel::destroyGenerator() - { - checkRocRand( rocrand_destroy_generator( m_rnGen ) ); - } - - //-------------------------------------------------------------------------- - - void RocRandRandomNumberKernel::generateRnarray() - { -#if defined MGONGPU_FPTYPE_DOUBLE - checkRocRand( rocrand_generate_uniform_double( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); -#elif defined MGONGPU_FPTYPE_FLOAT - checkRocRand( rocrand_generate_uniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); -#endif - /* - printf( "\nRocRandRandomNumberKernel::generateRnarray size = 
%d\n", (int)m_rnarray.size() ); - fptype* data = m_rnarray.data(); -#ifdef MGONGPUCPP_GPUIMPL - if( m_rnarray.isOnDevice() ) - { - data = new fptype[m_rnarray.size()](); - checkRoc( rocMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), rocMemcpyDeviceToHost ) ); - } -#endif - for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) - printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef MGONGPUCPP_GPUIMPL - if( m_rnarray.isOnDevice() ) delete[] data; -#endif - */ - } - - //-------------------------------------------------------------------------- -#endif -} \ No newline at end of file From 67e56133ca3f8de5796ea89ff5caeddefab8d6c0 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 16:48:51 +0200 Subject: [PATCH 369/509] [jthip] in ggttgg.mad mgOnGpuConfig.h, remove include of GpuRuntime.h --- epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h index 8a393a7231..b21f044b5c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h @@ -6,8 +6,6 @@ #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 -#include "GpuRuntime.h" - // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 From 81cad87e0f9fedaee6840d52eae8d862a2cf3423 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 17:30:10 +0200 Subject: [PATCH 370/509] [jthip] in ggttgg.mad, clean up mgOnGpuConfig.h for CUDA/HIP/C++ together with Jorgen --- .../cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h | 72 ++++++++++++------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h index b21f044b5c..51e88ae9d1 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h @@ -10,16 +10,27 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND -// Added support for HIP compilation by defining MGONGPU_HAS_NO_CURAND -#ifdef MGONGPUCPP_CUDACC +#ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND -#elif defined MGONGPUCPP_HIPCC +#elif defined __HIPCC__ #define MGONGPU_HAS_NO_CURAND 1 +#else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 #endif @@ -54,23 +65,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef MGONGPUCPP_GPUIMPL -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) -#ifdef MGONGPUCPP_GPUIMPL +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +#ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation -#ifdef MGONGPUCPP_GPUIMPL -#undef MGONGPU_NSIGHT_DEBUG // default +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +#ifdef __CUDACC__ +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +102,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) 
+#ifndef MGONGPUCPP_GPUIMPL
+#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++
 #endif
 #endif

@@ -144,7 +164,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;

 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef MGONGPUCPP_GPUIMPL // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -174,9 +194,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
-#if defined MGONGPUCPP_GPUIMPL && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
+#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -188,7 +208,7 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */

-// Define empty CUDA declaration specifiers for C++
+// Define empty CUDA/HIP declaration specifiers for C++
 #ifndef MGONGPUCPP_GPUIMPL
 #define __global__
 #define __host__
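
With this commit the compiler detection lives in a single place, mgOnGpuConfig.h, and the rest of the code can distinguish "some GPU" from "this specific GPU". Condensed from the hunks above (slightly abridged, for orientation only):

  // mgOnGpuConfig.h: detect the compiler once
  #ifdef __CUDACC__
  #define MGONGPUCPP_GPUIMPL cuda
  #elif defined __HIPCC__
  #define MGONGPUCPP_GPUIMPL hip
  #else
  #undef MGONGPUCPP_GPUIMPL // CPU-only build
  #endif

  // client code: GPU-agnostic branches test MGONGPUCPP_GPUIMPL...
  #ifdef MGONGPUCPP_GPUIMPL
  // ...device buffers and kernel launches, common to CUDA and HIP...
  #endif

  // ...while CUDA-only features (curand, thrust complex) keep testing __CUDACC__
  #ifdef __CUDACC__
  // ...curand device-side generation...
  #endif
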
From b4bdf47f61b72ce89e53f16c5f30954a995d485d Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Tue, 18 Jul 2023 17:35:37 +0200
Subject: [PATCH 371/509] [jthip] in ggttgg.mad, clean up GpuAbstraction together with Jorgen after moving MGONGPUCPP_GPUIMPL to mgOnGpuConfig.h

---
 .../gg_ttgg.mad/SubProcesses/GpuAbstraction.h | 27 +++++--------------
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h
index 2f000e33d1..427c82c05d 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h
@@ -3,22 +3,10 @@

 #include <cassert>

-#ifdef MGONGPUCPP_GPUIMPL
-#define MGONGPUCPP_CUDACC 1
-#endif
-
-#ifdef __HIPCC__
-#include "hip/hip_runtime.h"
-#define MGONGPUCPP_HIPCC 1
-#endif
-
-#ifdef MGONGPUCPP_CUDACC
-
-// Defines correct compiler
-#define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL
-
 //--------------------------------------------------------------------------

+#ifdef __CUDACC__
+
 #define gpuError_t cudaError_t
 #define gpuPeekAtLastError cudaPeekAtLastError
 #define gpuGetErrorString cudaGetErrorString
@@ -44,12 +32,9 @@

 //--------------------------------------------------------------------------

-#elif defined MGONGPUCPP_HIPCC
-
-// Defines correct compiler
-#define MGONGPUCPP_GPUIMPL __HCC__
+#elif defined __HIPCC__

-//--------------------------------------------------------------------------
+#include "hip/hip_runtime.h"

 #define gpuError_t hipError_t
 #define gpuPeekAtLastError hipPeekAtLastError
@@ -74,6 +59,8 @@
 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
 #define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )

+//--------------------------------------------------------------------------
+
 #endif

-#endif // MG5AMC_GPUABSTRACTION_H
\ No newline at end of file
+#endif // MG5AMC_GPUABSTRACTION_H
From 806b5c7a52c20b693a55cdbaf5968e555d496f15 Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Tue, 18 Jul 2023 17:39:27 +0200
Subject: [PATCH 372/509] [jthip] in ggttgg.mad, add GpuRuntime.h in check.cc and MemoryBuffers.h

---
 epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h         | 2 +-
 epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h
index 41c6e8c0e4..7576be7e7c 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h
@@ -7,10 +7,10 @@
 #define MemoryBuffers_H 1

 #include "mgOnGpuConfig.h"
-
 #include "mgOnGpuCxtypes.h"

 #include "CPPProcess.h"
+#include "GpuRuntime.h"
 #include "Parameters_sm.h"

 #include <sstream>
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc
index e26397b39b..4f9558bd8f 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc
@@ -12,6 +12,7 @@
 #include "BridgeKernels.h"
 #include "CPPProcess.h"
 #include "CrossSectionKernels.h"
+#include "GpuRuntime.h"
 #include "MatrixElementKernels.h"
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
From 5d9a8e6138a49346ba3a8312ceae61cd1c8f1552 Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Tue, 18 Jul 2023 17:46:04 +0200
Subject: [PATCH 373/509] [jthip] in ggttgg.mad CurandRandomNumberKernel.cc, change back MGONGPUCPP_CUDACC to __CUDACC__

---
 .../gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc
index 5b33207ad0..98ec214eaf 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc
@@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool
 }
 #endif /* clang-format on */

-#ifdef MGONGPUCPP_CUDACC
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -36,7 +36,7 @@ namespace mg5amcCpu
 {
   if( m_isOnDevice )
   {
-#ifdef MGONGPUCPP_CUDACC
+#ifdef __CUDACC__
     if( !m_rnarray.isOnDevice() )
       throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" );
 #else
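
The next commit renames the timed "00 CudaInit" section of check_sa.cc to "00 GpuInit" and constructs a GpuRuntime object at the top of main(). The point of the RAII object is to pay the slow one-off device initialisation inside a named timer section and to guarantee the matching reset at shutdown. Schematically (simplified from the GpuRuntime.h/CudaRuntime.h code quoted in this series; debug printouts omitted):

  struct GpuRuntime final
  {
    GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); }
    ~GpuRuntime() { tearDown( m_debug ); }
    GpuRuntime( const GpuRuntime& ) = delete; // one per application
    // gpuSetDevice(0) is SLOW: calling it here book-keeps the init cost explicitly
    static void setUp( const bool debug = true ) { checkGpu( gpuSetDevice( 0 ) ); }
    // gpuDeviceReset is only needed for leak checking in cuda-memcheck
    static void tearDown( const bool debug = true ) { checkGpu( gpuDeviceReset() ); }
    bool m_debug;
  };
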
From ece2174d37ce5403be53bfc0d9fb81b4db23505a Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Tue, 18 Jul 2023 18:05:32 +0200
Subject: [PATCH 374/509] [jthip] in ggttgg.mad, clean up #ifdefs in check.cc together with Jorgen

---
 .../SubProcesses/P1_gg_ttxgg/check_sa.cc      | 59 +++++++++++--------
 1 file changed, 35 insertions(+), 24 deletions(-)

diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc
index 4f9558bd8f..fbe245d418 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc
@@ -103,13 +103,12 @@ main( int argc, char** argv )
     CurandHost = 1,
     CurandDevice = 2
   };
-// Specifically checks for __CUDACC__ here
-#ifdef MGONGPUCPP_CUDACC
-  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU
+#ifdef __CUDACC__
+  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU
 #elif not defined MGONGPU_HAS_NO_CURAND
  RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
 #else
-  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU
 #endif
   // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
   enum class RamboSamplingMode
@@ -147,10 +146,10 @@ main( int argc, char** argv )
     }
     else if( arg == "--curdev" )
     {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
       rndgen = RandomNumberMode::CurandDevice;
 #else
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" );
 #endif
     }
     else if( arg == "--curhst" )
@@ -267,10 +266,10 @@ main( int argc, char** argv )

 #ifdef MGONGPUCPP_GPUIMPL

-  // --- 00. Initialise cuda
-  // Instantiate a GpuRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginnining of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
   GpuRuntime GpuRuntime( debug );
 #endif
@@ -396,7 +395,7 @@ main( int argc, char** argv )
       const bool onDevice = false;
       prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) );
     }
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     else
     {
       const bool onDevice = true;
@@ -405,7 +404,7 @@ main( int argc, char** argv )
 #else
     else
     {
-      throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
+      throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement)
     }
 #endif
 #else
@@ -731,18 +730,22 @@ main( int argc, char** argv )
     rndgentxt = "CURAND HOST";
   else if( rndgen == RandomNumberMode::CurandDevice )
     rndgentxt = "CURAND DEVICE";
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
+#elif defined __HIPCC__
  rndgentxt += " (HIP code)";
 #else
   rndgentxt += " (C++ code)";
 #endif

   // Workflow description summary
   std::string wrkflwtxt;
-  // -- CUDA or C++?
-#ifdef MGONGPUCPP_GPUIMPL
+  // -- CUDA or HIP or C++?
+#ifdef __CUDACC__
   wrkflwtxt += "CUD:";
+#elif defined __HIPCC__
+  wrkflwtxt += "HIP:";
 #else
   wrkflwtxt += "CPP:";
 #endif
   // -- DOUBLE or FLOAT?
@@ -756,7 +759,7 @@ main( int argc, char** argv )
   wrkflwtxt += "???+"; // no path to this statement
 #endif
   // -- CUCOMPLEX or THRUST or STD complex numbers?
-#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; #elif defined MGONGPU_CUCXTYPE_THRUST @@ -766,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -866,8 +875,10 @@ main( int argc, char** argv ) #endif // Dump all configuration parameters and all results std::cout << std::string( SEP79, '*' ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -894,14 +905,14 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) @@ -1035,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" From efe9b837e3862d7d9968ae917f4e3e13d88e63a2 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 18:17:02 +0200 Subject: [PATCH 375/509] [jthip] in ggttgg.mad, remove CudaRuntime.h files from the repo --- .../gg_ttgg.mad/SubProcesses/CudaRuntime.h | 85 ------------------- .../SubProcesses/P1_gg_ttxgg/CudaRuntime.h | 1 - 2 files changed, 86 deletions(-) delete mode 100644 epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h delete mode 120000 epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CudaRuntime.h diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index df0c3f3df8..0000000000 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
- -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef MGONGPUCPP_GPUIMPL -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! 
- } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file From e3fa17fe29bc053904d07710c8fd66ac48c9776c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 18:11:04 +0200 Subject: [PATCH 376/509] [jthip] backport to CODEGEN from ggttgg.mad --- .../template_files/cpp_model_parameters_h.inc | 6 +- .../iolibs/template_files/gpu/CudaRuntime.h | 85 ------------------- .../iolibs/template_files/gpu/check_sa.cc | 59 ++++++++----- .../iolibs/template_files/gpu/mgOnGpuConfig.h | 73 ++++++++++------ .../cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h | 2 +- .../cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h | 3 +- 6 files changed, 88 insertions(+), 140 deletions(-) delete mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc index 8920a10618..30dd35165b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc @@ -147,7 +147,7 @@ namespace Parameters_%(model_name)s_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -171,7 +171,7 @@ namespace Parameters_%(model_name)s_dependentCouplings %(eftspecial2)s return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -187,7 +187,7 @@ namespace Parameters_%(model_name)s_independentCouplings //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h deleted file mode 100644 index df0c3f3df8..0000000000 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. 
Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef MGONGPUCPP_GPUIMPL -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! 
- } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index d7d40e140c..9d5f088f38 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -102,12 +103,12 @@ main( int argc, char** argv ) CurandHost = 1, CurandDevice = 2 }; -#ifdef MGONGPUCPP_GPUIMPL - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU +#ifdef __CUDACC__ + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -145,10 +146,10 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -265,12 +266,12 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -394,7 +395,7 @@ main( int argc, char** argv ) const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ else { const bool onDevice = true; @@ -403,7 +404,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -729,17 +730,21 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? -#ifdef MGONGPUCPP_GPUIMPL + // -- CUDA or HIP or C++? +#ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -754,7 +759,7 @@ main( int argc, char** argv ) wrkflwtxt += "???+"; // no path to this statement #endif // -- CUCOMPLEX or THRUST or STD complex numbers? -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; #elif defined MGONGPU_CUCXTYPE_THRUST @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -864,8 +875,10 @@ main( int argc, char** argv ) #endif // Dump all configuration parameters and all results std::cout << std::string( SEP79, '*' ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,14 +905,14 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... 
#endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 55307d3674..69d85b61cb 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -6,21 +6,31 @@ #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 -#include "GpuRuntime.h" // Includes the GPU abstraction - // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) %(mgongpu_supports_multichannel)s +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -#ifdef MGONGPUCPP_CUDACC +#ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND -#elif defined MGONGPUCPP_HIPCC +#elif defined __HIPCC__ #define MGONGPU_HAS_NO_CURAND 1 +#else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 #endif @@ -55,23 +65,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef MGONGPUCPP_GPUIMPL -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) -#ifdef MGONGPUCPP_GPUIMPL +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +#ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation -#ifdef MGONGPUCPP_GPUIMPL -#undef MGONGPU_NSIGHT_DEBUG // default +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +#ifdef __CUDACC__ +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but 
color algebra #537) @@ -87,17 +102,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -145,7 +164,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef MGONGPUCPP_GPUIMPL // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +194,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined MGONGPUCPP_GPUIMPL && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,7 +208,7 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ +// Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h index 51e88ae9d1..909446af7b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h @@ -86,7 +86,7 @@ #undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 #else -#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ +#undef MGONGPU_NSIGHT_DEBUG // only option 
in HIP or C++
 #endif

 // SANITY CHECKS (floating point precision for everything but color algebra #537)
diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h
index 3376081012..ee906f450d 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h
@@ -6,10 +6,11 @@
 #ifndef MGONGPUVECTORS_H
 #define MGONGPUVECTORS_H 1

-#include "GpuAbstraction.h"
 #include "mgOnGpuCxtypes.h"
 #include "mgOnGpuFptypes.h"

+#include "GpuAbstraction.h"
+
 #include <iostream>

 //==========================================================================
From 5d218f6db609e7e23d28517caae28ab44de76eeb Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Tue, 18 Jul 2023 18:23:10 +0200
Subject: [PATCH 377/509] [jthip] in ggttgg.mad mgOnGpuVectors.h, remove include of GpuRuntime.h

---
 epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h
index ee906f450d..7904b93c61 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h
@@ -9,8 +9,6 @@
 #include "mgOnGpuCxtypes.h"
 #include "mgOnGpuFptypes.h"

-#include "GpuAbstraction.h"
-
 #include <iostream>

 //==========================================================================
From 85a746bc0eb7768c1a4b613f314418745c8f160d Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Tue, 18 Jul 2023 18:23:55 +0200
Subject: [PATCH 378/509] [jthip] in ggttgg.mad, remove src/GpuAbstraction.h and src/GpuRuntime.h from the repo

---
 epochX/cudacpp/gg_ttgg.mad/src/GpuAbstraction.h | 1 -
 epochX/cudacpp/gg_ttgg.mad/src/GpuRuntime.h     | 1 -
 2 files changed, 2 deletions(-)
 delete mode 120000 epochX/cudacpp/gg_ttgg.mad/src/GpuAbstraction.h
 delete mode 120000 epochX/cudacpp/gg_ttgg.mad/src/GpuRuntime.h

diff --git a/epochX/cudacpp/gg_ttgg.mad/src/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/src/GpuAbstraction.h
deleted file mode 120000
index 4955c9171e..0000000000
--- a/epochX/cudacpp/gg_ttgg.mad/src/GpuAbstraction.h
+++ /dev/null
@@ -1 +0,0 @@
-../SubProcesses/GpuAbstraction.h
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttgg.mad/src/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.mad/src/GpuRuntime.h
deleted file mode 120000
index ba9c735d54..0000000000
--- a/epochX/cudacpp/gg_ttgg.mad/src/GpuRuntime.h
+++ /dev/null
@@ -1 +0,0 @@
-../SubProcesses/GpuRuntime.h
\ No newline at end of file
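
The backport below starts with CurandRandomNumberKernel.cc, whose guard illustrates the dual-namespace trick used throughout the plugin: the same translation unit is compiled once by the GPU compiler and once by the plain C++ compiler, landing in different namespaces so that the two builds can be linked into a single application. In miniature (illustrative only, not part of the patch):

  #ifdef __CUDACC__
  namespace mg5amcGpu
  #else
  namespace mg5amcCpu
  #endif
  {
    // nvcc compiles this into mg5amcGpu, a host C++ compiler into mg5amcCpu,
    // so identical class and function names never clash at link time
    void seedAndGenerate();
  }
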
index 5b33207ad0..98ec214eaf 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc
@@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool
 }
 #endif /* clang-format on */
 
-#ifdef MGONGPUCPP_CUDACC
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -36,7 +36,7 @@ namespace mg5amcCpu
 {
   if( m_isOnDevice )
   {
-#ifdef MGONGPUCPP_CUDACC
+#ifdef __CUDACC__
   if( !m_rnarray.isOnDevice() )
     throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" );
 #else
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h
index 2f000e33d1..427c82c05d 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h
@@ -3,22 +3,10 @@
 
 #include 
 
-#ifdef MGONGPUCPP_GPUIMPL
-#define MGONGPUCPP_CUDACC 1
-#endif
-
-#ifdef __HIPCC__
-#include "hip/hip_runtime.h"
-#define MGONGPUCPP_HIPCC 1
-#endif
-
-#ifdef MGONGPUCPP_CUDACC
-
-// Defines correct compiler
-#define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL
-
 //--------------------------------------------------------------------------
 
+#ifdef __CUDACC__
+
 #define gpuError_t cudaError_t
 #define gpuPeekAtLastError cudaPeekAtLastError
 #define gpuGetErrorString cudaGetErrorString
@@ -44,12 +32,9 @@
 
 //--------------------------------------------------------------------------
 
-#elif defined MGONGPUCPP_HIPCC
-
-// Defines correct compiler
-#define MGONGPUCPP_GPUIMPL __HCC__
+#elif defined __HIPCC__
 
-//--------------------------------------------------------------------------
+#include "hip/hip_runtime.h"
 
 #define gpuError_t hipError_t
 #define gpuPeekAtLastError hipPeekAtLastError
@@ -74,6 +59,8 @@
 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
 #define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
 
+//--------------------------------------------------------------------------
+
 #endif
 
-#endif // MG5AMC_GPUABSTRACTION_H
\ No newline at end of file
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h
index d6ba45dcad..522e6ce100 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h
@@ -11,6 +11,7 @@
 #include "mgOnGpuCxtypes.h"
 
 #include "CPPProcess.h"
+#include "GpuRuntime.h"
 #include "Parameters_%(model_name)s.h"
 
 #include 
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h
index ee906f450d..7904b93c61 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h
@@ -9,8 +9,6 @@
 #include "mgOnGpuCxtypes.h"
 #include "mgOnGpuFptypes.h"
 
-#include "GpuAbstraction.h"
-
 #include 
 
 //==========================================================================
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc
index 9dceb45708..95400f42db 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc
@@ -14,7 +14,6 @@
 
 #include "mgOnGpuConfig.h"
 
-#include "GpuRuntime.h"
%(hel_amps_h)s
 #include "MemoryAccessAmplitudes.h"
 #include "MemoryAccessCouplings.h"
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc
index 2a473552fa..1269fb0a3f 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc
@@ -10,8 +10,6 @@
 
 // Class member functions for calculating the matrix elements for
%(process_lines)s
 
-#include "GpuRuntime.h"
-
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc
index 241c50a9d1..3cfbf668ca 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc
@@ -7,8 +7,6 @@
 ! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
!========================================================================== -#include "GpuAbstraction.h" - // *** COLOR CHOICE BELOW *** // Store the leading color flows for choice of color if( jamp2_sv ) // disable color choice if nullptr diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 803fa5e258..b5b6ed037b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -1080,7 +1080,7 @@ def get_process_function_definitions(self, write=True): %(len(coupling_indep), ' ), cxmake( m_pars->'.join(coupling_indep)) # AV only indep! replace_dict['cipcdevice'] = '__device__ __constant__ fptype cIPC[%i];'%(2*len(coupling_indep)) replace_dict['cipcstatic'] = 'static fptype cIPC[%i];'%(2*len(coupling_indep)) - replace_dict['cipc2tipcSym'] = 'checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) ) );'%len(coupling_indep) + replace_dict['cipc2tipcSym'] = 'gpuMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) );'%len(coupling_indep) replace_dict['cipc2tipc'] = 'memcpy( cIPC, tIPC, %i * sizeof( cxtype ) );'%len(coupling_indep) replace_dict['cipcdump'] = '\n //for ( i=0; i<%i; i++ ) std::cout << std::setprecision(17) << "tIPC[i] = " << tIPC[i] << std::endl;'%len(coupling_indep) coup_str_hrd = '__device__ const fptype cIPC[%s] = { ' % (len(coupling_indep)*2) @@ -1091,7 +1091,7 @@ def get_process_function_definitions(self, write=True): replace_dict['cipcassign'] = '//const cxtype tIPC[0] = { ... }; // nicoup=0' replace_dict['cipcdevice'] = '__device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0' replace_dict['cipcstatic'] = 'static fptype* cIPC = nullptr; // unused as nicoup=0' - replace_dict['cipc2tipcSym'] = '//checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) ) ); // nicoup=0'%len(coupling_indep) + replace_dict['cipc2tipcSym'] = '//gpuMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) ); // nicoup=0'%len(coupling_indep) replace_dict['cipc2tipc'] = '//memcpy( cIPC, tIPC, %i * sizeof( cxtype ) ); // nicoup=0'%len(coupling_indep) replace_dict['cipcdump'] = '' replace_dict['cipchrdcod'] = '__device__ const fptype* cIPC = nullptr; // unused as nicoup=0' @@ -1100,7 +1100,7 @@ def get_process_function_definitions(self, write=True): %(len(params), ', (fptype)m_pars->'.join(params)) replace_dict['cipddevice'] = '__device__ __constant__ fptype cIPD[%i];'%(len(params)) replace_dict['cipdstatic'] = 'static fptype cIPD[%i];'%(len(params)) - replace_dict['cipd2tipdSym'] = 'checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) ) );'%len(params) + replace_dict['cipd2tipdSym'] = 'gpuMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) );'%len(params) replace_dict['cipd2tipd'] = 'memcpy( cIPD, tIPD, %i * sizeof( fptype ) );'%len(params) replace_dict['cipddump'] = '\n //for ( i=0; i<%i; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl;'%len(params) param_str_hrd = '__device__ const fptype cIPD[%s] = { ' % len(params) @@ -1111,7 +1111,7 @@ def get_process_function_definitions(self, write=True): replace_dict['cipdassign'] = '//const fptype tIPD[0] = { ... 
}; // nparam=0' replace_dict['cipddevice'] = '//__device__ __constant__ fptype* cIPD = nullptr; // unused as nparam=0' replace_dict['cipdstatic'] = '//static fptype* cIPD = nullptr; // unused as nparam=0' - replace_dict['cipd2tipdSym'] = '//checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) ) ); // nparam=0'%len(params) + replace_dict['cipd2tipdSym'] = '//gpuMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) ); // nparam=0'%len(params) replace_dict['cipd2tipd'] = '//memcpy( cIPD, tIPD, %i * sizeof( fptype ) ); // nparam=0'%len(params) replace_dict['cipddump'] = '' replace_dict['cipdhrdcod'] = '//__device__ const fptype* cIPD = nullptr; // unused as nparam=0' @@ -1183,13 +1183,13 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1216,7 +1216,7 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( \"calculate_wavefunctions: ihel=%2d\\n\", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( \"calculate_wavefunctions: ievt00=%d\\n\", ievt00 ); #endif""") nwavefuncs = self.matrix_elements[0].get_number_of_wavefunctions() @@ -1253,7 +1253,7 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif""") ret_lines += helas_calls @@ -1653,8 +1653,10 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -1770,7 +1772,7 @@ def get_external(self, wf, argument): split_line2 = [ str.lstrip(' ').rstrip(' ') for str in split_line2] # AV split_line2.insert(2, '0') # add parameter fmass=0 line2 = ', '.join(split_line2) - text = '#if not( defined __CUDACC__ and defined MGONGPU_TEST_DIVERGENCE )\n %s\n#else\n if( ( blockDim.x * blockIdx.x + threadIdx.x ) %% 2 == 0 )\n %s\n else\n %s\n#endif\n' # AV + text = '#if not( defined MGONGPUCPP_GPUIMPL and defined MGONGPU_TEST_DIVERGENCE )\n %s\n#else\n if( ( blockDim.x * blockIdx.x + threadIdx.x ) %% 2 == 0 )\n %s\n else\n %s\n#endif\n' # AV return text % (line, line, line2) text = '%s\n' # AV return text % line diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h 
b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h index 7576be7e7c..d37eafb214 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h @@ -7,6 +7,7 @@ #define MemoryBuffers_H 1 #include "mgOnGpuConfig.h" + #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc index fbe245d418..9d5f088f38 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc @@ -745,7 +745,7 @@ main( int argc, char** argv ) wrkflwtxt += "CUD:"; #elif defined __HIPCC__ wrkflwtxt += "HIP:"; -else +#else wrkflwtxt += "CPP:"; #endif // -- DOUBLE or FLOAT? @@ -1053,7 +1053,7 @@ else #elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl #else - << "\"???\"," << std::endl // no path to this statement... + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" From c8d4da4217069ed1ee9a03cad9ba01a00fcc8c6b Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 18:41:09 +0200 Subject: [PATCH 380/509] [jthip] in ggttgg.mad, regenerate DHELAS/aloha_file.inc which takes a different order --- epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc index cf4ec946f8..ec923afd6d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = VVVV3_0.o VVVV4P0_1.o VVVV3P0_1.o VVVV1P0_1.o FFV1_1.o FFV1_2.o VVV1P0_1.o VVV1_0.o FFV1_0.o FFV1P0_3.o VVVV1_0.o VVVV4_0.o +ALOHARoutine = FFV1_1.o VVVV4_0.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3_0.o VVVV1_0.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o From 334aee7e234efd02f84e5e293ba61a5d119faf1f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 18:57:21 +0200 Subject: [PATCH 381/509] [jthip] regenerate ggttgg.mad - all ok! a few improvements in CPPProcess.cc as edited in CODEGEN --- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 42 +++++++++---------- .../SubProcesses/P1_gg_ttxgg/CPPProcess.cc | 9 +++- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 3c85563e1c..6aa1ffbf65 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. 
Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004593610763549805  +DEBUG: model prefixing takes 0.005247592926025391  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.144 s +1 processes with 123 diagrams generated in 0.159 s Total: 1 processes with 123 diagrams output madevent CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxgg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1028]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1029]  DEBUG: proc_id =  1 [model_handling.py at line 1034]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1286]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1288]  @@ -192,15 +192,15 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [model_handling.py at line 1152]  DEBUG: multi_channel =  {1: [1], 2: [2], 3: [3], 4: [4], 5: [5], 6: [6], 7: [7], 8: [8], 9: [9], 10: [10], 11: [11], 12: [12], 13: [13], 14: [14], 15: [15], 16: [16], 17: [17], 18: [18], 19: [19], 20: [20], 21: [21], 22: [22], 23: [23], 24: [24], 25: [25], 26: [26], 27: [27], 28: [28], 29: [29], 30: [30], 31: [32], 32: [33], 33: [34], 34: [35], 35: [36], 36: [37], 37: [38], 38: [39], 39: [40], 40: [41], 41: [42], 42: [43], 43: [44], 44: [45], 45: [46], 46: [48], 47: [49], 48: [50], 49: [51], 50: [52], 51: [53], 52: [54], 53: [55], 54: [56], 55: [58], 56: [59], 57: [60], 58: [61], 59: [62], 60: [63], 61: [64], 62: [65], 63: [66], 64: [67], 65: [68], 66: [69], 67: [70], 68: [71], 69: [72], 70: [74], 71: [75], 72: [76], 73: [77], 74: [78], 75: [79], 76: [80], 77: [81], 78: [82], 79: [83], 80: [84], 81: [85], 82: [86], 83: [87], 84: [88], 85: [89], 86: [90], 87: [91], 88: [93], 89: [94], 90: [95], 91: [96], 92: [97], 93: [98], 94: [100], 95: [101], 96: [102], 97: [103], 98: [104], 99: [105], 100: [107], 101: [108], 102: [109], 103: [110], 104: [111], 105: [112]} [model_handling.py at line 1158]  DEBUG: multi_channel_map =  {1: [1], 2: [2], 3: [3], 4: [4], 5: [5], 
6: [6], 7: [7], 8: [8], 9: [9], 10: [10], 11: [11], 12: [12], 13: [13], 14: [14], 15: [15], 16: [16], 17: [17], 18: [18], 19: [19], 20: [20], 21: [21], 22: [22], 23: [23], 24: [24], 25: [25], 26: [26], 27: [27], 28: [28], 29: [29], 30: [30], 31: [32], 32: [33], 33: [34], 34: [35], 35: [36], 36: [37], 37: [38], 38: [39], 39: [40], 40: [41], 41: [42], 42: [43], 43: [44], 44: [45], 45: [46], 46: [48], 47: [49], 48: [50], 49: [51], 50: [52], 51: [53], 52: [54], 53: [55], 54: [56], 55: [58], 56: [59], 57: [60], 58: [61], 59: [62], 60: [63], 61: [64], 62: [65], 63: [66], 64: [67], 65: [68], 66: [69], 67: [70], 68: [71], 69: [72], 70: [74], 71: [75], 72: [76], 73: [77], 74: [78], 75: [79], 76: [80], 77: [81], 78: [82], 79: [83], 80: [84], 81: [85], 82: [86], 83: [87], 84: [88], 85: [89], 86: [90], 87: [91], 88: [93], 89: [94], 90: [95], 91: [96], 92: [97], 93: [98], 94: [100], 95: [101], 96: [102], 97: [103], 98: [104], 99: [105], 100: [107], 101: [108], 102: [109], 103: [110], 104: [111], 105: [112]} [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, 11: 8, 12: 9, 13: 10, 14: 11, 15: 12, 16: 13, 17: 14, 18: 15, 19: 16, 20: 17, 21: 18, 22: 19, 23: 20, 24: 21, 25: 22, 26: 23, 27: 24, 28: 25, 29: 26, 30: 27, 31: 28, 32: 29, 33: 30, 37: 31, 38: 32, 39: 33, 40: 34, 41: 35, 42: 36, 43: 37, 44: 38, 45: 39, 46: 40, 47: 41, 48: 42, 49: 43, 50: 44, 51: 45, 55: 46, 56: 47, 57: 48, 58: 49, 59: 50, 60: 51, 61: 52, 62: 53, 63: 54, 67: 55, 68: 56, 69: 57, 70: 58, 71: 59, 72: 60, 73: 61, 74: 62, 75: 63, 76: 64, 77: 65, 78: 66, 79: 67, 80: 68, 81: 69, 85: 70, 86: 71, 87: 72, 88: 73, 89: 74, 90: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 100: 85, 101: 86, 102: 87, 106: 88, 107: 89, 108: 90, 109: 91, 110: 92, 111: 93, 115: 94, 116: 95, 117: 96, 118: 97, 119: 98, 120: 99, 124: 100, 125: 101, 126: 102, 127: 103, 128: 104, 129: 105} [model_handling.py at line 1698]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1811]  +DEBUG: diag_to_config =  {4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, 11: 8, 12: 9, 13: 10, 14: 11, 15: 12, 16: 13, 17: 14, 18: 15, 19: 16, 20: 17, 21: 18, 22: 19, 23: 20, 24: 21, 25: 22, 26: 23, 27: 24, 28: 25, 29: 26, 30: 27, 31: 28, 32: 29, 33: 30, 37: 31, 38: 32, 39: 33, 40: 34, 41: 35, 42: 36, 43: 37, 44: 38, 45: 39, 46: 40, 47: 41, 48: 42, 49: 43, 50: 44, 51: 45, 55: 46, 56: 47, 57: 48, 58: 49, 59: 50, 60: 51, 61: 52, 62: 53, 63: 54, 67: 55, 68: 56, 69: 57, 70: 58, 71: 59, 72: 60, 73: 61, 74: 62, 75: 63, 76: 64, 77: 65, 78: 66, 79: 67, 80: 68, 81: 69, 85: 70, 86: 71, 87: 72, 88: 73, 89: 74, 90: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 100: 85, 101: 86, 102: 87, 106: 88, 107: 89, 108: 90, 109: 91, 110: 92, 111: 93, 115: 94, 116: 95, 117: 96, 118: 97, 119: 98, 120: 99, 124: 100, 125: 101, 126: 102, 127: 103, 128: 104, 129: 105} 
[model_handling.py at line 1700]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1813]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.380 s -Wrote files for 222 helas calls in 0.696 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.412 s +Wrote files for 222 helas calls in 0.709 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.281 s +ALOHA: aloha creates 5 routines in 0.311 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.287 s +ALOHA: aloha creates 10 routines in 0.294 s VVV1 VVV1 FFV1 @@ -284,6 +284,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.208s -user 0m2.774s -sys 0m0.232s +real 0m3.298s +user 0m3.025s +sys 0m0.197s diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index f143ae923f..674cfa4f06 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -191,7 +191,7 @@ namespace mg5amcCpu #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings const fptype* allCOUPs[nxcoup]; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma nv_diagnostic push #pragma nv_diag_suppress 186 // e.g. 
<> #endif @@ -200,7 +200,9 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events #ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -2672,7 +2674,7 @@ namespace mg5amcCpu //const cxtype tIPC[0] = { ... }; // nicoup=0 #ifdef MGONGPUCPP_GPUIMPL gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -2991,6 +2993,9 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL From bfc5b65fe1ecba19924a5b5ef90e8c739b9c6346 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 19:06:32 +0200 Subject: [PATCH 382/509] [jthip] in CODEGEN, remove the copying to src of GpuRuntime.h and GpuAbstraction.h --- epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index 8ff13f6967..d1e728590c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -86,7 +86,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): 'CMake': [s+'CMake/Compilers.txt', s+'CMake/Platforms.txt', s+'CMake/Macros.txt'], 'src': [s+'gpu/rambo.h', s+'read_slha.h', s+'read_slha.cc', s+'gpu/mgOnGpuFptypes.h', s+'gpu/mgOnGpuCxtypes.h', s+'gpu/mgOnGpuVectors.h', - s+'CMake/src/CMakeLists.txt', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h'], + s+'CMake/src/CMakeLists.txt' ], 'SubProcesses': [s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', s+'gpu/ompnumthreads.h', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h', s+'gpu/MemoryAccessHelpers.h', s+'gpu/MemoryAccessVectors.h', From b83eca55ee0c2536b6d6d058dcf029df7e33526a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 19:24:36 +0200 Subject: [PATCH 383/509] [jthip] In CODEGEN, acknowledge Joergen in each file and in COPYRIGHT/AUTHORS --- .../CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS | 2 +- .../madgraph/iolibs/template_files/COPYRIGHT | 1 + .../iolibs/template_files/cpp_model_parameters_cc.inc | 2 +- .../madgraph/iolibs/template_files/gpu/Bridge.h | 2 +- .../madgraph/iolibs/template_files/gpu/BridgeKernels.cc | 2 +- .../madgraph/iolibs/template_files/gpu/BridgeKernels.h | 2 +- .../iolibs/template_files/gpu/CommonRandomNumberKernel.cc | 2 +- .../iolibs/template_files/gpu/CrossSectionKernels.cc | 2 +- .../madgraph/iolibs/template_files/gpu/CrossSectionKernels.h | 2 +- .../iolibs/template_files/gpu/CurandRandomNumberKernel.cc | 2 +- .../madgraph/iolibs/template_files/gpu/EventStatistics.h | 2 +- .../madgraph/iolibs/template_files/gpu/GpuAbstraction.h | 5 +++++ .../madgraph/iolibs/template_files/gpu/GpuRuntime.h | 5 +++++ .../madgraph/iolibs/template_files/gpu/MadgraphTest.h | 2 +- 
.../iolibs/template_files/gpu/MatrixElementKernels.cc | 2 +- .../iolibs/template_files/gpu/MatrixElementKernels.h | 2 +- .../madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h | 2 +- .../madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h | 2 +- .../iolibs/template_files/gpu/MemoryAccessRandomNumbers.h | 2 +- .../madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h | 2 +- .../madgraph/iolibs/template_files/gpu/MemoryBuffers.h | 2 +- .../iolibs/template_files/gpu/RamboSamplingKernels.cc | 2 +- .../iolibs/template_files/gpu/RamboSamplingKernels.h | 2 +- .../madgraph/iolibs/template_files/gpu/RandomNumberKernels.h | 2 +- .../madgraph/iolibs/template_files/gpu/check_sa.cc | 2 +- .../madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc | 2 +- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 2 +- .../madgraph/iolibs/template_files/gpu/fbridge.cc | 2 +- .../madgraph/iolibs/template_files/gpu/fsampler.cc | 2 +- .../madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h | 2 +- .../madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h | 2 +- .../madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h | 2 +- .../madgraph/iolibs/template_files/gpu/process_cc.inc | 2 +- .../template_files/gpu/process_function_definitions.inc | 2 +- .../madgraph/iolibs/template_files/gpu/process_h.inc | 2 +- .../madgraph/iolibs/template_files/gpu/process_matrix.inc | 2 +- .../iolibs/template_files/gpu/process_sigmaKin_function.inc | 2 +- .../madgraph/iolibs/template_files/gpu/rambo.h | 2 +- .../madgraph/iolibs/template_files/gpu/runTest.cc | 2 +- .../madgraph/iolibs/template_files/gpu/testmisc.cc | 2 +- .../madgraph/iolibs/template_files/gpu/testxxx.cc | 2 +- .../CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py | 2 +- epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py | 2 +- 43 files changed, 51 insertions(+), 40 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS index 8541e954b9..0aeb2c8a87 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS @@ -10,6 +10,7 @@ generates includes the following authors: Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) @@ -28,5 +29,4 @@ acknowledged collaboration with the following collaborators: Taran Singhania (PES University Bangalore) David Smith (CERN) Carl Vuosalo (University of Wisconsin-Madison) - Joergen Teig (CERN) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc index 6dc8ed45a7..f83f776f8c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h index 3c96940cc1..c04628dfd1 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc index f844178cbb..90c7f2d3b8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h index 7c7feb692a..3efef8ce97 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. 
Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc index f17b9c0ad7..010bc4cbd0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" #include "GpuAbstraction.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc index 36ca2a94d4..c15b39844d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h index ff2350a14d..4d9659e04e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc index 98ec214eaf..38c477c17a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "GpuRuntime.h" #include "MemoryBuffers.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h index e7d7f3b3c3..b425a5bade 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 427c82c05d..6a7d9c05c0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h index 895a662e52..93579ef08b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h index 5920d08bf7..d2ff326e20 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. 
Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index a9e20e114f..d6d6c4f179 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h index 4477a385ed..72bd8f195b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h index 67306c3922..db73e4e064 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h index f797f85ca5..38fade09fb 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h index 949a42066d..40cb089135 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h index a9ae26b6dc..08faccff0f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h index 522e6ce100..f29b8c5357 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc index 8745b084d3..79abbcc4f8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h index fe63a7bb77..7c214cd74b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h index 0c215f2583..21d63beeac 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index 9d5f088f38..1bad694d1c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. 
Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc index 6a592e8da8..88d9c3122a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for %(output_name)s by %(info_lines)s diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 4cd31ea7c3..44bbb2e126 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc index 343b6b8d9c..2b956730d4 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "Bridge.h" #include "CPPProcess.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc index acffa7c19e..3743934f41 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 69d85b61cb..0884c88d37 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h index 866b7640f6..4e7ab03fa2 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h index 7edefa3389..6f6cee64d6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc index 95400f42db..815fd8d5b7 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 1269fb0a3f..065a464a4e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -4,7 +4,7 @@ ! Copyright (C) 2020-2023 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +! Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== //========================================================================== // Class member functions for calculating the matrix elements for diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc index 8a9de336f2..2c3adf57e2 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc index 3cfbf668ca..960f029d8d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -4,7 +4,7 @@ ! Copyright (C) 2020-2023 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +! Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== // *** COLOR CHOICE BELOW *** diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index 59c1623c5a..b84a96d6ec 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -4,7 +4,7 @@ ! Copyright (C) 2020-2023 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +! Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== #include "GpuAbstraction.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h index 3a331b979a..cd7e1008ea 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc index 05d1a08c2b..461ec5c3a5 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc index dcafb44ee6..2bd7a9fcf9 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc index eba58eea70..adac281361 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index b5b6ed037b..7d56091856 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
+# Further modified by: O. Mattelaer, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. import os diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index d1e728590c..cb9a8b5f99 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2021-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, J. Teig, Z. Wettersten (2021-2023) for the MG5aMC CUDACPP plugin. import os From ccb3b305cf31211719776f820e51be31bbeda964 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 19:25:45 +0200 Subject: [PATCH 384/509] [jthip] regenerate ggttgg.mad including Jorgen's name in each file - all ok --- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 20 +++++++++---------- epochX/cudacpp/gg_ttgg.mad/COPYRIGHT | 1 + .../cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h | 2 +- .../gg_ttgg.mad/SubProcesses/BridgeKernels.cc | 2 +- .../gg_ttgg.mad/SubProcesses/BridgeKernels.h | 2 +- .../SubProcesses/CommonRandomNumberKernel.cc | 2 +- .../SubProcesses/CrossSectionKernels.cc | 2 +- .../SubProcesses/CrossSectionKernels.h | 2 +- .../SubProcesses/CurandRandomNumberKernel.cc | 2 +- .../SubProcesses/EventStatistics.h | 2 +- .../gg_ttgg.mad/SubProcesses/GpuAbstraction.h | 5 +++++ .../gg_ttgg.mad/SubProcesses/GpuRuntime.h | 5 +++++ .../gg_ttgg.mad/SubProcesses/MadgraphTest.h | 2 +- .../SubProcesses/MatrixElementKernels.cc | 2 +- .../SubProcesses/MatrixElementKernels.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 2 +- .../SubProcesses/MemoryAccessVectors.h | 2 +- .../gg_ttgg.mad/SubProcesses/MemoryBuffers.h | 2 +- .../SubProcesses/P1_gg_ttxgg/CPPProcess.cc | 2 +- .../SubProcesses/P1_gg_ttxgg/CPPProcess.h | 2 +- .../SubProcesses/P1_gg_ttxgg/check_sa.cc | 2 +- .../SubProcesses/RamboSamplingKernels.cc | 2 +- .../SubProcesses/RamboSamplingKernels.h | 2 +- .../SubProcesses/RandomNumberKernels.h | 2 +- .../gg_ttgg.mad/SubProcesses/cudacpp.mk | 2 +- .../gg_ttgg.mad/SubProcesses/fbridge.cc | 2 +- .../gg_ttgg.mad/SubProcesses/fsampler.cc | 2 +- .../gg_ttgg.mad/SubProcesses/runTest.cc | 2 +- .../gg_ttgg.mad/SubProcesses/testmisc.cc | 2 +- .../gg_ttgg.mad/SubProcesses/testxxx.cc | 2 +- epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h | 2 +- .../cudacpp/gg_ttgg.mad/src/Parameters_sm.cc | 2 +- .../cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h | 2 +- .../cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h | 2 +- .../cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h | 2 +- epochX/cudacpp/gg_ttgg.mad/src/rambo.h | 2 +- 38 files changed, 55 insertions(+), 44 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 6aa1ffbf65..cc40481acf 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005247592926025391  +DEBUG: model prefixing takes 0.0049550533294677734  INFO: 
Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.159 s +1 processes with 123 diagrams generated in 0.157 s Total: 1 processes with 123 diagrams output madevent CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxgg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1028]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1029]  DEBUG: proc_id =  1 [model_handling.py at line 1034]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1286]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1288]  @@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.412 s -Wrote files for 222 helas calls in 0.709 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.413 s +Wrote files for 222 helas calls in 0.707 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.311 s +ALOHA: aloha creates 5 routines in 0.316 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.294 s +ALOHA: aloha creates 10 routines in 0.296 s VVV1 VVV1 FFV1 @@ -284,6 +284,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.298s -user 0m3.025s -sys 0m0.197s +real 0m3.597s +user 0m3.023s +sys 0m0.207s diff --git a/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index 3c96940cc1..c04628dfd1 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc index f844178cbb..90c7f2d3b8 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h index 7c7feb692a..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc index f17b9c0ad7..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" #include "GpuAbstraction.h" diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc index 36ca2a94d4..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h index ff2350a14d..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc index 98ec214eaf..38c477c17a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "GpuRuntime.h" #include "MemoryBuffers.h" diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h index e7d7f3b3c3..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h index 427c82c05d..6a7d9c05c0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h index 895a662e52..93579ef08b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. 
Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h index 5920d08bf7..d2ff326e20 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index a9e20e114f..d6d6c4f179 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h index 4477a385ed..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h index 67306c3922..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h index f797f85ca5..38fade09fb 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h index 949a42066d..40cb089135 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h index a9ae26b6dc..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h index d37eafb214..7756a71621 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index 674cfa4f06..23d4e8b670 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h index c926411529..5fa603d43c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc index 9d5f088f38..1bad694d1c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc index 8745b084d3..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h index fe63a7bb77..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h index 0c215f2583..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index ff63aa4a43..bcb73d7f01 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc index 343b6b8d9c..2b956730d4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc index acffa7c19e..3743934f41 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc index 05d1a08c2b..461ec5c3a5 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. 
Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc index dcafb44ee6..2bd7a9fcf9 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc index b58d908756..6e8657edca 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" diff --git a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h index f8f9fa7f9c..f772885631 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc index d3d01102fd..de87dcaf64 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h index 909446af7b..390766116b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h index 866b7640f6..4e7ab03fa2 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h index 7edefa3389..6f6cee64d6 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 diff --git a/epochX/cudacpp/gg_ttgg.mad/src/rambo.h b/epochX/cudacpp/gg_ttgg.mad/src/rambo.h index 3a331b979a..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/rambo.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" From 1207f1f73d22108a97a0ac3954b08e41fc504455 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 19:33:10 +0200 Subject: [PATCH 385/509] [jthip] regenerate with HIP all other 6 processes mad and 7 processes sa --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 22 +- epochX/cudacpp/ee_mumu.mad/COPYRIGHT | 1 + .../ee_mumu.mad/Source/DHELAS/aloha_file.inc | 2 +- .../cudacpp/ee_mumu.mad/SubProcesses/Bridge.h | 5 +- .../ee_mumu.mad/SubProcesses/BridgeKernels.cc | 3 +- .../ee_mumu.mad/SubProcesses/BridgeKernels.h | 3 +- .../SubProcesses/CommonRandomNumberKernel.cc | 2 +- .../SubProcesses/CrossSectionKernels.cc | 3 +- .../SubProcesses/CrossSectionKernels.h | 3 +- .../SubProcesses/CurandRandomNumberKernel.cc | 9 +- .../SubProcesses/EventStatistics.h | 4 +- .../ee_mumu.mad/SubProcesses/GpuAbstraction.h | 103 +++--- .../ee_mumu.mad/SubProcesses/GpuRuntime.h | 11 +- .../ee_mumu.mad/SubProcesses/MadgraphTest.h | 4 +- .../SubProcesses/MatrixElementKernels.cc | 22 +- .../SubProcesses/MatrixElementKernels.h | 4 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 2 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../ee_mumu.mad/SubProcesses/MemoryBuffers.h | 2 +- .../SubProcesses/P1_epem_mupmum/CPPProcess.cc | 13 +- .../SubProcesses/P1_epem_mupmum/CPPProcess.h | 3 +- .../SubProcesses/P1_epem_mupmum/CudaRuntime.h | 1 - .../SubProcesses/P1_epem_mupmum/check_sa.cc | 62 ++-- .../SubProcesses/RamboSamplingKernels.cc | 6 +- .../SubProcesses/RamboSamplingKernels.h | 4 +- .../SubProcesses/RandomNumberKernels.h | 4 +- .../ee_mumu.mad/SubProcesses/cudacpp.mk | 2 +- .../ee_mumu.mad/SubProcesses/fbridge.cc | 4 +- .../ee_mumu.mad/SubProcesses/fsampler.cc | 3 +- .../ee_mumu.mad/SubProcesses/runTest.cc | 3 +- .../ee_mumu.mad/SubProcesses/testmisc.cc | 4 +- .../ee_mumu.mad/SubProcesses/testxxx.cc | 3 +- .../cudacpp/ee_mumu.mad/src/GpuAbstraction.h | 1 - epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h | 2 +- .../cudacpp/ee_mumu.mad/src/Parameters_sm.cc | 2 +- .../cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 79 +++-- .../cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h | 7 +- .../cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h | 2 +- .../cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h | 2 - epochX/cudacpp/ee_mumu.mad/src/rambo.h | 2 +- .../CODEGEN_cudacpp_ee_mumu_log.txt | 16 +- epochX/cudacpp/ee_mumu.sa/COPYRIGHT | 1 + .../cudacpp/ee_mumu.sa/SubProcesses/Bridge.h | 32 +- .../ee_mumu.sa/SubProcesses/BridgeKernels.cc | 9 +- .../ee_mumu.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../ee_mumu.sa/SubProcesses/EventStatistics.h | 4 +- .../ee_mumu.sa/SubProcesses/GpuAbstraction.h | 71 +++++ .../SubProcesses/GpuRuntime.h} | 50 +-- .../ee_mumu.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../ee_mumu.sa/SubProcesses/MemoryBuffers.h | 64 ++-- .../P1_Sigma_sm_epem_mupmum/CPPProcess.cc | 64 ++-- .../P1_Sigma_sm_epem_mupmum/CPPProcess.h | 10 +- .../P1_Sigma_sm_epem_mupmum/CudaRuntime.h | 1 - 
.../P1_Sigma_sm_epem_mupmum/GpuAbstraction.h | 1 + .../P1_Sigma_sm_epem_mupmum/GpuRuntime.h | 1 + .../P1_Sigma_sm_epem_mupmum/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../ee_mumu.sa/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../ee_mumu.sa/SubProcesses/fbridge.cc | 16 +- .../ee_mumu.sa/SubProcesses/fsampler.cc | 8 +- .../ee_mumu.sa/SubProcesses/runTest.cc | 12 +- .../ee_mumu.sa/SubProcesses/testmisc.cc | 4 +- .../ee_mumu.sa/SubProcesses/testxxx.cc | 8 +- epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h | 4 +- .../cudacpp/ee_mumu.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h | 6 +- epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h | 70 +++-- .../cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h | 18 +- .../cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h | 10 +- .../cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h | 14 +- epochX/cudacpp/ee_mumu.sa/src/rambo.h | 8 +- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 30 +- epochX/cudacpp/gg_tt.mad/COPYRIGHT | 1 + .../gg_tt.mad/Source/DHELAS/aloha_file.inc | 2 +- .../cudacpp/gg_tt.mad/SubProcesses/Bridge.h | 32 +- .../gg_tt.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gg_tt.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../gg_tt.mad/SubProcesses/EventStatistics.h | 4 +- .../gg_tt.mad/SubProcesses/GpuAbstraction.h | 71 +++++ .../{CudaRuntime.h => GpuRuntime.h} | 54 ++-- .../gg_tt.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../gg_tt.mad/SubProcesses/MemoryBuffers.h | 64 ++-- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 62 ++-- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttx/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttx/GpuAbstraction.h | 1 + .../SubProcesses/P1_gg_ttx/GpuRuntime.h | 1 + .../SubProcesses/P1_gg_ttx/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../cudacpp/gg_tt.mad/SubProcesses/fbridge.cc | 16 +- .../gg_tt.mad/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gg_tt.mad/SubProcesses/runTest.cc | 12 +- .../gg_tt.mad/SubProcesses/testmisc.cc | 4 +- .../cudacpp/gg_tt.mad/SubProcesses/testxxx.cc | 8 +- epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h | 6 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h | 70 +++-- epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h | 18 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h | 10 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h | 14 +- epochX/cudacpp/gg_tt.mad/src/rambo.h | 8 +- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 26 +- epochX/cudacpp/gg_tt.sa/COPYRIGHT | 1 + epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h | 32 +- .../gg_tt.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gg_tt.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- 
.../SubProcesses/CrossSectionKernels.h | 6 +- .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../gg_tt.sa/SubProcesses/EventStatistics.h | 4 +- .../gg_tt.sa/SubProcesses/GpuAbstraction.h | 71 +++++ .../{CudaRuntime.h => GpuRuntime.h} | 54 ++-- .../gg_tt.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../gg_tt.sa/SubProcesses/MemoryBuffers.h | 64 ++-- .../P1_Sigma_sm_gg_ttx/CPPProcess.cc | 62 ++-- .../P1_Sigma_sm_gg_ttx/CPPProcess.h | 10 +- .../P1_Sigma_sm_gg_ttx/CudaRuntime.h | 1 - .../P1_Sigma_sm_gg_ttx/GpuAbstraction.h | 1 + .../P1_Sigma_sm_gg_ttx/GpuRuntime.h | 1 + .../P1_Sigma_sm_gg_ttx/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../cudacpp/gg_tt.sa/SubProcesses/fbridge.cc | 16 +- .../cudacpp/gg_tt.sa/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gg_tt.sa/SubProcesses/runTest.cc | 12 +- .../cudacpp/gg_tt.sa/SubProcesses/testmisc.cc | 4 +- .../cudacpp/gg_tt.sa/SubProcesses/testxxx.cc | 8 +- epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h | 6 +- epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h | 70 +++-- epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h | 18 +- epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h | 10 +- epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h | 14 +- epochX/cudacpp/gg_tt.sa/src/rambo.h | 8 +- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 50 +-- epochX/cudacpp/gg_tt01g.mad/COPYRIGHT | 1 + .../gg_tt01g.mad/Source/DHELAS/aloha_file.inc | 2 +- .../gg_tt01g.mad/SubProcesses/Bridge.h | 32 +- .../SubProcesses/BridgeKernels.cc | 9 +- .../gg_tt01g.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_tt01g.mad/SubProcesses/CudaRuntime.h | 85 ----- .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../SubProcesses/EventStatistics.h | 4 +- .../SubProcesses/GpuAbstraction.h | 71 +++++ .../SubProcesses/GpuRuntime.h} | 54 ++-- .../gg_tt01g.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../gg_tt01g.mad/SubProcesses/MemoryBuffers.h | 64 ++-- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 62 ++-- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttx/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttx/GpuAbstraction.h | 1 + .../SubProcesses/P1_gg_ttx/GpuRuntime.h | 1 + .../SubProcesses/P1_gg_ttx/check_sa.cc | 103 +++--- .../SubProcesses/P2_gg_ttxg/CPPProcess.cc | 62 ++-- .../SubProcesses/P2_gg_ttxg/CPPProcess.h | 10 +- .../SubProcesses/P2_gg_ttxg/CudaRuntime.h | 1 - .../SubProcesses/P2_gg_ttxg/GpuAbstraction.h | 1 + .../SubProcesses/P2_gg_ttxg/GpuRuntime.h | 1 + .../SubProcesses/P2_gg_ttxg/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- 
.../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_tt01g.mad/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../gg_tt01g.mad/SubProcesses/fbridge.cc | 16 +- .../gg_tt01g.mad/SubProcesses/fsampler.cc | 8 +- .../gg_tt01g.mad/SubProcesses/runTest.cc | 12 +- .../gg_tt01g.mad/SubProcesses/testmisc.cc | 4 +- .../gg_tt01g.mad/SubProcesses/testxxx.cc | 8 +- epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_tt01g.mad/src/Parameters_sm.cc | 2 +- .../cudacpp/gg_tt01g.mad/src/Parameters_sm.h | 6 +- .../cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h | 70 +++-- .../cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h | 18 +- .../cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h | 10 +- .../cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h | 14 +- epochX/cudacpp/gg_tt01g.mad/src/rambo.h | 8 +- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 38 +-- epochX/cudacpp/gg_ttg.mad/COPYRIGHT | 1 + .../gg_ttg.mad/Source/DHELAS/aloha_file.inc | 2 +- .../cudacpp/gg_ttg.mad/SubProcesses/Bridge.h | 32 +- .../gg_ttg.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttg.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttg.mad/SubProcesses/CudaRuntime.h | 85 ----- .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../gg_ttg.mad/SubProcesses/EventStatistics.h | 4 +- .../gg_ttg.mad/SubProcesses/GpuAbstraction.h | 71 +++++ .../gg_ttg.mad/SubProcesses/GpuRuntime.h | 85 +++++ .../gg_ttg.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../gg_ttg.mad/SubProcesses/MemoryBuffers.h | 64 ++-- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 62 ++-- .../SubProcesses/P1_gg_ttxg/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttxg/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttxg/GpuAbstraction.h | 1 + .../SubProcesses/P1_gg_ttxg/GpuRuntime.h | 1 + .../SubProcesses/P1_gg_ttxg/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttg.mad/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../gg_ttg.mad/SubProcesses/fbridge.cc | 16 +- .../gg_ttg.mad/SubProcesses/fsampler.cc | 8 +- .../gg_ttg.mad/SubProcesses/runTest.cc | 12 +- .../gg_ttg.mad/SubProcesses/testmisc.cc | 4 +- .../gg_ttg.mad/SubProcesses/testxxx.cc | 8 +- epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_ttg.mad/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h | 6 +- epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h | 70 +++-- .../cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h | 18 +- .../cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h | 10 +- .../cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h | 14 +- epochX/cudacpp/gg_ttg.mad/src/rambo.h | 8 +- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 32 +- epochX/cudacpp/gg_ttg.sa/COPYRIGHT | 1 + .../cudacpp/gg_ttg.sa/SubProcesses/Bridge.h | 32 +- .../gg_ttg.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttg.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttg.sa/SubProcesses/CudaRuntime.h | 85 ----- .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- 
.../gg_ttg.sa/SubProcesses/EventStatistics.h | 4 +- .../gg_ttg.sa/SubProcesses/GpuAbstraction.h | 71 +++++ .../gg_ttg.sa/SubProcesses/GpuRuntime.h | 85 +++++ .../gg_ttg.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../gg_ttg.sa/SubProcesses/MemoryBuffers.h | 64 ++-- .../P1_Sigma_sm_gg_ttxg/CPPProcess.cc | 62 ++-- .../P1_Sigma_sm_gg_ttxg/CPPProcess.h | 10 +- .../P1_Sigma_sm_gg_ttxg/CudaRuntime.h | 1 - .../P1_Sigma_sm_gg_ttxg/GpuAbstraction.h | 1 + .../P1_Sigma_sm_gg_ttxg/GpuRuntime.h | 1 + .../P1_Sigma_sm_gg_ttxg/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc | 16 +- .../gg_ttg.sa/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gg_ttg.sa/SubProcesses/runTest.cc | 12 +- .../gg_ttg.sa/SubProcesses/testmisc.cc | 4 +- .../cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc | 8 +- epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h | 6 +- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h | 70 +++-- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h | 18 +- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h | 10 +- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h | 14 +- epochX/cudacpp/gg_ttg.sa/src/rambo.h | 8 +- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 36 +-- epochX/cudacpp/gg_ttgg.sa/COPYRIGHT | 1 + .../cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h | 32 +- .../gg_ttgg.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttgg.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttgg.sa/SubProcesses/CudaRuntime.h | 85 ----- .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../gg_ttgg.sa/SubProcesses/EventStatistics.h | 4 +- .../gg_ttgg.sa/SubProcesses/GpuAbstraction.h | 71 +++++ .../gg_ttgg.sa/SubProcesses/GpuRuntime.h | 85 +++++ .../gg_ttgg.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../gg_ttgg.sa/SubProcesses/MemoryBuffers.h | 64 ++-- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.cc | 62 ++-- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.h | 10 +- .../P1_Sigma_sm_gg_ttxgg/CudaRuntime.h | 1 - .../P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h | 1 + .../P1_Sigma_sm_gg_ttxgg/GpuRuntime.h | 1 + .../P1_Sigma_sm_gg_ttxgg/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttgg.sa/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../gg_ttgg.sa/SubProcesses/fbridge.cc | 16 +- .../gg_ttgg.sa/SubProcesses/fsampler.cc | 8 +- .../gg_ttgg.sa/SubProcesses/runTest.cc | 12 +- .../gg_ttgg.sa/SubProcesses/testmisc.cc | 4 +- .../gg_ttgg.sa/SubProcesses/testxxx.cc | 8 +- epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h | 4 +- 
.../cudacpp/gg_ttgg.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h | 6 +- epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h | 70 +++-- .../cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h | 18 +- .../cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h | 10 +- .../cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h | 14 +- epochX/cudacpp/gg_ttgg.sa/src/rambo.h | 8 +- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 48 +-- epochX/cudacpp/gg_ttggg.mad/COPYRIGHT | 1 + .../gg_ttggg.mad/Source/DHELAS/aloha_file.inc | 2 +- .../gg_ttggg.mad/SubProcesses/Bridge.h | 32 +- .../SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttggg.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttggg.mad/SubProcesses/CudaRuntime.h | 85 ----- .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../SubProcesses/EventStatistics.h | 4 +- .../SubProcesses/GpuAbstraction.h | 71 +++++ .../gg_ttggg.mad/SubProcesses/GpuRuntime.h | 85 +++++ .../gg_ttggg.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../gg_ttggg.mad/SubProcesses/MemoryBuffers.h | 64 ++-- .../SubProcesses/P1_gg_ttxggg/CPPProcess.cc | 62 ++-- .../SubProcesses/P1_gg_ttxggg/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttxggg/CudaRuntime.h | 1 - .../P1_gg_ttxggg/GpuAbstraction.h | 1 + .../SubProcesses/P1_gg_ttxggg/GpuRuntime.h | 1 + .../SubProcesses/P1_gg_ttxggg/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttggg.mad/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../gg_ttggg.mad/SubProcesses/fbridge.cc | 16 +- .../gg_ttggg.mad/SubProcesses/fsampler.cc | 8 +- .../gg_ttggg.mad/SubProcesses/runTest.cc | 12 +- .../gg_ttggg.mad/SubProcesses/testmisc.cc | 4 +- .../gg_ttggg.mad/SubProcesses/testxxx.cc | 8 +- epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_ttggg.mad/src/Parameters_sm.cc | 2 +- .../cudacpp/gg_ttggg.mad/src/Parameters_sm.h | 6 +- .../cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h | 70 +++-- .../cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h | 18 +- .../cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h | 10 +- .../cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h | 14 +- epochX/cudacpp/gg_ttggg.mad/src/rambo.h | 8 +- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 40 +-- epochX/cudacpp/gg_ttggg.sa/COPYRIGHT | 1 + .../cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h | 32 +- .../gg_ttggg.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttggg.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttggg.sa/SubProcesses/CudaRuntime.h | 85 ----- .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../SubProcesses/EventStatistics.h | 4 +- .../gg_ttggg.sa/SubProcesses/GpuAbstraction.h | 71 +++++ .../gg_ttggg.sa/SubProcesses/GpuRuntime.h | 85 +++++ .../gg_ttggg.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 
+- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../gg_ttggg.sa/SubProcesses/MemoryBuffers.h | 64 ++-- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.cc | 62 ++-- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.h | 10 +- .../P1_Sigma_sm_gg_ttxggg/CudaRuntime.h | 1 - .../P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h | 1 + .../P1_Sigma_sm_gg_ttxggg/GpuRuntime.h | 1 + .../P1_Sigma_sm_gg_ttxggg/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttggg.sa/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../gg_ttggg.sa/SubProcesses/fbridge.cc | 16 +- .../gg_ttggg.sa/SubProcesses/fsampler.cc | 8 +- .../gg_ttggg.sa/SubProcesses/runTest.cc | 12 +- .../gg_ttggg.sa/SubProcesses/testmisc.cc | 4 +- .../gg_ttggg.sa/SubProcesses/testxxx.cc | 8 +- epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_ttggg.sa/src/Parameters_sm.cc | 2 +- .../cudacpp/gg_ttggg.sa/src/Parameters_sm.h | 6 +- .../cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h | 70 +++-- .../cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h | 18 +- .../cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h | 10 +- .../cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h | 14 +- epochX/cudacpp/gg_ttggg.sa/src/rambo.h | 8 +- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 38 +-- epochX/cudacpp/gq_ttq.mad/COPYRIGHT | 1 + .../gq_ttq.mad/Source/DHELAS/aloha_file.inc | 2 +- .../cudacpp/gq_ttq.mad/SubProcesses/Bridge.h | 32 +- .../gq_ttq.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gq_ttq.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gq_ttq.mad/SubProcesses/CudaRuntime.h | 85 ----- .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../gq_ttq.mad/SubProcesses/EventStatistics.h | 4 +- .../gq_ttq.mad/SubProcesses/GpuAbstraction.h | 71 +++++ .../gq_ttq.mad/SubProcesses/GpuRuntime.h | 85 +++++ .../gq_ttq.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../gq_ttq.mad/SubProcesses/MemoryBuffers.h | 64 ++-- .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 64 ++-- .../SubProcesses/P1_gu_ttxu/CPPProcess.h | 10 +- .../SubProcesses/P1_gu_ttxu/CudaRuntime.h | 1 - .../SubProcesses/P1_gu_ttxu/GpuAbstraction.h | 1 + .../SubProcesses/P1_gu_ttxu/GpuRuntime.h | 1 + .../SubProcesses/P1_gu_ttxu/check_sa.cc | 103 +++--- .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 62 ++-- .../SubProcesses/P1_gux_ttxux/CPPProcess.h | 10 +- .../SubProcesses/P1_gux_ttxux/CudaRuntime.h | 1 - .../P1_gux_ttxux/GpuAbstraction.h | 1 + .../SubProcesses/P1_gux_ttxux/GpuRuntime.h | 1 + .../SubProcesses/P1_gux_ttxux/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gq_ttq.mad/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../gq_ttq.mad/SubProcesses/fbridge.cc | 16 +- .../gq_ttq.mad/SubProcesses/fsampler.cc | 8 +- .../gq_ttq.mad/SubProcesses/runTest.cc | 12 +- .../gq_ttq.mad/SubProcesses/testmisc.cc | 4 +- .../gq_ttq.mad/SubProcesses/testxxx.cc | 8 +- epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/gq_ttq.mad/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h | 6 +- 
epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h | 70 +++-- .../cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h | 18 +- .../cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h | 10 +- .../cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h | 14 +- epochX/cudacpp/gq_ttq.mad/src/rambo.h | 8 +- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 30 +- epochX/cudacpp/gq_ttq.sa/COPYRIGHT | 1 + .../cudacpp/gq_ttq.sa/SubProcesses/Bridge.h | 32 +- .../gq_ttq.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gq_ttq.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gq_ttq.sa/SubProcesses/CudaRuntime.h | 85 ----- .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../gq_ttq.sa/SubProcesses/EventStatistics.h | 4 +- .../gq_ttq.sa/SubProcesses/GpuAbstraction.h | 71 +++++ .../gq_ttq.sa/SubProcesses/GpuRuntime.h | 85 +++++ .../gq_ttq.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../gq_ttq.sa/SubProcesses/MemoryBuffers.h | 64 ++-- .../P1_Sigma_sm_gu_ttxu/CPPProcess.cc | 64 ++-- .../P1_Sigma_sm_gu_ttxu/CPPProcess.h | 10 +- .../P1_Sigma_sm_gu_ttxu/CudaRuntime.h | 1 - .../P1_Sigma_sm_gu_ttxu/GpuAbstraction.h | 1 + .../P1_Sigma_sm_gu_ttxu/GpuRuntime.h | 1 + .../P1_Sigma_sm_gu_ttxu/check_sa.cc | 103 +++--- .../P1_Sigma_sm_gux_ttxux/CPPProcess.cc | 62 ++-- .../P1_Sigma_sm_gux_ttxux/CPPProcess.h | 10 +- .../P1_Sigma_sm_gux_ttxux/CudaRuntime.h | 1 - .../P1_Sigma_sm_gux_ttxux/GpuAbstraction.h | 1 + .../P1_Sigma_sm_gux_ttxux/GpuRuntime.h | 1 + .../P1_Sigma_sm_gux_ttxux/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc | 16 +- .../gq_ttq.sa/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gq_ttq.sa/SubProcesses/runTest.cc | 12 +- .../gq_ttq.sa/SubProcesses/testmisc.cc | 4 +- .../cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc | 8 +- epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h | 6 +- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h | 70 +++-- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h | 18 +- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h | 10 +- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h | 14 +- epochX/cudacpp/gq_ttq.sa/src/rambo.h | 8 +- .../CODEGEN_cudacpp_heft_gg_h_log.txt | 24 +- epochX/cudacpp/heft_gg_h.sa/COPYRIGHT | 1 + .../heft_gg_h.sa/SubProcesses/Bridge.h | 32 +- .../SubProcesses/BridgeKernels.cc | 9 +- .../heft_gg_h.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../heft_gg_h.sa/SubProcesses/CudaRuntime.h | 85 ----- .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../SubProcesses/EventStatistics.h | 4 +- .../SubProcesses/GpuAbstraction.h | 71 +++++ .../heft_gg_h.sa/SubProcesses/GpuRuntime.h | 85 +++++ .../heft_gg_h.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- 
.../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../heft_gg_h.sa/SubProcesses/MemoryBuffers.h | 64 ++-- .../P1_Sigma_heft_gg_h/CPPProcess.cc | 62 ++-- .../P1_Sigma_heft_gg_h/CPPProcess.h | 10 +- .../P1_Sigma_heft_gg_h/CudaRuntime.h | 1 - .../P1_Sigma_heft_gg_h/GpuAbstraction.h | 1 + .../P1_Sigma_heft_gg_h/GpuRuntime.h | 1 + .../P1_Sigma_heft_gg_h/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../heft_gg_h.sa/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../heft_gg_h.sa/SubProcesses/fbridge.cc | 16 +- .../heft_gg_h.sa/SubProcesses/fsampler.cc | 8 +- .../heft_gg_h.sa/SubProcesses/runTest.cc | 12 +- .../heft_gg_h.sa/SubProcesses/testmisc.cc | 4 +- .../heft_gg_h.sa/SubProcesses/testxxx.cc | 8 +- .../cudacpp/heft_gg_h.sa/src/HelAmps_heft.h | 4 +- .../heft_gg_h.sa/src/Parameters_heft.cc | 2 +- .../heft_gg_h.sa/src/Parameters_heft.h | 6 +- .../cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h | 70 +++-- .../cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h | 18 +- .../cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h | 10 +- .../cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h | 14 +- epochX/cudacpp/heft_gg_h.sa/src/rambo.h | 8 +- 590 files changed, 8424 insertions(+), 6159 deletions(-) delete mode 120000 epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h delete mode 120000 epochX/cudacpp/ee_mumu.mad/src/GpuAbstraction.h create mode 100644 epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h rename epochX/cudacpp/{ee_mumu.mad/SubProcesses/CudaRuntime.h => ee_mumu.sa/SubProcesses/GpuRuntime.h} (64%) delete mode 120000 epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h create mode 120000 epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuAbstraction.h create mode 120000 epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuRuntime.h create mode 100644 epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h rename epochX/cudacpp/gg_tt.mad/SubProcesses/{CudaRuntime.h => GpuRuntime.h} (62%) delete mode 120000 epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h create mode 100644 epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h rename epochX/cudacpp/gg_tt.sa/SubProcesses/{CudaRuntime.h => GpuRuntime.h} (62%) delete mode 120000 epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuRuntime.h delete mode 100644 epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h rename epochX/cudacpp/{ee_mumu.sa/SubProcesses/CudaRuntime.h => gg_tt01g.mad/SubProcesses/GpuRuntime.h} (62%) delete mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h create mode 120000 
epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h create mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h create mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h delete mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h create mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h create mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h delete mode 100644 epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h delete 
mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h create mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuAbstraction.h create mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuRuntime.h delete mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h create mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuAbstraction.h create mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuRuntime.h delete mode 100644 epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h create mode 120000 epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuAbstraction.h create mode 120000 epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuRuntime.h diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index 528176e84e..fc7f62d186 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004788637161254883  +DEBUG: model prefixing takes 0.005497932434082031  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -176,7 +176,7 @@ INFO: Creating files in directory P1_epem_mupmum DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1028]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1029]  DEBUG: proc_id =  1 [model_handling.py at line 1034]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1286]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1288]  @@ -191,7 +191,7 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2] [model_handling.py at line 1152]  DEBUG: multi_channel =  {1: [0], 2: [1]} [model_handling.py at line 1158]  DEBUG: multi_channel_map =  {1: [0], 2: [1]} [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {1: 1, 2: 2} [model_handling.py at line 1698]  +DEBUG: diag_to_config =  {1: 1, 2: 2} [model_handling.py at line 1700]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -210,19 +210,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.094 s +Wrote files for 8 helas calls in 0.096 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.260 s +ALOHA: aloha creates 3 routines in 0.188 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.239 s +ALOHA: aloha creates 7 routines in 0.238 s FFV1 FFV1 FFV2 @@ -260,6 +260,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.855s -user 0m1.538s -sys 0m0.189s +real 0m1.891s +user 0m1.628s +sys 0m0.195s diff --git a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT +++ b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc index 4f385d6435..738db319fd 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV2_3.o FFV2_0.o FFV4_0.o FFV4_3.o FFV1_0.o FFV1P0_3.o +ALOHARoutine = FFV1_0.o FFV4_3.o FFV1P0_3.o FFV2_0.o FFV4_0.o FFV2_3.o diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index 1ff661c20a..c04628dfd1 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -10,7 +10,6 @@ #include "CPPProcess.h" // for CPPProcess #include "CrossSectionKernels.h" // for flagAbnormalMEs -#include "GpuRuntime.h" // for CUDA/HIP runtime, also includes GPU abstraction #include "MatrixElementKernels.h" // for MatrixElementKernelHost, MatrixElementKernelDevice #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc @@ -291,7 +290,7 @@ namespace mg5amcCpu gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - gpuLaunchKernel(dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc index 58b2f43cbe..90c7f2d3b8 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc @@ -1,12 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" #include "GpuAbstraction.h" - #include "MemoryAccessMomenta.h" #include diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h index 4f0a560d4b..3efef8ce97 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -9,7 +9,6 @@ #include "mgOnGpuConfig.h" #include "Bridge.h" -#include "GpuAbstraction.h" #include "MatrixElementKernels.h" #include "MemoryBuffers.h" diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc index f17b9c0ad7..010bc4cbd0 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
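[Editorial aside, not part of the patch: the Bridge.h hunk above launches the momenta-transpose kernel through the gpuLaunchKernel macro instead of a raw CUDA triple-chevron launch. A minimal sketch of the pattern, assuming the macro definitions from GpuAbstraction.h; the kernel name dev_scale and the launch sizes are illustrative, not taken from the patch:

    // A toy __global__ kernel, launched in the same style as dev_transposeMomentaF2C above.
    __global__ void dev_scale( double* data, int n )
    {
      const int i = blockDim.x * blockIdx.x + threadIdx.x;
      if( i < n ) data[i] *= 2.;
    }
    // Host side: on CUDA this expands to dev_scale<<<nblocks, nthreads>>>( devData, n );
    // gpuLaunchKernel( dev_scale, nblocks, nthreads, devData, n );

]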
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" #include "GpuAbstraction.h" diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc index cddb05658b..c15b39844d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc @@ -1,12 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" #include "GpuAbstraction.h" - #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h index a928d96ea9..4d9659e04e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h @@ -1,13 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 #include "mgOnGpuConfig.h" -#include "GpuAbstraction.h" #include "EventStatistics.h" #include "MemoryBuffers.h" diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc index 82dbcbabc4..38c477c17a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,8 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
-#include "CommonRandomNumbers.h" #include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -23,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef MGONGPUCPP_CUDACC +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -37,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef MGONGPUCPP_CUDACC +#ifdef __CUDACC__ if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -132,4 +131,4 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #endif -} \ No newline at end of file +} diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h index 6f4e1726ff..b425a5bade 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -10,8 +10,6 @@ #include "CPPProcess.h" // for npar (meGeVexponent) -#include "GpuAbstraction.h" - #include #include #include diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 782cb96e8c..6a7d9c05c0 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -1,86 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+
 #ifndef MG5AMC_GPUABSTRACTION_H
 #define MG5AMC_GPUABSTRACTION_H 1
 #include
-/*
- ToDo:
- * Fix rpath in makefile when compiling with HIP
- * Fix warnings with improper hip function return code handling
-*/
-
+//--------------------------------------------------------------------------
 #ifdef __CUDACC__
- #define MGONGPUCPP_CUDACC 1
-#endif
-
-#ifdef __HIPCC__
- #include "hip/hip_runtime.h"
- #define MGONGPUCPP_HIPCC 1
-#endif
-
-#ifdef MGONGPUCPP_CUDACC
-
- // Defines correct compiler
- #define MGONGPUCPP_GPUIMPL __CUDACC__
- //--------------------------------------------------------------------------
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
- #define gpuError_t cudaError_t
- #define gpuPeekAtLastError cudaPeekAtLastError
- #define gpuGetErrorString cudaGetErrorString
- #define gpuSuccess cudaSuccess
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
- #define gpuMallocHost(ptr, size) checkGpu( cudaMallocHost(ptr, size) )
- #define gpuMalloc(ptr, size) checkGpu( cudaMalloc(ptr, size) )
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
- #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( cudaMemcpy(dstData, srcData, srcBytes, func) )
- #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
- #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
- #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( cudaMemcpyToSymbol(type1, type2, size) )
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
- #define gpuFree(ptr) checkGpu( cudaFree(ptr) )
- #define gpuFreeHost(ptr) checkGpu( cudaFreeHost(ptr) )
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
- #define gpuSetDevice cudaSetDevice
- #define gpuDeviceSynchronize cudaDeviceSynchronize
- #define gpuDeviceReset cudaDeviceReset
-
- #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__)
- #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__)
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
 //--------------------------------------------------------------------------
-#elif defined MGONGPUCPP_HIPCC
+#elif defined __HIPCC__
- // Defines correct compiler
- #define MGONGPUCPP_GPUIMPL __HCC__
+#include "hip/hip_runtime.h"
- //--------------------------------------------------------------------------
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
- #define gpuError_t hipError_t
- #define gpuPeekAtLastError hipPeekAtLastError
- #define gpuGetErrorString hipGetErrorString
- #define gpuSuccess hipSuccess
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
- #define gpuMallocHost(ptr, size) checkGpu( hipHostMalloc(ptr, size) ) // HostMalloc better
- #define gpuMalloc(ptr, size) checkGpu( hipMalloc(ptr, size) )
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
- #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( hipMemcpy(dstData, srcData, srcBytes, func) )
- #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
- #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
- #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( hipMemcpyToSymbol(type1, type2, size) )
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
- #define gpuFree(ptr) checkGpu( hipFree(ptr) )
- #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) )
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
- #define gpuSetDevice hipSetDevice
- #define gpuDeviceSynchronize hipDeviceSynchronize
- #define gpuDeviceReset hipDeviceReset
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
- #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__)
- #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__)
+//--------------------------------------------------------------------------
 #endif
-#endif // MG5AMC_GPUABSTRACTION_H
\ No newline at end of file
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h index caa301ef24..93579ef08b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
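[Editorial aside, not part of the patch: the GpuAbstraction.h hunk above replaces the old MGONGPUCPP_CUDACC/MGONGPUCPP_HIPCC indirection with direct __CUDACC__/__HIPCC__ branches, so a single gpu* vocabulary maps onto either the CUDA or the HIP runtime at compile time. A minimal usage sketch before the GpuRuntime.h hunk continues below; it assumes GpuRuntime.h is on the include path and supplies checkGpu, and the buffer size is illustrative:

    #include "GpuRuntime.h" // brings in GpuAbstraction.h and checkGpu
    #include <vector>
    int main()
    {
      const size_t n = 16;
      std::vector<double> host( n, 1. );
      double* dev = nullptr;
      gpuMalloc( &dev, n * sizeof( double ) ); // cudaMalloc under nvcc, hipMalloc under hipcc
      gpuMemcpy( dev, host.data(), n * sizeof( double ), gpuMemcpyHostToDevice );
      gpuMemcpy( host.data(), dev, n * sizeof( double ), gpuMemcpyDeviceToHost );
      gpuFree( dev ); // cudaFree under nvcc, hipFree under hipcc
      return 0;
    }

The same translation unit then builds unchanged with nvcc or hipcc, which is the point of the abstraction layer.]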
+ #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -28,8 +33,8 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** struct GpuRuntime final { @@ -59,7 +64,7 @@ namespace mg5amcGpu // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; - checkGpu ( gpuSetDevice( 0 ) ); // SLOW! + checkGpu( gpuSetDevice( 0 ) ); // SLOW! } // Tear down CUDA application (call cudaDeviceReset) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h index 8b47cb4a44..d2ff326e20 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -12,8 +12,6 @@ #include -#include "GpuAbstraction.h" - #include #include #include diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index 50d8058adc..d6d6c4f179 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - gpuLaunchKernel(computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel(sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - gpuLaunchKernel(sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - gpuPeekAtLastError(); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - gpuLaunchKernel(computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernelSharedMem(sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - gpuLaunchKernelSharedMem(sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - gpuPeekAtLastError(); - gpuDeviceSynchronize(); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h index 8978b01ae0..72bd8f195b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
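[Editorial aside, not part of the patch: the MatrixElementKernels.cc hunk above wraps gpuPeekAtLastError() and gpuDeviceSynchronize() in checkGpu, so launch and execution failures now abort with a message instead of their return codes being silently discarded. A sketch of the resulting idiom, assuming checkGpu and the gpu* macros are in scope; the helper name launchChecked is invented for illustration:

    // Surface launch-configuration errors (peek) and asynchronous
    // execution errors (synchronize) immediately after each launch.
    template<typename Kernel, typename... Args>
    void launchChecked( Kernel kernel, int blocks, int threads, Args... args )
    {
      gpuLaunchKernel( kernel, blocks, threads, args... );
      checkGpu( gpuPeekAtLastError() );
      checkGpu( gpuDeviceSynchronize() );
    }

]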
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,8 +10,6 @@ #include "MemoryBuffers.h" -#include "GpuAbstraction.h" - #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h index b889e6b50d..db73e4e064 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -10,8 +10,6 @@ #include "mgOnGpuFptypes.h" -#include "GpuAbstraction.h" - //---------------------------------------------------------------------------- // A templated helper class that includes the boilerplate code for MemoryAccess classes diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h index f797f85ca5..38fade09fb 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h index 949a42066d..40cb089135 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h index 617d1df782..08faccff0f 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,8 +10,6 @@ #include "mgOnGpuVectors.h" -#include "GpuAbstraction.h" - #ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h index d37eafb214..7756a71621 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index c1e83f322b..9a16d0301d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -17,7 +17,6 @@ #include "mgOnGpuConfig.h" #include "HelAmps_sm.h" -#include "GpuRuntime.h" // for GPU abstraction, checkGpu is run on macros defined here #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" #include "MemoryAccessCouplingsFixed.h" @@ -34,7 +33,6 @@ #include #include -#include #include #include #include @@ -47,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 -#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -193,7 +191,7 @@ namespace mg5amcCpu #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings const fptype* allCOUPs[nxcoup]; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma nv_diagnostic push #pragma nv_diag_suppress 186 // e.g. 
<> #endif @@ -202,7 +200,9 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events #ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -818,6 +818,9 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index cec98f9fef..ebbc2800d3 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #define MG5_Sigma_sm_epem_mupmum_H 1 #include "mgOnGpuConfig.h" -#include "GpuRuntime.h" #include "mgOnGpuVectors.h" diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc index f339ad96a6..1bad694d1c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc @@ -4,15 +4,15 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//==========================================================================

 #include "mgOnGpuConfig.h"

 #include "BridgeKernels.h"
 #include "CPPProcess.h"
-#include "GpuRuntime.h"
 #include "CrossSectionKernels.h"
+#include "GpuRuntime.h"
 #include "MatrixElementKernels.h"
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
@@ -103,12 +103,12 @@ main( int argc, char** argv )
     CurandHost = 1,
     CurandDevice = 2
   };
-#ifdef MGONGPUCPP_CUDACPP
-  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU
+#ifdef __CUDACC__
+  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU
 #elif not defined MGONGPU_HAS_NO_CURAND
   RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
 #else
-  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU
 #endif
   // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
   enum class RamboSamplingMode
@@ -146,10 +146,10 @@ main( int argc, char** argv )
     }
     else if( arg == "--curdev" )
     {
-#ifdef MGONGPUCPP_CUDACC
+#ifdef __CUDACC__
       rndgen = RandomNumberMode::CurandDevice;
 #else
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" );
 #endif
     }
     else if( arg == "--curhst" )
@@ -266,12 +266,12 @@ main( int argc, char** argv )

 #ifdef MGONGPUCPP_GPUIMPL
-  // --- 00. Initialise cuda
-  // Instantiate a GpuRuntime (CUDA or HIP based on target arch) at the beginnining of the application's main to
-  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  GpuRuntime gpuRuntime( debug );
+  GpuRuntime GpuRuntime( debug );
 #endif

   // --- 0a. Initialise physics process
@@ -395,7 +395,7 @@ main( int argc, char** argv )
       const bool onDevice = false;
       prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) );
     }
-#ifdef MGONGPUCPP_CUDACC
+#ifdef __CUDACC__
     else
     {
       const bool onDevice = true;
@@ -404,7 +404,7 @@ main( int argc, char** argv )
 #else
     else
     {
-      throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
+      throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement)
     }
 #endif
 #else
@@ -730,17 +730,21 @@ main( int argc, char** argv )
     rndgentxt = "CURAND HOST";
   else if( rndgen == RandomNumberMode::CurandDevice )
     rndgentxt = "CURAND DEVICE";
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
+#elif defined __HIPCC__
+  rndgentxt += " (HIP code)";
 #else
   rndgentxt += " (C++ code)";
 #endif

   // Workflow description summary
   std::string wrkflwtxt;
-  // -- CUDA or C++?
-#ifdef MGONGPUCPP_GPUIMPL
+  // -- CUDA or HIP or C++?
+#ifdef __CUDACC__
   wrkflwtxt += "CUD:";
+#elif defined __HIPCC__
+  wrkflwtxt += "HIP:";
 #else
   wrkflwtxt += "CPP:";
 #endif
@@ -755,7 +759,7 @@ main( int argc, char** argv )
   wrkflwtxt += "???+"; // no path to this statement
 #endif
   // -- CUCOMPLEX or THRUST or STD complex numbers?
-#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; #elif defined MGONGPU_CUCXTYPE_THRUST @@ -765,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -865,8 +875,10 @@ main( int argc, char** argv ) #endif // Dump all configuration parameters and all results std::cout << std::string( SEP79, '*' ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -893,14 +905,14 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) @@ -1034,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc index 3e89b62180..79abbcc4f8 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" @@ -151,7 +151,7 @@ namespace mg5amcCpu void RamboSamplingKernelDevice::getMomentaInitial() { - gpuLaunchKernel(getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif @@ -175,7 +175,7 @@ namespace mg5amcCpu void RamboSamplingKernelDevice::getMomentaFinal() { - gpuLaunchKernel(getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h index abf95e8d1b..7c214cd74b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,8 +10,6 @@ #include "MemoryBuffers.h" -#include "GpuAbstraction.h" - #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h index 00a196d40c..21d63beeac 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h @@ -1,15 +1,13 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -#include "GpuAbstraction.h" - // NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index ff63aa4a43..bcb73d7f01 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. 
cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc index 1d584cf647..2b956730d4 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "GpuAbstraction.h" +#include "GpuRuntime.h" extern "C" { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc index db2e0e1a99..3743934f41 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc @@ -1,14 +1,13 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" #include "Bridge.h" #include "CPPProcess.h" #include "MemoryBuffers.h" -#include "GpuAbstraction.h" #include "RamboSamplingKernels.h" #include "RandomNumberKernels.h" diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc index 8806d36eda..461ec5c3a5 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc @@ -1,12 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" #include "CPPProcess.h" -#include "GpuAbstraction.h" // for CUDA/HIP runtime, also includes GPU abstraction #include "MadgraphTest.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc index 4231c5f524..2bd7a9fcf9 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
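The fbridge.cc, fsampler.cc, runTest.cc and testmisc.cc hunks above all drop (or, in fbridge.cc, swap for GpuRuntime.h) the direct #include "GpuAbstraction.h": after this patch the gpu* macros are meant to arrive transitively. A minimal sketch of that layering, under the assumption that GpuRuntime.h keeps including GpuAbstraction.h; the client file and helper are illustrative, not part of the patch:

#include "GpuRuntime.h" // also brings in GpuAbstraction.h, hence the gpu* macros

#include <cstddef>

#ifdef MGONGPUCPP_GPUIMPL
// fooCopyToDevice is a hypothetical helper, shown only to illustrate the include layering
void fooCopyToDevice( double* devBuf, const double* hstBuf, std::size_t bytes )
{
  gpuMemcpy( devBuf, hstBuf, bytes, gpuMemcpyHostToDevice ); // no direct GpuAbstraction.h include needed
}
#endif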
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -12,8 +12,6 @@ #include "epoch_process_id.h" -#include "GpuAbstraction.h" - #include #include diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc index 5356eea65a..6e8657edca 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc @@ -1,12 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" #include "CPPProcess.h" -#include "GpuAbstraction.h" #include "HelAmps_sm.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" diff --git a/epochX/cudacpp/ee_mumu.mad/src/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/src/GpuAbstraction.h deleted file mode 120000 index 4955c9171e..0000000000 --- a/epochX/cudacpp/ee_mumu.mad/src/GpuAbstraction.h +++ /dev/null @@ -1 +0,0 @@ -../SubProcesses/GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h index 54e070bfd7..c2c572778b 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc index daed91bb80..89bbb57a0d 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index c18e7c3dfa..390766116b 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -1,9 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#include "GpuAbstraction.h" +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -12,16 +10,27 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// Added support for HIP compilation by defining MGONGPU_HAS_NO_CURAND -#ifdef MGONGPUCPP_CUDACC -#undef MGONGPU_HAS_NO_CURAND -#elif defined MGONGPUCPP_HIPCC +#ifdef __CUDACC__ +#undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ #define MGONGPU_HAS_NO_CURAND 1 +#else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 #endif @@ -56,24 +65,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef MGONGPUCPP_GPUIMPL -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) -#ifdef MGONGPUCPP_CUDACC +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +#ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) -#elif defined MGONGPUCPP_HIPCC +//#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ #define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation -#ifdef MGONGPUCPP_CUDACC -#undef MGONGPU_NSIGHT_DEBUG // default +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +#ifdef __CUDACC__ +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -89,17 +102,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number 
implementation)
-#ifndef MGONGPUCPP_GPUIMPL
-#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL
+// SANITY CHECKS (CUDA complex number implementation)
+#ifdef __CUDACC__
+#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA
+#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA
+#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA
 #endif
 #endif

-// SANITY CHECKS (cuda complex number implementation)
-#ifdef MGONGPUCPP_GPUIMPL
-#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
+// SANITY CHECKS (C++ complex number implementation)
+#ifndef MGONGPUCPP_GPUIMPL
+#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++
 #endif
 #endif

@@ -147,7 +164,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;

 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef MGONGPUCPP_GPUIMPL // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -177,9 +194,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
-#if defined MGONGPUCPP_GPUIMPL && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
+#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -191,7 +208,7 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */

-// Define empty CUDA declaration specifiers for C++
+// Define empty CUDA/HIP declaration specifiers for C++
 #ifndef MGONGPUCPP_GPUIMPL
 #define __global__
 #define __host__
diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h
index eb395001f1..4e7ab03fa2 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #ifndef MGONGPUCXTYPES_H
 #define MGONGPUCXTYPES_H 1
@@ -19,7 +19,6 @@
 #include

 // Complex type in cuda: thrust or cucomplex or cxsmpl
-// #define THRUST_IGNORE_CUB_VERSION_CHECK
 #ifdef MGONGPUCPP_GPUIMPL
 #if defined MGONGPU_CUCXTYPE_THRUST
 #pragma clang diagnostic push
@@ -202,7 +201,7 @@ namespace mgOnGpu
 {

   // --- Type definitions (complex type: cxtype)
-#ifdef MGONGPUCPP_CUDACC // cuda
+#ifdef MGONGPUCPP_GPUIMPL // cuda
 #if defined MGONGPU_CUCXTYPE_THRUST
   typedef thrust::complex<fptype> cxtype;
 #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -236,7 +235,7 @@ using mgOnGpu::cxtype;
 // COMPLEX TYPES: (PLATFORM-SPECIFIC) FUNCTIONS AND OPERATORS
 //==========================================================================

-#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL or MGONGPUCPP_HIPCC
+#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL

 //------------------------------
 // CUDA or C++ - using cxsmpl
diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h
index 7edefa3389..6f6cee64d6 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h
+++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #ifndef MGONGPUFPTYPES_H
 #define MGONGPUFPTYPES_H 1
diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h
index 6273b1c852..7904b93c61 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h
@@ -9,8 +9,6 @@
 #include "mgOnGpuCxtypes.h"
 #include "mgOnGpuFptypes.h"

-#include "GpuAbstraction.h" // Includes required macros for GPU abstraction
-
 #include

 //==========================================================================
diff --git a/epochX/cudacpp/ee_mumu.mad/src/rambo.h b/epochX/cudacpp/ee_mumu.mad/src/rambo.h
index 3a331b979a..cd7e1008ea 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/rambo.h
+++ b/epochX/cudacpp/ee_mumu.mad/src/rambo.h
@@ -4,7 +4,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================

 #include "mgOnGpuConfig.h"
diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
index 138f426e62..f5f2ead4aa 100644
--- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
+++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
@@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 MG5_aMC> set lhapdf /PATH/TO/lhapdf-config
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
-No valid eps viewer found.
Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004713535308837891  +DEBUG: model prefixing takes 0.005217790603637695  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -185,7 +185,7 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1151]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1152]  DEBUG: multi_channel_map =  None [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1698]  +DEBUG: diag_to_config =  {} [model_handling.py at line 1700]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -202,7 +202,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.229 s +ALOHA: aloha creates 4 routines in 0.252 s FFV1 FFV1 FFV2 @@ -231,6 +231,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.652s -user 0m0.574s -sys 0m0.066s +real 0m0.762s +user 0m0.624s +sys 0m0.057s diff --git a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT +++ b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h index 4cafe0c997..c04628dfd1 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H
#define BRIDGE_H 1
@@ -22,7 +22,7 @@
 #include
 #include

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -82,7 +82,7 @@ namespace mg5amcCpu
     Bridge& operator=( const Bridge& ) = delete;
     Bridge& operator=( Bridge&& ) = delete;

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     /**
      * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads
      * (this is needed for BridgeKernel tests rather than for actual production use in Fortran)
@@ -149,7 +149,7 @@ namespace mg5amcCpu
     unsigned int m_nevt; // number of events
     int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     int m_gputhreads; // number of gpu threads (default set from number of events, can be modified)
     int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified)
     mg5amcGpu::DeviceBuffer m_devMomentaF;
@@ -186,12 +186,12 @@ namespace mg5amcCpu
   // Forward declare transposition methods
   //

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL

   template<typename Tin, typename Tout>
   __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );

-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL

   template<typename Tin, typename Tout>
   void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );
@@ -208,7 +208,7 @@ namespace mg5amcCpu
   Bridge<FORTRANFPTYPE>::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F )
     : m_nevt( nevtF )
     , m_nGoodHel( -1 )
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     , m_gputhreads( 256 ) // default number of gpu threads
     , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads
     , m_devMomentaF( m_nevt )
@@ -232,7 +232,7 @@ namespace mg5amcCpu
   {
     if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" );
     if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) )
       throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) );
     while( m_nevt != m_gpublocks * m_gputhreads )
@@ -250,11 +250,11 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
     mg5amcCpu::CPPProcess process( /*verbose=*/false );
     m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
     process.initProc( "../../Cards/param_card.dat" );
   }

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads )
   {
@@ -268,7 +268,7 @@ namespace mg5amcCpu
   }
 #endif

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta,
                                             const FORTRANFPTYPE* gs,
@@ -283,14 +283,14 @@ namespace mg5amcCpu
     constexpr int neppM = MemoryAccessMomenta::neppM;
     if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
     {
-      checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
     }
     else
     {
-      checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
       const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
       //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
-      dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+      gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
     }
     if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
     {
@@ -333,7 +333,7 @@ namespace mg5amcCpu
   }
 #endif

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta,
                                             const FORTRANFPTYPE* gs,
@@ -388,7 +388,7 @@ namespace mg5amcCpu
   // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
   //

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<typename Tin, typename Tout>
   __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
   {
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc
index cef4cb3c71..90c7f2d3b8 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc
@@ -1,10 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
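The Bridge.h hunk above shows the mechanical rule applied throughout this patch: every raw checkCuda( cudaMemcpy( ... ) ) becomes gpuMemcpy( ... ) with the error check folded into the macro, and every triple-chevron launch becomes a gpuLaunchKernel call. A minimal before/after sketch under that rule; the kernel and variables are illustrative, not from the patch:

#include "GpuRuntime.h"

#include <cstddef>

#ifdef MGONGPUCPP_GPUIMPL
__global__ void fooKernel( double* buf, unsigned int nevt ); // hypothetical kernel

void fooCopyAndLaunch( double* devBuf, const double* hstBuf, std::size_t bytes, int gpublocks, int gputhreads, unsigned int nevt )
{
  // Before: checkCuda( cudaMemcpy( devBuf, hstBuf, bytes, cudaMemcpyHostToDevice ) );
  gpuMemcpy( devBuf, hstBuf, bytes, gpuMemcpyHostToDevice ); // checkGpu is applied inside the macro
  // Before: fooKernel<<<gpublocks, gputhreads>>>( devBuf, nevt );
  gpuLaunchKernel( fooKernel, gpublocks, gputhreads, devBuf, nevt );
}
#endif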
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -14,7 +15,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..38c477c17a 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
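The #ifdef MGONGPUCPP_GPUIMPL namespace switch recurring in every file above is what lets one source file serve both builds: the same code compiles once into mg5amcGpu and once into mg5amcCpu, so the two objects can be linked into a single test executable without duplicate symbols. A minimal sketch of the pattern; the function is illustrative:

#include "mgOnGpuConfig.h"

#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  // Compiled in the GPU phase this is mg5amcGpu::fooNevents(),
  // in the C++ phase it is mg5amcCpu::fooNevents(): no symbol clash.
  inline unsigned int fooNevents( unsigned int gpublocks, unsigned int gputhreads )
  {
    return gpublocks * gputhreads;
  }
}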
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... 
) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h similarity index 64% rename from epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h rename to epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h index df0c3f3df8..93579ef08b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h @@ -1,28 +1,29 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api -#include +#include "GpuAbstraction.h" + #include //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) { - if( code != cudaSuccess ) + if( code != gpuSuccess ) { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); + printf( "ERROR! 
assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); } } #endif /* clang-format on */ @@ -32,18 +33,18 @@ inline void assertCuda( cudaError_t code, const char* file, int line, bool abort #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final + struct GpuRuntime final { - CudaRuntime( const bool debug = true ) + GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; bool m_debug; // Set up CUDA application @@ -62,8 +63,8 @@ namespace mg5amcGpu */ // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! } // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h index fd7734ce42..d2ff326e20 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
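Taken together, GpuAbstraction.h and GpuRuntime.h above fix the usage pattern that check_sa.cc follows: an RAII GpuRuntime pins the device at startup and books the reset at exit, and every runtime call goes through checkGpu. A minimal sketch assuming only the names defined in those two headers; the buffer and its size are illustrative:

#include "GpuRuntime.h"

#ifdef MGONGPUCPP_GPUIMPL
int main()
{
  mg5amcGpu::GpuRuntime gpuRuntime; // gpuSetDevice(0) now, gpuDeviceReset() when it goes out of scope
  double* devBuf = nullptr;
  gpuMalloc( (void**)&devBuf, 1024 * sizeof( double ) ); // checkGpu asserts on any runtime error
  gpuFree( devBuf );
  return 0;
}
#endif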
#ifndef MADGRAPHTEST_H_
#define MADGRAPHTEST_H_ 1
@@ -21,7 +21,7 @@
 #include
 #include

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using mg5amcGpu::CPPProcess;
 #else
 using mg5amcCpu::CPPProcess;
@@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam

 // Since we link both the CPU-only and GPU tests into the same executable, we prevent
 // a multiply defined symbol by only compiling this in the non-CUDA phase:
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL

 /// Compare momenta and matrix elements.
 /// This uses an implementation of TestDriverBase to run a madgraph workflow,
@@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME )
   }
 }

-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL

 #endif /* MADGRAPHTEST_H_ */
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc
index 30257195b6..d6d6c4f179 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc
@@ -1,12 +1,12 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #include "MatrixElementKernels.h"

 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation
 #include "MemoryAccessMomenta.h"
 #include "MemoryBuffers.h"

@@ -14,7 +14,7 @@

 //============================================================================

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu
 {

@@ -143,7 +143,7 @@ namespace mg5amcCpu

 //============================================================================

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {

@@ -202,13 +202,13 @@ namespace mg5amcGpu
     PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
     DeviceBufferHelicityMask devIsGoodHel( ncomb );
     // ... 0d1. Compute good helicity mask on the device
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
+    checkGpu( gpuPeekAtLastError() );
     // ... 0d2. Copy back good helicity mask to the host
     copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
     // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -219,19 +219,19 @@
   void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
   {
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
     constexpr unsigned int sharedMemSize = 0;
 #else
     constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
-    checkCuda( cudaDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() );
+    checkGpu( gpuDeviceSynchronize() );
   }

   //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h
index 23e84757a2..72bd8f195b 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #ifndef MATRIXELEMENTKERNELS_H
 #define MATRIXELEMENTKERNELS_H 1
@@ -10,7 +10,7 @@

 #include "MemoryBuffers.h"

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -81,7 +81,7 @@ namespace mg5amcCpu

 //--------------------------------------------------------------------------

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating matrix element calculations on a CPU host
 class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents
 {
@@ -130,7 +130,7 @@ namespace mg5amcCpu

 //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 // A class encapsulating matrix element calculations on a GPU device
 class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
 {
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h
index c82a6c7635..db73e4e064 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h index 0ac4faa3c7..38fade09fb 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
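The neppM discussion in MemoryAccessMomenta.h above concerns the AOSOA ("array of structs of arrays") momenta layout: events are grouped into pages of neppM events so that consecutive GPU threads read consecutive fptype's from global memory, giving coalesced access. As a rough illustration of the addressing this implies (a sketch with illustrative names, not the repo's accessor API):

// Sketch: AOSOA index of momentum component ip4 of particle ipar in event ievt,
// for a buffer with npar particles, np4 components and pages of neppM events.
// Consecutive ievt within one page differ only in the last (fastest) index,
// which is what makes per-thread loads coalesce on a GPU.
inline int momentaIndex( int ievt, int ipar, int ip4, int npar, int np4, int neppM )
{
  const int ipagM = ievt / neppM; // page containing this event
  const int ieppM = ievt % neppM; // event within the page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}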
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
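The gpuLaunchKernel and gpuLaunchKernelSharedMem calls introduced in MatrixElementKernels.cc above replace the CUDA-only triple-chevron launch syntax with something both nvcc and hipcc can compile. One plausible definition, assuming variadic macros that dispatch to CUDA chevrons or to hipLaunchKernelGGL (the real GpuAbstraction.h may differ):

// Sketch only: a possible implementation of the launch wrappers under the
// assumption stated above; not the verbatim repo code.
#ifdef __CUDACC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  kernel<<<( blocks ), ( threads ), ( sharedMem )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), ( sharedMem ), 0, __VA_ARGS__ )
#endif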
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 8bbc9ba493..709a3d6cdf 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MZ, (fptype)Parameters_sm::mdl_WZ }; __device__ const fptype cIPC[6] = { (fptype)Parameters_sm::GC_3.real(), (fptype)Parameters_sm::GC_3.imag(), (fptype)Parameters_sm::GC_50.real(), (fptype)Parameters_sm::GC_50.imag(), (fptype)Parameters_sm::GC_59.real(), (fptype)Parameters_sm::GC_59.imag() }; #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype cIPC[6]; #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -238,7 +239,7 @@ namespace mg5amcCpu // *** DIAGRAM 1 OF 2 *** // Wavefunction(s) for diagram number 1 -#if not( defined __CUDACC__ and defined MGONGPU_TEST_DIVERGENCE ) +#if not( defined MGONGPUCPP_GPUIMPL and defined MGONGPU_TEST_DIVERGENCE ) opzxxx( momenta, cHel[ihel][0], -1, w_fp[0], 0 ); // NB: opzxxx only uses pz #else if( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 == 0 ) @@ -291,7 +292,7 @@ namespace 
mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -348,7 +349,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -407,7 +408,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -454,8 +455,8 @@ namespace mg5amcCpu { 1, -1, 1, 1 }, { 1, -1, -1, -1 }, { 1, -1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -495,9 +496,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; const cxtype tIPC[3] = { cxmake( m_pars->GC_3 ), cxmake( m_pars->GC_50 ), cxmake( m_pars->GC_59 ) }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + gpuMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ); #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); memcpy( cIPC, tIPC, 3 * sizeof( cxtype ) ); @@ -534,7 +535,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file]
 // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712]
 #ifdef __NVCC__
@@ -599,12 +600,12 @@ namespace mg5amcCpu
 __global__ void /* clang-format off */
 computeDependentCouplings( const fptype* allgs, // input: Gs[nevt]
 fptype* allcouplings // output: couplings[nevt*ndcoup*2]
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
 ) /* clang-format on */
 {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using namespace mg5amcGpu;
 using G_ACCESS = DeviceAccessGs;
 using C_ACCESS = DeviceAccessCouplings;
@@ -625,7 +626,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
 __global__ void
 sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
 const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
@@ -754,9 +755,9 @@ namespace mg5amcCpu
 nGoodHel++;
 }
 }
-#ifdef __CUDACC__
- checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) );
- checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) );
+#ifdef MGONGPUCPP_GPUIMPL
+ gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) );
+ gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) );
 #else
 cNGoodHel = nGoodHel;
 for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel];
@@ -780,7 +781,7 @@ namespace mg5amcCpu
 #endif
 int* allselhel, // output: helicity selection[nevt]
 int* allselcol // output: color selection[nevt]
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
 ) /* clang-format on */
@@ -801,7 +802,7 @@ namespace mg5amcCpu
 // Denominators: spins, colors and identical particles
 constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 // Remember: in CUDA this is a kernel for one event, in c++ this processes n events
 const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
 #else
@@ -815,9 +816,12 @@ namespace mg5amcCpu
 #endif
 // Start sigmaKin_lines
+
+#include "GpuAbstraction.h"
+
 // === PART 0 - INITIALISATION (before calculate_wavefunctions) ===
 // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 allMEs[ievt] = 0;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 allNumerators[ievt] = 0;
@@ -845,7 +849,7 @@ namespace mg5amcCpu
 // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS ===
 // (in both CUDA and C++, using precomputed good helicities)
-#ifdef __CUDACC__ // CUDA OR C++
+#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
 // *** START OF PART 1a - CUDA (one event per GPU thread) ***
 // Running sum of partial amplitudes squared for event by event color selection (#402)
@@ -1049,7 +1053,7 @@ namespace mg5amcCpu
 // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
 // [NB 'sum over final spins, average over initial spins', eg see
 // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 allMEs[ievt] /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h index 08d6c29e7b..ebbc2800d3 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc 
b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 
1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
 // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
 // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -263,14 +264,14 @@ main( int argc, char** argv )
 // === STEP 0 - INITIALISE
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
- // --- 00. Initialise cuda
- // Instantiate a CudaRuntime at the beginnining of the application's main to
- // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
- const std::string cdinKey = "00 CudaInit";
+ // --- 00. Initialise GPU
+ // Instantiate a GpuRuntime at the beginning of the application's main.
+ // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+ const std::string cdinKey = "00 GpuInit";
 timermap.start( cdinKey );
- CudaRuntime cudaRuntime( debug );
+ GpuRuntime GpuRuntime( debug );
 #endif
 // --- 0a. Initialise physics process
@@ -292,7 +293,7 @@ main( int argc, char** argv )
 timermap.start( alloKey );
 // Memory buffers for random numbers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferRndNumMomenta hstRndmom( nevt );
 #else
 PinnedHostBufferRndNumMomenta hstRndmom( nevt );
@@ -300,7 +301,7 @@ main( int argc, char** argv )
 #endif
 // Memory buffers for sampling weights
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferWeights hstWeights( nevt );
 #else
 PinnedHostBufferWeights hstWeights( nevt );
@@ -308,7 +309,7 @@ main( int argc, char** argv )
 #endif
 // Memory buffers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferMomenta hstMomenta( nevt );
 #else
 PinnedHostBufferMomenta hstMomenta( nevt );
@@ -316,7 +317,7 @@ main( int argc, char** argv )
 #endif
 // Memory buffers for Gs
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferGs hstGs( nevt );
 #else
 PinnedHostBufferGs hstGs( nevt );
@@ -333,7 +334,7 @@ main( int argc, char** argv )
 }
 // Memory buffers for matrix elements
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferMatrixElements hstMatrixElements( nevt );
 #else
 PinnedHostBufferMatrixElements hstMatrixElements( nevt );
@@ -342,7 +343,7 @@ main( int argc, char** argv )
 // Memory buffers for random numbers for helicity selection
 // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferRndNumHelicity hstRndHel( nevt );
 #else
 PinnedHostBufferRndNumHelicity hstRndHel( nevt );
@@ -351,7 +352,7 @@ main( int argc, char** argv )
 // Memory buffers for random numbers for color selection
 // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferRndNumColor hstRndCol( nevt );
 #else
 PinnedHostBufferRndNumColor hstRndCol( nevt );
@@ -359,7 +360,7 @@ main( int argc, char** argv )
 #endif
 // Memory buffers for helicity selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferSelectedHelicity hstSelHel( nevt );
 #else
 PinnedHostBufferSelectedHelicity hstSelHel( nevt );
@@ -367,7 +368,7 @@ main( int argc, char** argv )
 #endif
 // Memory buffers for color selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferSelectedColor
hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -403,7 +404,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -421,7 +422,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? 
#ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index a0397e9ecc..bcb73d7f01 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,69 +89,139 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of NVCC + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
+ MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! + CUOPTFLAGS = -lineinfo + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + + CUDATESTFLAGS = -lcuda + + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported)
+ GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+
+ # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+ ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+ GPUFLAGS += -allow-unsupported-compiler
+ endif
+
+else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc)
+ #=== Configure the HIP compiler
+
+ # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable HIP builds (issue #505)
+ # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below
+ ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
+ $(warning HIP builds are not supported for multi-word CXX "$(CXX)")
+ override HIP_HOME=disabled
+ endif
+
+ # If HIP_HOME is not set, try to set it from the location of hipcc
+ ifndef HIP_HOME
+ HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH))
+ $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+ endif
-# Set the host C++ compiler for nvcc via "-ccbin "
-# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+ # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists
+ ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
+ GPUCC = $(HIP_HOME)/bin/hipcc
+
+ # Should maybe find something equivalent to this in HIP
+ #USE_NVTX ?=-DUSE_NVTX
+
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+ HIPINC = -I$(HIP_HOME)/include/
+
+ # -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP
+ # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+ GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+ ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+ GPUFLAGS += -std=c++17
+ # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+
+ CUBUILDRULEFLAGS = -fPIC -c
+ CCBUILDRULEFLAGS = -fPIC -c
+
+ else ifneq ($(origin REQUIRE_HIP),undefined)
+ # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443)
+ $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
+ else
+ # No hip.
Switch hip compilation off and go to common random numbers in C++ + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -163,9 +233,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif @@ -189,7 +259,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -199,10 +269,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -253,7 +323,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -328,13 +401,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -343,7 +416,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += 
-DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -352,7 +425,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -404,11 +477,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -421,7 +494,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -452,15 +525,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -469,11 +543,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -489,10 +566,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -500,8 +577,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -528,7 +605,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -540,11 +617,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -561,16 +638,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -596,17 +673,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -618,7 +695,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -631,7 +708,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -643,12 +720,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link 
with GPUCC (undefined reference to `__svml_cos4_l9')
 else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc
 endif
@@ -672,14 +749,14 @@ $(testmain): LIBFLAGS += -lgomp
 endif
 endif

-ifeq ($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
 $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
 	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
 else # link both runTest.o and runTest_cu.o
 $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
 endif

 # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215
@@ -782,9 +859,9 @@ ifeq ($(USECCACHE),1)
 	ccache --version | head -1
 endif
 	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
 endif
 	@echo ""
 	@echo CXX=$(CXX)
@@ -803,7 +880,7 @@ endif

 # Target: check (run the C++ test executable)
 # [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 check: runTest cmpFcheck cmpFGcheck
 else
 check: runTest cmpFcheck
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc
index f93c05b0b3..2b956730d4 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

 extern "C"
 {
@@ -22,7 +22,7 @@ extern "C"
  * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
  * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
  */
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using namespace mg5amcGpu;
 #else
 using namespace mg5amcCpu;
@@ -46,8 +46,8 @@ extern "C"
  */
 void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
 {
-#ifdef __CUDACC__
-  CudaRuntime::setUp();
+#ifdef MGONGPUCPP_GPUIMPL
+  GpuRuntime::setUp();
 #endif
  // Create a process object, read parm card and set parameters
  // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
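
The two fbridge.cc hunks above show the single-source pattern the whole series relies on: one translation unit is compiled by nvcc, by hipcc, or by a plain C++ compiler, and MGONGPUCPP_GPUIMPL (defined from __CUDACC__ or __HIPCC__ in mgOnGpuConfig.h later in this patch) selects both the namespace and the runtime setup path. A minimal sketch of the idea, assuming only the macro and the setUp/tearDown entry points visible in these hunks (the GpuRuntime body here is a hypothetical stand-in for the real GpuRuntime.h):

// Sketch only: compile-time dispatch between GPU and CPU namespaces.
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU build (CUDA or HIP)
#else
namespace mg5amcCpu // plain C++ build
#endif
{
  struct GpuRuntime // hypothetical stand-in for the class in GpuRuntime.h
  {
    static void setUp() {}    // e.g. initialise the device context
    static void tearDown() {} // e.g. reset the device for leak checkers
  };
}

void exampleCreate()
{
#ifdef MGONGPUCPP_GPUIMPL
  using namespace mg5amcGpu;
  GpuRuntime::setUp(); // only GPU builds need runtime setup, as in fbridgecreate_ above
#else
  using namespace mg5amcCpu;
#endif
  // ... construct the Bridge exactly as fbridgecreate_ does ...
}

The same #ifdef prologue repeats verbatim at the top of every .cc and .h file touched below, which is why the remainder of the diff is dominated by mechanical __CUDACC__ to MGONGPUCPP_GPUIMPL renames.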
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc index 572e28aaea..461ec5c3a5 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc index 989aba1fdc..2bd7a9fcf9 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc index 4243e9fcec..6e8657edca 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h index fe9cb24d88..c2c572778b 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc index daed91bb80..89bbb57a0d 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h index 852861ced0..5e20ca27b5 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h @@ -210,7 +210,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -234,7 +234,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -252,7 +252,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h index 6c0c4919e9..d4e37d19b3 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,13 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +65,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +102,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif 
#endif @@ -131,7 +153,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -142,7 +164,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -172,9 +194,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -186,8 +208,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h index 0cb2f1db7e..4e7ab03fa2 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h index a1cde16a67..6f6cee64d6 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h index 9d3e82b1e3..7904b93c61 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/ee_mumu.sa/src/rambo.h b/epochX/cudacpp/ee_mumu.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/rambo.h +++ b/epochX/cudacpp/ee_mumu.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 18208a863b..babfee914b 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004682779312133789  +DEBUG: model prefixing takes 0.005231142044067383  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttx DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1028]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1029]  DEBUG: proc_id =  1 [model_handling.py at line 1034]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1286]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1288]  @@ -192,11 +192,11 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 3] [model_handling.py at line 1152]  DEBUG: multi_channel =  {1: [0], 2: [1], 3: [2]} [model_handling.py at line 1158]  DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [2]} [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1698]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  +DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1700]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -214,16 +214,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.109 s +Wrote files for 10 helas calls in 0.110 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.126 s +ALOHA: aloha creates 2 routines in 0.137 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.113 s +ALOHA: aloha creates 4 routines in 0.126 s VVV1 FFV1 FFV1 @@ -266,6 +266,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.778s -user 0m1.425s -sys 0m0.206s +real 0m1.707s +user 0m1.508s +sys 0m0.188s diff --git a/epochX/cudacpp/gg_tt.mad/COPYRIGHT b/epochX/cudacpp/gg_tt.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_tt.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc index 59e590217d..5597c614b0 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV1_1.o FFV1_2.o VVV1P0_1.o FFV1_0.o +ALOHARoutine = FFV1_1.o FFV1_0.o FFV1_2.o VVV1P0_1.o diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index 4cafe0c997..c04628dfd1 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc index cef4cb3c71..90c7f2d3b8 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -14,7 +15,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..38c477c17a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
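All of the guard replacements in these files assume one preprocessor symbol, MGONGPUCPP_GPUIMPL, meaning "some GPU implementation is being compiled", so that a single source tree covers CUDA, HIP and CPU builds. The definition of the symbol itself is not part of the hunks shown here; a plausible sketch of what it must amount to (hypothetical, presumably provided by mgOnGpuConfig.h or the build system):

  // Hypothetical sketch only - the real definition lives outside these hunks
  #if defined __CUDACC__ || defined __HIPCC__
  #define MGONGPUCPP_GPUIMPL 1 // building a GPU implementation (CUDA or HIP)
  #endif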
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... 
) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h similarity index 62% rename from epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h rename to epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h index 64ce52f4b3..93579ef08b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h @@ -1,49 +1,50 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api -#include +#include "GpuAbstraction.h" + #include //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) { - if( code != cudaSuccess ) + if( code != gpuSuccess ) { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); + printf( "ERROR! 
assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); } } #endif /* clang-format on */ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final + struct GpuRuntime final { - CudaRuntime( const bool debug = true ) + GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; bool m_debug; // Set up CUDA application @@ -62,8 +63,8 @@ namespace mg5amcGpu */ // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! } // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h index fd7734ce42..d2ff326e20 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
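For reference, the vendor mapping implemented by GpuAbstraction.h above, in one view (note the asymmetric HIP spellings for pinned host memory):

  gpu* macro             CUDA (__CUDACC__)       HIP (__HIPCC__)
  gpuMalloc              cudaMalloc              hipMalloc
  gpuMallocHost          cudaMallocHost          hipHostMalloc
  gpuFree                cudaFree                hipFree
  gpuFreeHost            cudaFreeHost            hipHostFree
  gpuMemcpy              cudaMemcpy              hipMemcpy
  gpuMemcpyToSymbol      cudaMemcpyToSymbol      hipMemcpyToSymbol
  gpuSetDevice           cudaSetDevice           hipSetDevice
  gpuDeviceSynchronize   cudaDeviceSynchronize   hipDeviceSynchronize
  gpuDeviceReset         cudaDeviceReset         hipDeviceReset
  gpuPeekAtLastError     cudaPeekAtLastError     hipPeekAtLastError

The allocation, copy and free macros additionally wrap the underlying call in checkGpu, so every runtime error is reported with file and line by assertGpu.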
#ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index 30257195b6..d6d6c4f179 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h index 0ac4faa3c7..38fade09fb 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
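The sigmaKin launch in MatrixElementKernels.cc above is the one call site that needs dynamic shared memory, hence the separate gpuLaunchKernelSharedMem macro. A self-contained illustration with a hypothetical block-sum kernel (not part of the patch; only the macro names come from GpuAbstraction.h):

  #include "GpuRuntime.h" // for checkGpu and the gpu* macros

  __global__ void blockSumKernel( const double* in, double* out, int n ) // hypothetical kernel
  {
    extern __shared__ double buf[]; // sized by the sharedMem argument of the launch
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    buf[threadIdx.x] = ( i < n ? in[i] : 0. );
    __syncthreads();
    if( threadIdx.x == 0 )
    {
      double sum = 0.;
      for( int j = 0; j < (int)blockDim.x; j++ ) sum += buf[j];
      out[blockIdx.x] = sum; // one partial sum per block
    }
  }

  void launchBlockSum( const double* devIn, double* devOut, int n, int blocks, int threads )
  {
    const unsigned int sharedMemSize = threads * sizeof( double );
    // Expands to blockSumKernel<<<blocks, threads, sharedMemSize>>>( devIn, devOut, n )
    gpuLaunchKernelSharedMem( blockSumKernel, blocks, threads, sharedMemSize, devIn, devOut, n );
    checkGpu( gpuPeekAtLastError() );
  }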
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index a4cc98e6b1..62fa7f0088 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -302,7 +303,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -359,7 +360,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -418,7 +419,7 @@ namespace mg5amcCpu MEs_sv_previous += 
deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -465,8 +466,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -506,9 +507,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -544,7 +545,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -609,12 +610,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -635,7 +636,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -764,9 +765,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -790,7 +791,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -811,7 +812,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // 
assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -825,9 +826,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -855,7 +859,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1059,7 +1063,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 51f966d10f..5a6e96d9e8 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
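A note on the constant appearing in sigmaKin above: helcolDenominators[0] = 256 is the initial-state averaging factor for this g g > t t~ process, with each incoming gluon contributing 2 helicities and 8 colours,

  helcolDenominators[0] = ( 2 * 8 )^2 = 256

so dividing the running sum of |M|^2 by it implements the "sum over final spins, average over initial spins" (and colours) prescription quoted in the comment.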
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. 
Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -292,7 +293,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +301,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +309,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +317,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +334,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +343,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +352,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +360,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +368,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -403,7 +404,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -421,7 +422,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, 
devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? 
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index a0397e9ecc..bcb73d7f01 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,69 +89,139 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. 
- # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. 
Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of NVCC + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+ CUOPTFLAGS = -lineinfo + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + + CUDATESTFLAGS = -lcuda + + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + +else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) + #=== Configure the HIP compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If HIP_HOME is not set, try to set it from the location of GPUCC + ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") + endif -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+  # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists
+  ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
+    GPUCC = $(HIP_HOME)/bin/hipcc
+
+    # Should maybe find something equivalent to this in HIP
+    #USE_NVTX ?=-DUSE_NVTX
+
+    HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+    HIPINC = -I$(HIP_HOME)/include/
+
+    # -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP
+    # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+    GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+    ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+    GPUFLAGS += -std=c++17
+    # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+
+    CUBUILDRULEFLAGS = -fPIC -c
+    CCBUILDRULEFLAGS = -fPIC -c
+
+  else ifneq ($(origin REQUIRE_HIP),undefined)
+    # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443)
+    $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
+  else
+    # No hip. Switch hip compilation off and go to common random numbers in C++
+    $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
+    override GPUCC=
+    override USE_NVTX=
+    override CUINC=
+    override CURANDLIBFLAGS=
+  endif
+
+  # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+  ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+    GPUFLAGS += -allow-unsupported-compiler
+  endif
-# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-CUFLAGS += -allow-unsupported-compiler
endif
+
#-------------------------------------------------------------------------------

#=== Configure ccache for C++ and CUDA builds
@@ -163,9 +233,9 @@ endif
#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
# override AR:=ccache $(AR)
#endif
-ifneq ($(NVCC),)
-  ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
-    override NVCC:=ccache $(NVCC)
+ifneq ($(GPUCC),)
+  ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
+    override GPUCC:=ccache $(GPUCC)
 endif
 endif
@@ -189,7 +259,7 @@ endif

# PowerPC-specific CUDA compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -199,10 +269,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -253,7 +323,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -328,13 +401,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -343,7 +416,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -352,7 +425,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -404,11 +477,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -421,7 +494,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) 
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -452,15 +525,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -469,11 +543,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -489,10 +566,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -500,8 +577,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -528,7 +605,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -540,11 +617,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -561,16 +638,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
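
The $(GPUCC) build and link rules above drive the same .cu sources through either nvcc or hipcc. On the source side, this only works because of small wrappers such as the gpuLaunchKernel call introduced in RamboSamplingKernels.cc earlier in this patch. The wrapper itself is defined in the new GPU abstraction headers (e.g. GpuRuntime.h), which are not shown in this diff; the following is only a minimal sketch of how it could forward to each backend's native launch syntax, and every name except gpuLaunchKernel, m_gpublocks and m_gputhreads is an illustrative assumption:

    #include <utility>
    #ifdef __CUDACC__
    // CUDA backend: forward to the triple-chevron launch that the hunks above replaced
    template<typename Kernel, typename... Args>
    void gpuLaunchKernel( Kernel kernel, const int gpublocks, const int gputhreads, Args&&... args )
    {
      kernel<<<gpublocks, gputhreads>>>( std::forward<Args>( args )... );
    }
    #elif defined __HIPCC__
    #include <hip/hip_runtime.h>
    // HIP backend: forward to hipLaunchKernelGGL (0 bytes of dynamic shared memory, default stream)
    template<typename Kernel, typename... Args>
    void gpuLaunchKernel( Kernel kernel, const int gpublocks, const int gputhreads, Args&&... args )
    {
      hipLaunchKernelGGL( kernel, dim3( gpublocks ), dim3( gputhreads ), 0, 0, std::forward<Args>( args )... );
    }
    #endif

With a wrapper of this shape the kernel call sites stay identical for both backends, which is why a single set of cudacpp.mk rules can serve nvcc and hipcc.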
#------------------------------------------------------------------------------- @@ -596,17 +673,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -618,7 +695,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -631,7 +708,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -643,12 +720,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -672,14 +749,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -782,9 +859,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -803,7 +880,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc index f93c05b0b3..2b956730d4 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? 
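
The GpuRuntime::setUp and GpuRuntime::tearDown calls in this file replace the old CudaRuntime calls, so the Fortran bridge no longer names CUDA directly. The class itself lives in the new GpuRuntime.h header, which is not part of this diff; a minimal sketch of its expected semantics, assuming it keeps the behaviour of the old CudaRuntime (select device 0 on set-up, reset the device on tear-down; error checking omitted for brevity, and the gpu* aliases below are assumptions modelled on the gpuDeviceReset call visible in runTest.cc further down):

    #ifdef __CUDACC__
    #define gpuSetDevice cudaSetDevice
    #define gpuDeviceReset cudaDeviceReset
    #elif defined __HIPCC__
    #include <hip/hip_runtime.h>
    #define gpuSetDevice hipSetDevice
    #define gpuDeviceReset hipDeviceReset
    #endif

    struct GpuRuntime final
    {
      GpuRuntime( const bool debug = false ) : m_debug( debug ) { setUp( m_debug ); }
      ~GpuRuntime() { tearDown( m_debug ); }
      static void setUp( const bool /*debug*/ = false ) { gpuSetDevice( 0 ); }   // was cudaSetDevice( 0 )
      static void tearDown( const bool /*debug*/ = false ) { gpuDeviceReset(); } // was cudaDeviceReset()
      const bool m_debug;
    };

This mirrors how fbridgecreate_ and fbridgedelete_ call the static methods, while check_sa.cc instantiates the class once at the start of main and relies on the destructor for cleanup.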
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc index 572e28aaea..461ec5c3a5 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc index 989aba1fdc..2bd7a9fcf9 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc index 4243e9fcec..6e8657edca 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h index bc2adb6258..f7ecb29537 100644 --- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc index 7255e49119..459dae9e99 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index c935779eb3..db5520aa96 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -211,7 +211,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -238,7 +238,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -254,7 +254,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index 881353abac..390766116b 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,13 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +65,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +102,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif 
#endif
@@ -131,7 +153,7 @@ namespace mgOnGpu
 // Alignment requirement for using reinterpret_cast with SIMD vectorized code
 // (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
 // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
 #endif
@@ -142,7 +164,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;

 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -172,9 +194,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
 #if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -186,8 +208,8 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */

-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
 #define __global__
 #define __host__
 #define __device__
diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h
index 0cb2f1db7e..4e7ab03fa2 100644
--- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h index a1cde16a67..6f6cee64d6 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h index 9d3e82b1e3..7904b93c61 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt.mad/src/rambo.h b/epochX/cudacpp/gg_tt.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt.mad/src/rambo.h +++ b/epochX/cudacpp/gg_tt.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index b8f6269784..b017693308 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005010843276977539  +DEBUG: model prefixing takes 0.005097150802612305  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.007 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_SA_OUTPUT @@ -186,11 +186,11 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1151]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1152]  DEBUG: multi_channel_map =  None [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1698]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  +DEBUG: diag_to_config =  {} [model_handling.py at line 1700]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -205,7 +205,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.123 s +ALOHA: aloha creates 2 routines in 0.136 s VVV1 FFV1 FFV1 @@ -239,6 +239,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. a DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.581s -user 0m0.514s -sys 0m0.058s +real 0m2.710s +user 0m0.550s +sys 0m0.052s diff --git a/epochX/cudacpp/gg_tt.sa/COPYRIGHT b/epochX/cudacpp/gg_tt.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_tt.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h index 4cafe0c997..c04628dfd1 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
     mg5amcCpu::CPPProcess process( /*verbose=*/false );
     m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
     process.initProc( "../../Cards/param_card.dat" );
   }
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads )
   {
@@ -268,7 +268,7 @@ namespace mg5amcCpu
   }
 #endif
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta,
                                             const FORTRANFPTYPE* gs,
@@ -283,14 +283,14 @@ namespace mg5amcCpu
     constexpr int neppM = MemoryAccessMomenta::neppM;
     if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
     {
-      checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
     }
     else
     {
-      checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
       const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
       //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
-      dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+      gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
     }
     if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
     {
@@ -333,7 +333,7 @@ namespace mg5amcCpu
   }
 #endif
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta,
                                             const FORTRANFPTYPE* gs,
@@ -388,7 +388,7 @@ namespace mg5amcCpu
   // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
   //
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<typename Tin, typename Tout>
   __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
   {
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc
index cef4cb3c71..90c7f2d3b8 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc
@@ -1,10 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -14,7 +15,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..38c477c17a 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... 
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // NB: hipHostMalloc, as hipMallocHost is deprecated
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h
similarity index 62%
rename from epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h
rename to epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h
index 64ce52f4b3..93579ef08b 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h
@@ -1,49 +1,50 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 
-#ifndef MG5AMC_CUDARUNTIME_H
-#define MG5AMC_CUDARUNTIME_H 1
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
 
 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
 // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
 
-#include 
+#include "GpuAbstraction.h"
+
 #include 
 
//--------------------------------------------------------------------------
 
 // See https://stackoverflow.com/a/14038590
-#ifdef __CUDACC__ /* clang-format off */
-#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
-inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
 {
-  if( code != cudaSuccess )
+  if( code != gpuSuccess )
   {
-    printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
-    if( abort ) assert( code == cudaSuccess );
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
   }
 }
 #endif /* clang-format on */
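+
+// Usage sketch (illustrative only; dst, src and nBytes are placeholder names): wrap any
+// gpu* runtime call returning a gpuError_t in checkGpu, e.g.
+//   checkGpu( gpuMemcpy( dst, src, nBytes, gpuMemcpyDeviceToHost ) );
+// on failure this prints the error string with file and line, then aborts via assert.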
 
 //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
   // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
-  struct CudaRuntime final
+  struct GpuRuntime final
   {
-    CudaRuntime( const bool debug = true )
+    GpuRuntime( const bool debug = true )
       : m_debug( debug ) { setUp( m_debug ); }
-    ~CudaRuntime() { tearDown( m_debug ); }
-    CudaRuntime( const CudaRuntime& ) = delete;
-    CudaRuntime( CudaRuntime&& ) = delete;
-    CudaRuntime& operator=( const CudaRuntime& ) = delete;
-    CudaRuntime& operator=( CudaRuntime&& ) = delete;
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
    bool m_debug;
 
     // Set up CUDA application
@@ -62,8 +63,8 @@ namespace mg5amcGpu
     */
     // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
     // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
-      if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl;
-      checkCuda( cudaSetDevice( 0 ) ); // SLOW!
+      if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
     }
 
     // Tear down CUDA application (call cudaDeviceReset)
@@ -72,14 +73,13 @@ namespace mg5amcGpu
     // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
     static void tearDown( const bool debug = true )
     {
-      if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl;
-      checkCuda( cudaDeviceReset() );
+      if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl;
+      checkGpu( gpuDeviceReset() );
     }
   };
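+
+  // Usage sketch (illustrative only): instantiate one GpuRuntime at the top of the
+  // application's main(), e.g. 'GpuRuntime gpuRuntime;', so that gpuSetDevice(0) runs
+  // before any other GPU call and gpuDeviceReset() is booked for program exit.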
-
 }
 #endif
 
 //--------------------------------------------------------------------------
 
-#endif // MG5AMC_CUDARUNTIME_H
+#endif // MG5AMC_GPURUNTIME_H
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h
index fd7734ce42..d2ff326e20 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MADGRAPHTEST_H_
 #define MADGRAPHTEST_H_ 1
@@ -21,7 +21,7 @@
 #include 
 #include 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using mg5amcGpu::CPPProcess;
 #else
 using mg5amcCpu::CPPProcess;
@@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam
 
 // Since we link both the CPU-only and GPU tests into the same executable, we prevent
 // a multiply defined symbol by only compiling this in the non-CUDA phase:
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 
 /// Compare momenta and matrix elements.
 /// This uses an implementation of TestDriverBase to run a madgraph workflow,
@@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME )
   }
 }
 
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
 
 #endif /* MADGRAPHTEST_H_ */
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc
index 30257195b6..d6d6c4f179 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc
@@ -1,12 +1,12 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #include "MatrixElementKernels.h"
 
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation
 #include "MemoryAccessMomenta.h"
 #include "MemoryBuffers.h"
 
@@ -14,7 +14,7 @@
 
 //============================================================================
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu
 {
@@ -143,7 +143,7 @@ namespace mg5amcCpu
 
 //============================================================================
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
@@ -202,13 +202,13 @@ namespace mg5amcGpu
     PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
     DeviceBufferHelicityMask devIsGoodHel( ncomb );
     // ... 0d1. Compute good helicity mask on the device
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
+    checkGpu( gpuPeekAtLastError() );
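+    // NB: via the GpuAbstraction.h macros, gpuLaunchKernel( kernel, blocks, threads, args... )
+    // expands back to the native triple-chevron launch, e.g. under nvcc
+    // gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, ... ) is exactly
+    // sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( ... ), and a valid HIP launch under hipcc.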
     // ... 0d2. Copy back good helicity mask to the host
     copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
     // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -219,19 +219,19 @@ namespace mg5amcGpu
 
   void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
   {
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
     constexpr unsigned int sharedMemSize = 0;
 #else
     constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
-    checkCuda( cudaDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() );
+    checkGpu( gpuDeviceSynchronize() );
   }
 
   //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h
index 23e84757a2..72bd8f195b 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MATRIXELEMENTKERNELS_H
 #define MATRIXELEMENTKERNELS_H 1
@@ -10,7 +10,7 @@
 
 #include "MemoryBuffers.h"
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -81,7 +81,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a CPU host
   class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents
   {
@@ -130,7 +130,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
   {
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h
index c82a6c7635..db73e4e064 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h index 0ac4faa3c7..38fade09fb 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index 327b69d008..b0d93e9401 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -299,7 +300,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -356,7 +357,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -415,7 +416,7 @@ namespace mg5amcCpu MEs_sv_previous += 
deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -462,8 +463,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -503,9 +504,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -541,7 +542,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -606,12 +607,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -632,7 +633,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -761,9 +762,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -787,7 +788,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -808,7 +809,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // 
assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -822,9 +823,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -852,7 +856,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1056,7 +1060,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index 51f966d10f..5a6e96d9e8 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
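The gpuMemcpyToSymbol calls introduced above replace the CUDA-only checkCuda( cudaMemcpyToSymbol( ... ) ) pattern. The new GpuAbstraction.h enters this patch only as a symlink, so its definitions are not visible here; the following is a minimal sketch of the idea, assuming the real header maps one portable spelling onto either runtime (the error checking that checkCuda provided is elided for brevity):

```cpp
// Illustrative sketch only, not the verbatim GpuAbstraction.h
#if defined __CUDACC__
#include <cuda_runtime.h>
#define gpuMemcpyToSymbol( symbol, src, count ) cudaMemcpyToSymbol( symbol, src, count )
#define gpuDeviceReset() cudaDeviceReset()
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuMemcpyToSymbol( symbol, src, count ) hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, count )
#define gpuDeviceReset() hipDeviceReset()
#endif
```

HIP_SYMBOL is essentially a no-op on the AMD platform, but keeping it in the macro lets the same spelling work wherever HIP needs a wrapped symbol name.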
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. 
Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -292,7 +293,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +301,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +309,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +317,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +334,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +343,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +352,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +360,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +368,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -403,7 +404,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -421,7 +422,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, 
devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? 
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
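The hunks below also retire the CUDA-only triple-chevron launch syntax: getMomentaInitialDevice<<<m_gpublocks, m_gputhreads>>>( ... ) becomes gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, ... ), a spelling that HIP can implement via hipLaunchKernelGGL. A plausible definition of such a macro, assuming the default stream and no dynamic shared memory (illustrative, not the actual GpuAbstraction.h macro):

```cpp
// Illustrative sketch of a portable kernel-launch macro
#if defined __CUDACC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif
```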
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index a0397e9ecc..bcb73d7f01 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,69 +89,139 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). 
- # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. 
Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of NVCC + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+ CUOPTFLAGS = -lineinfo + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + + CUDATESTFLAGS = -lcuda + + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + +else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) + #=== Configure the HIP compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If HIP_HOME is not set, try to set it from the location of GPUCC + ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") + endif -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists + ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + GPUCC = $(HIP_HOME)/bin/hipcc + + # Should maybe find something equivalent to this in HIP + #USE_NVTX ?=-DUSE_NVTX + + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + + # -DHIP_FAST_MATH equivalent to -use_fast_math in HIP + # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + + else ifneq ($(origin REQUIRE_HIP),undefined) + # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + else + # No hip. Switch hip compilation off and go to common random numbers in C++ + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -163,9 +233,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif @@ -189,7 +259,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -199,10 +269,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -253,7 +323,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -328,13 +401,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -343,7 +416,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -352,7 +425,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -404,11 +477,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -421,7 +494,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) 
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -452,15 +525,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -469,11 +543,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -489,10 +566,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -500,8 +577,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -528,7 +605,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -540,11 +617,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -561,16 +638,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -596,17 +673,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -618,7 +695,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -631,7 +708,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -643,12 +720,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -672,14 +749,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -782,9 +859,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -803,7 +880,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc index f93c05b0b3..2b956730d4 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? 
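The fbridgecreate_ and fbridgedelete_ hunks here route GPU setup and teardown through the renamed GpuRuntime helpers, matching the GpuRuntime instance created at the start of check_sa.cc earlier in this patch. A minimal sketch, assuming the class keeps the old CudaRuntime contract of selecting a device up front and resetting it at shutdown; gpuSetDevice and gpuDeviceReset stand for portable wrappers assumed to live in GpuAbstraction.h:

```cpp
// Sketch only; the real GpuRuntime.h may differ (e.g. in its debug printouts)
#if defined __CUDACC__
#include <cuda_runtime.h>
#define gpuSetDevice cudaSetDevice
#define gpuDeviceReset cudaDeviceReset
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuSetDevice hipSetDevice
#define gpuDeviceReset hipDeviceReset
#endif
struct GpuRuntime final
{
  GpuRuntime( const bool debug = true ) { setUp( debug ); }
  ~GpuRuntime() { tearDown(); }
  static void setUp( const bool /*debug*/ = true ) { gpuSetDevice( 0 ); } // pay the device init cost up front
  static void tearDown() { gpuDeviceReset(); } // also what cuda-memcheck --leak-check full needs
};
```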
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc index 572e28aaea..461ec5c3a5 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc index 989aba1fdc..2bd7a9fcf9 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc index 4243e9fcec..6e8657edca 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h index bc2adb6258..f7ecb29537 100644 --- a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc index 7255e49119..459dae9e99 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h index c935779eb3..db5520aa96 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h @@ -211,7 +211,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -238,7 +238,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -254,7 +254,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h index 6c0c4919e9..d4e37d19b3 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,13 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +65,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +102,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif 
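The mgOnGpuConfig.h changes above turn the old CUDA-or-C++ switch into a three-way ladder with one derived macro. A minimal standalone probe of the result (hypothetical test program, not part of the patch; the printed defaults paraphrase the comments above):

#include "mgOnGpuConfig.h"
#include <iostream>
// hypothetical probe, not part of the patch
int main()
{
#ifdef __CUDACC__
  std::cout << "CUDA build: curand on, thrust complex by default" << std::endl;
#elif defined __HIPCC__
  std::cout << "HIP build: curand off, cxsmpl complex only" << std::endl;
#else
  std::cout << "C++ build: SIMD allowed, std::complex or cxsmpl" << std::endl;
#endif
#ifdef MGONGPUCPP_GPUIMPL
  std::cout << "common GPU code paths enabled via MGONGPUCPP_GPUIMPL" << std::endl;
#endif
  return 0;
}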
#endif @@ -131,7 +153,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -142,7 +164,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -172,9 +194,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -186,8 +208,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h index 0cb2f1db7e..4e7ab03fa2 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h index a1cde16a67..6f6cee64d6 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
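Downstream code is insulated from the complex-type choice made in the mgOnGpuCxtypes.h hunks above: it sees only the cxtype alias and helpers such as cxmake, whether the backend resolved them to thrust::complex (CUDA), cxsmpl (HIP), or std::complex/cxsmpl (C++). A minimal sketch, with a hypothetical helper name (the cxmake( 0., 1. ) spelling is the one used by the generated code):

#include "mgOnGpuCxtypes.h"
// hypothetical helper, not part of the patch
__host__ __device__ inline cxtype
timesI( const cxtype& amp )
{
  return cxmake( 0., 1. ) * amp; // multiply an amplitude by the imaginary unit
}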
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h index 9d3e82b1e3..7904b93c61 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt.sa/src/rambo.h b/epochX/cudacpp/gg_tt.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt.sa/src/rambo.h +++ b/epochX/cudacpp/gg_tt.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
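The mgOnGpuVectors.h hunks above keep the scalar-or-vector aliases meaningful on every backend: fptype_sv is a plain fptype in CUDA and HIP builds (neppV = 1) and a SIMD vector of neppV values in vectorized C++ builds. A minimal sketch of backend-agnostic code, with a hypothetical helper name:

#include "mgOnGpuVectors.h"
// hypothetical helper, not part of the patch
__host__ __device__ inline fptype_sv
pt2( const fptype_sv& px, const fptype_sv& py )
{
  return px * px + py * py; // one event per GPU thread, neppV events per C++ call
}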
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 76ccc27b8e..aa1674d4d2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~; add process g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004670143127441406  +DEBUG: model prefixing takes 0.005077838897705078  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -162,7 +162,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.018 s +1 processes with 16 diagrams generated in 0.019 s Total: 2 processes with 19 diagrams output madevent CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -186,7 +186,7 @@ INFO: Creating files in directory P2_gg_ttxg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1028]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1029]  DEBUG: proc_id =  1 [model_handling.py at line 1034]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1286]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1288]  @@ -201,13 +201,13 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [model_handling.py at line 1152]  DEBUG: multi_channel =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4], 6: [5], 7: [6], 8: [7], 9: [8], 10: [9], 11: [10], 12: [11], 13: [12], 14: [13], 15: [14]} [model_handling.py at line 1158]  DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4], 6: [5], 7: [6], 8: [7], 9: [8], 10: [9], 11: [10], 12: [11], 13: [12], 14: [13], 15: [14]} [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1698]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1811]  +DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1700]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1813]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -228,7 +228,7 @@ INFO: Creating files in directory P1_gg_ttx DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1028]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1029]  DEBUG: proc_id =  1 [model_handling.py at line 1034]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1286]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1288]  @@ -243,11 +243,11 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 3] [model_handling.py at line 1152]  DEBUG: multi_channel =  {1: [0], 2: [1], 3: [2]} [model_handling.py at line 1158]  DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [2]} [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1698]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  +DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1700]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -264,15 +264,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 2 subprocesses (19 diagrams) in 0.039 s -Wrote files for 46 helas calls in 0.248 s +Generated helas calls for 2 subprocesses (19 diagrams) in 0.041 s +Wrote files for 46 helas calls in 0.261 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.277 s +ALOHA: aloha creates 5 routines in 0.307 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -280,7 +280,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.278 s +ALOHA: aloha creates 10 routines in 0.293 s VVV1 VVV1 FFV1 @@ -328,6 +328,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.229s -user 0m1.925s -sys 0m0.210s +real 0m2.335s +user 0m2.066s +sys 0m0.192s diff --git a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_file.inc index 4f2ef3d0d8..50c12b0804 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = VVVV4P0_1.o VVVV3P0_1.o VVVV1P0_1.o FFV1_1.o FFV1_2.o VVV1P0_1.o VVV1_0.o FFV1_0.o FFV1P0_3.o +ALOHARoutine = FFV1_1.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h index 4cafe0c997..c04628dfd1 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc index cef4cb3c71..90c7f2d3b8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
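In the Bridge.h hunks above, each explicit checkCuda( cudaMemcpy( ... ) ) collapses into one gpuMemcpy spelling, and the <<<...>>> launch of dev_transposeMomentaF2C becomes a gpuLaunchKernel call. A minimal sketch of the resulting copy pattern, assuming the GpuRuntime.h and GpuAbstraction.h headers introduced later in this patch and a hypothetical helper name:

#include "GpuRuntime.h" // pulls in GpuAbstraction.h and the checkGpu helper
#include <cstddef>
#ifdef MGONGPUCPP_GPUIMPL
// hypothetical helper, not part of the patch
inline void
copyMomentaToDevice( double* devBuf, const double* hstBuf, const size_t bytes )
{
  // expands to checkGpu( cudaMemcpy( ... ) ) under nvcc,
  // and to checkGpu( hipMemcpy( ... ) ) under hipcc
  gpuMemcpy( devBuf, hstBuf, bytes, gpuMemcpyHostToDevice );
}
#endif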
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -14,7 +15,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..38c477c17a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... 
) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h similarity index 62% rename from epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h rename to epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h index 64ce52f4b3..93579ef08b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h @@ -1,49 +1,50 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api -#include +#include "GpuAbstraction.h" + #include //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) { - if( code != cudaSuccess ) + if( code != gpuSuccess ) { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); + printf( "ERROR! 
assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); } } #endif /* clang-format on */ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final + struct GpuRuntime final { - CudaRuntime( const bool debug = true ) + GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; bool m_debug; // Set up CUDA application @@ -62,8 +63,8 @@ namespace mg5amcGpu */ // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! } // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h index fd7734ce42..d2ff326e20 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
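As the comments in the renamed GpuRuntime.h above state, the struct is meant to be instantiated at the beginning of the application's main, booking gpuSetDevice(0) in the constructor and gpuDeviceReset() in the destructor. A minimal usage sketch (hypothetical driver, not part of the patch):

#include "GpuRuntime.h"
// hypothetical driver, not part of the patch
int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  mg5amcGpu::GpuRuntime gpuRuntime; // setUp() runs now, tearDown() on scope exit
#endif
  // ... allocate buffers, launch kernels, copy back results ...
  return 0;
}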
#ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc index 30257195b6..d6d6c4f179 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
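// [Editor's note] The MatrixElementKernels.cc hunks above replace the CUDA-only triple-chevron
// launches, e.g. kernel<<<blocks, threads>>>( args... ), with gpuLaunchKernel( kernel, blocks,
// threads, args... ) so that the same call site also compiles under hipcc. One plausible
// definition of the two launch macros (an assumption about GpuAbstraction.h, which appears in
// this patch only as a symlink):
#ifdef __CUDACC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  kernel<<<( blocks ), ( threads ), ( sharedMem )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#include "hip/hip_runtime.h"
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), ( sharedMem ), 0, __VA_ARGS__ )
#endif
// Usage, as in MatrixElementKernelDevice::computeMatrixElements above:
//   gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );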
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h index 0ac4faa3c7..38fade09fb 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
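// [Editor's note] A worked example of the neppM guidance in the MemoryAccessMomenta.h hunk
// above: a 32-byte cacheline holds 32/8 = 4 doubles or 32/4 = 8 floats, so on GPUs neppM is
// best chosen as that count times a power of two, keeping each AOSOA event page
// cacheline-aligned so momenta reads coalesce across a warp. Illustrative values only
// (not taken from this patch):
//   constexpr int neppM = 4; // fptype=double: one 32-byte cacheline per event page
//   constexpr int neppM = 8; // fptype=float:  one 32-byte cacheline per event page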
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
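// [Editor's note] The MemoryBuffers.h hunks below drop the explicit checkCuda(...) wrappers:
// cudaMallocHost/cudaMalloc/cudaFreeHost/cudaFree become gpuMallocHost/gpuMalloc/gpuFreeHost/
// gpuFree, and cudaMemcpy becomes gpuMemcpy. A plausible alias layer (assumed, not shown in
// this patch) that folds the error check into the alias, which is why the call sites below no
// longer wrap it themselves:
#ifdef __CUDACC__
#define gpuMallocHost( ptr, bytes ) checkGpu( cudaMallocHost( ptr, bytes ) )
#define gpuMalloc( ptr, bytes ) checkGpu( cudaMalloc( ptr, bytes ) )
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#elif defined __HIPCC__
#define gpuMallocHost( ptr, bytes ) checkGpu( hipHostMalloc( ptr, bytes ) )
#define gpuMalloc( ptr, bytes ) checkGpu( hipMalloc( ptr, bytes ) )
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( hipMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#endif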
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index a4cc98e6b1..62fa7f0088 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -302,7 +303,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -359,7 +360,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -418,7 +419,7 @@ namespace mg5amcCpu MEs_sv_previous += 
deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -465,8 +466,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -506,9 +507,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -544,7 +545,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -609,12 +610,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -635,7 +636,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -764,9 +765,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -790,7 +791,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -811,7 +812,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // 
assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -825,9 +826,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -855,7 +859,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1059,7 +1063,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 51f966d10f..5a6e96d9e8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. 
Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime gpuRuntime( debug );
 #endif
   // --- 0a. Initialise physics process
@@ -292,7 +293,7 @@ main( int argc, char** argv )
   timermap.start( alloKey );
   // Memory buffers for random numbers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferRndNumMomenta hstRndmom( nevt );
 #else
   PinnedHostBufferRndNumMomenta hstRndmom( nevt );
@@ -300,7 +301,7 @@
 #endif
   // Memory buffers for sampling weights
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferWeights hstWeights( nevt );
 #else
   PinnedHostBufferWeights hstWeights( nevt );
@@ -308,7 +309,7 @@
 #endif
   // Memory buffers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferMomenta hstMomenta( nevt );
 #else
   PinnedHostBufferMomenta hstMomenta( nevt );
@@ -316,7 +317,7 @@
 #endif
   // Memory buffers for Gs
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferGs hstGs( nevt );
 #else
   PinnedHostBufferGs hstGs( nevt );
@@ -333,7 +334,7 @@
   }
   // Memory buffers for matrix elements
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferMatrixElements hstMatrixElements( nevt );
 #else
   PinnedHostBufferMatrixElements hstMatrixElements( nevt );
@@ -342,7 +343,7 @@
   // Memory buffers for random numbers for helicity selection
   // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferRndNumHelicity hstRndHel( nevt );
 #else
   PinnedHostBufferRndNumHelicity hstRndHel( nevt );
@@ -351,7 +352,7 @@
   // Memory buffers for random numbers for color selection
   // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferRndNumColor hstRndCol( nevt );
 #else
   PinnedHostBufferRndNumColor hstRndCol( nevt );
@@ -359,7 +360,7 @@
 #endif
   // Memory buffers for helicity selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferSelectedHelicity hstSelHel( nevt );
 #else
   PinnedHostBufferSelectedHelicity hstSelHel( nevt );
@@ -367,7 +368,7 @@
 #endif
   // Memory buffers for color selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferSelectedColor hstSelCol( nevt );
 #else
   PinnedHostBufferSelectedColor hstSelCol( nevt );
@@ -403,7 +404,7 @@
 #else
   else
   {
-    throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
+    throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement)
   }
 #endif
 #else
@@ -421,7 +422,7 @@
   }
   else
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     prsk.reset( new RamboSamplingKernelDevice( energy,
devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? 
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index 2afd9a2b1b..b26c54fe3c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. 
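// [Editor's note] The P1 CPPProcess.cc hunks above (repeated for P2 below) copy helicities and
// physics parameters to constant memory via calls such as
//   gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) );
// in place of checkCuda( cudaMemcpyToSymbol( ... ) ). A plausible alias (assumed; HIP needs
// the HIP_SYMBOL wrapper to pass a device symbol):
#ifdef __CUDACC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( cudaMemcpyToSymbol( symbol, src, bytes ) )
#elif defined __HIPCC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes ) )
#endif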
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -505,7 +506,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -562,7 +563,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL 
// === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -621,7 +622,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -684,8 +685,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -726,9 +727,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -765,7 +766,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -830,12 +831,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -856,7 +857,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -985,9 +986,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1011,7 +1012,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == 
ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1032,7 +1033,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1046,9 +1047,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1076,7 +1080,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1280,7 +1284,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h index 5cba84f97c..d8179c5c94 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
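// [Editor's note] The P2 check_sa.cc hunks below repeat the P1 changes verbatim. In outline,
// the RAII usage they introduce looks like this (a sketch only; argument parsing, buffer
// allocation and kernel launches elided):
int main( int argc, char** argv )
{
#ifdef MGONGPUCPP_GPUIMPL
  GpuRuntime gpuRuntime( /*debug=*/true ); // gpuSetDevice(0) now, gpuDeviceReset() booked for scope exit
#endif
  // ... allocate (pinned) host and device buffers, sample momenta, compute matrix elements ...
  return 0; // ~GpuRuntime() resets the device here
}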
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. 
Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -292,7 +293,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +301,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +309,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +317,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +334,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +343,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +352,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +360,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +368,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -403,7 +404,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -421,7 +422,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy,
devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? 
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
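For orientation on the three-way workflow tags above ("CUD:", "HIP:", "CPP:"): they rely on __CUDACC__ and __HIPCC__ being mutually exclusive, compiler-defined macros, so exactly one branch is taken per translation unit. A self-contained illustration (not part of the patch itself):

  #include <cstdio>
  int main()
  {
  #ifdef __CUDACC__
    std::printf( "CUD: this translation unit is compiled by nvcc\n" );
  #elif defined __HIPCC__
    std::printf( "HIP: this translation unit is compiled by hipcc\n" );
  #else
    std::printf( "CPP: this translation unit is compiled by a plain C++ compiler\n" );
  #endif
    return 0;
  }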
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index a0397e9ecc..bcb73d7f01 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,69 +89,139 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of NVCC + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
+ MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! + CUOPTFLAGS = -lineinfo + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + + CUDATESTFLAGS = -lcuda + + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + +else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) + #=== Configure the HIP compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable HIP builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning HIP builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If HIP_HOME is not set, try to set it from the location of GPUCC + ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") + endif -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists + ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + GPUCC = $(HIP_HOME)/bin/hipcc + + # Should maybe find something equivalent to this in HIP + #USE_NVTX ?=-DUSE_NVTX + + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + + # -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + + else ifneq ($(origin REQUIRE_HIP),undefined) + # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + else + # No hip.
Switch hip compilation off and go to common random numbers in C++ + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -163,9 +233,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif @@ -189,7 +259,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -199,10 +269,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -253,7 +323,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -328,13 +401,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -343,7 +416,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += 
-DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -352,7 +425,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -404,11 +477,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -421,7 +494,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -452,15 +525,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -469,11 +543,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -489,10 +566,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -500,8 +577,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -528,7 +605,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -540,11 +617,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -561,16 +638,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -596,17 +673,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -618,7 +695,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -631,7 +708,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -643,12 +720,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link 
with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -672,14 +749,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -782,9 +859,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -803,7 +880,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc index f93c05b0b3..2b956730d4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
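The fbridgecreate_/fbridgedelete_ hunks here, together with the GpuRuntime instantiation in check_sa.cc earlier in this patch, pin down the shape of the new class: an RAII wrapper whose static setUp/tearDown can also be called explicitly from the Fortran bridge. A minimal CUDA-side sketch, assuming the cudaSetDevice/cudaDeviceReset semantics are carried over unchanged from the old CudaRuntime (illustrative, not the verbatim GpuRuntime.h):

  #include <cuda_runtime.h>
  // RAII sketch: construction selects the device, destruction books the reset
  struct GpuRuntime
  {
    GpuRuntime() { setUp(); }
    ~GpuRuntime() { tearDown(); }
    static void setUp() { cudaSetDevice( 0 ); }   // pick the first visible device
    static void tearDown() { cudaDeviceReset(); } // needed e.g. by cuda-memcheck --leak-check full
  };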
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc index 572e28aaea..461ec5c3a5 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
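A side note on the HostBuffer/PinnedHostBuffer split visible in fsampler.cc above and in the earlier check_sa.cc allocations: GPU builds use pinned (page-locked) host memory so that host-device copies are faster and can be overlapped with compute. A condensed illustration of the underlying CUDA calls (an assumption about the buffer classes' internals, shown only to make the distinction concrete):

  #include <cuda_runtime.h>
  #include <cstdlib>
  // Pageable allocation (plain C++ builds) vs page-locked allocation (GPU builds)
  double* allocateHostBuffer( std::size_t nelem, bool pinned )
  {
    double* ptr = nullptr;
    if( pinned )
      cudaMallocHost( (void**)&ptr, nelem * sizeof( double ) ); // page-locked: faster H2D/D2H transfers
    else
      ptr = (double*)std::malloc( nelem * sizeof( double ) ); // ordinary pageable memory
    return ptr;
  }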
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc index 989aba1fdc..2bd7a9fcf9 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc index 4243e9fcec..6e8657edca 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h index 5a3a5dc76f..3593d9f169 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc index d3d01102fd..de87dcaf64 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h index 6551d8da81..fe7d686938 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h @@ -214,7 +214,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -242,7 +242,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -258,7 +258,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h index 881353abac..390766116b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,13 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +65,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +102,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif 
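
// (Illustrative sketch, not part of the patch: how the three-way backend split
// defined above is consumed downstream; 'cxtype_sketch' is a hypothetical name,
// while the compiler macros and the mgOnGpu::cxsmpl type come from this patch.)
// #ifdef __CUDACC__ // nvcc build: thrust::complex is the default choice
// typedef thrust::complex<fptype> cxtype_sketch;
// #elif defined __HIPCC__ // hipcc build: cxsmpl is the only supported option
// typedef mgOnGpu::cxsmpl<fptype> cxtype_sketch;
// #else // plain C++ build: cxsmpl is the new default
// typedef mgOnGpu::cxsmpl<fptype> cxtype_sketch;
// #endif
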
#endif @@ -131,7 +153,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -142,7 +164,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -172,9 +194,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -186,8 +208,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h index 0cb2f1db7e..4e7ab03fa2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
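
// (Sketch only, not in the patch: what the empty __global__/__host__/__device__
// definitions at the end of mgOnGpuConfig.h above buy in practice; 'fpmax_sketch'
// is a hypothetical helper.)
// __host__ __device__ inline fptype fpmax_sketch( const fptype& a, const fptype& b )
// {
//   return ( a > b ? a : b ); // the specifiers expand to nothing in plain C++ builds
// }
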
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h index a1cde16a67..6f6cee64d6 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
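
// (Sketch only, not in the patch: the cxmake/cxreal/cximag helpers above give
// user code a single spelling over thrust::complex, cuComplex and std::complex;
// 'iTimes_sketch' is a hypothetical example.)
// __host__ __device__ inline cxtype iTimes_sketch( const cxtype& c )
// {
//   return cxmake( -cximag( c ), cxreal( c ) ); // i*c with any backend complex type
// }
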
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h index 9d3e82b1e3..7904b93c61 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt01g.mad/src/rambo.h b/epochX/cudacpp/gg_tt01g.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/rambo.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
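
// (Sketch only, not in the patch: the '_sv' scalar-or-vector types above let one
// kernel body serve both execution models; 'pt2_sketch' is a hypothetical helper.)
// inline fptype_sv pt2_sketch( const fptype_sv& px, const fptype_sv& py )
// {
//   return px * px + py * py; // one event per GPU thread, neppV events per SIMD operation in C++
// }
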
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index e56e6dfb27..f273353c34 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004709720611572266  +DEBUG: model prefixing takes 0.005414009094238281  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.019 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output madevent CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1028]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1029]  DEBUG: proc_id =  1 [model_handling.py at line 1034]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1286]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1288]  @@ -192,13 +192,13 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [model_handling.py at line 1152]  DEBUG: multi_channel =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4], 6: [5], 7: [6], 8: [7], 9: [8], 10: [9], 11: [10], 12: [11], 13: [12], 14: [13], 15: [14]} [model_handling.py at line 1158]  DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4], 6: [5], 7: [6], 8: [7], 9: [8], 10: [9], 11: [10], 12: [11], 13: [12], 14: [13], 15: [14]} [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1698]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1811]  +DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1700]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1813]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -215,15 +215,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.035 s -Wrote files for 36 helas calls in 0.153 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s +Wrote files for 36 helas calls in 0.160 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.290 s +ALOHA: aloha creates 5 routines in 0.309 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -231,7 +231,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.267 s +ALOHA: aloha creates 10 routines in 0.294 s VVV1 VVV1 FFV1 @@ -279,6 +279,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.132s -user 0m1.824s -sys 0m0.213s +real 0m3.084s +user 0m1.962s +sys 0m0.184s diff --git a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc index 4f2ef3d0d8..50c12b0804 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = VVVV4P0_1.o VVVV3P0_1.o VVVV1P0_1.o FFV1_1.o FFV1_2.o VVV1P0_1.o VVV1_0.o FFV1_0.o FFV1P0_3.o +ALOHARoutine = FFV1_1.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h index 4cafe0c997..c04628dfd1 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
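
// (Usage sketch, not part of the patch: a plausible calling sequence for the
// Bridge declared below; the buffer pointers are hypothetical and the
// gpu_sequence argument list is abridged.)
// Bridge<double> bridge( nevt, CPPProcess::npar, CPPProcess::np4 );
// bridge.gpu_sequence( momenta, gs, rndhel, rndcol, channelId, mes, selhel, selcol );
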
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc index cef4cb3c71..90c7f2d3b8 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
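
// (Sketch only, not in the patch: the substitution pattern applied in
// Bridge::gpu_sequence above; 'devBuf' and 'hstPtr' are hypothetical names.)
// gpuMemcpy( devBuf.data(), hstPtr, devBuf.bytes(), gpuMemcpyHostToDevice );
// gpuLaunchKernel( dev_transposeMomentaF2C, gpublocks, gputhreads, devIn, devOut, nevt );
// gpuMemcpy expands to a checkGpu-wrapped cudaMemcpy or hipMemcpy, while
// gpuLaunchKernel expands to the active compiler's <<<...>>> launch syntax,
// so the call sites themselves carry no #ifdef.
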
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -14,7 +15,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..38c477c17a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
#include "MemoryBuffers.h"
#include "RandomNumberKernels.h"
@@ -114,7 +114,7 @@ namespace mg5amcCpu
/*
printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() );
fptype* data = m_rnarray.data();
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
if( m_rnarray.isOnDevice() )
{
data = new fptype[m_rnarray.size()]();
@@ -123,7 +123,7 @@ namespace mg5amcCpu
#endif
for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ )
printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
if( m_rnarray.isOnDevice() ) delete[] data;
#endif
*/
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h
index 48b51e0a49..b425a5bade 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#ifndef EventStatistics_H
#define EventStatistics_H 1
@@ -16,7 +16,7 @@
#include
#include
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h
new file mode 100644
index 0000000000..6a7d9c05c0
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h
@@ -0,0 +1,71 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h
new file mode 100644
index 0000000000..93579ef08b
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != gpuSuccess )
+  {
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! 
*** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h index fd7734ce42..d2ff326e20 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc index 30257195b6..d6d6c4f179 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h index 0ac4faa3c7..38fade09fb 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
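
// (Sketch only, not in the patch: the GPU branch of KernelAccessHelper above
// derives the event index from the grid coordinates, so accessors need no event loop.)
// const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one thread, one event
// fptype* record = T::ieventAccessRecord( buffer, ievt ); // this event's slot in the buffer
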
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
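
// (Sketch only, not in the patch: the AOSOA arithmetic implied by the neppM
// comments in MemoryAccessMomenta.h above; the index names are hypothetical.)
// With momenta stored as [npagM][npar][np4][neppM], event ievt is reached via:
// const int ipagM = ievt / neppM; // which page
// const int ieppM = ievt % neppM; // position inside the page
// fptype& p = buffer[( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM];
// Consecutive GPU threads (consecutive ieppM) then read consecutive fptype's of
// the same momentum component, which is what makes the global-memory loads coalesced.
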
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 5856e464ed..389a5d98b3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -505,7 +506,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -562,7 +563,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -621,7 +622,7 @@ namespace mg5amcCpu 
MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -684,8 +685,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -726,9 +727,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -765,7 +766,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -830,12 +831,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -856,7 +857,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -985,9 +986,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1011,7 +1012,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1032,7 +1033,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int 
helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1046,9 +1047,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1076,7 +1080,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1280,7 +1284,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 0edca1b52a..ff2cb4ab9a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
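// [Editor's sketch, not part of the patch] The GpuAbstraction.h symlinked in above is not shown in these hunks; judging from its usage (MGONGPUCPP_GPUIMPL, gpuMalloc, gpuMemcpy, gpuMemcpyToSymbol, gpuLaunchKernel), it is assumed to be a thin macro layer of roughly this shape:
#if defined __CUDACC__
#define MGONGPUCPP_GPUIMPL
#define gpuMalloc( ptr, bytes ) checkGpu( cudaMalloc( ptr, bytes ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#elif defined __HIPCC__
#include "hip/hip_runtime.h"
#define MGONGPUCPP_GPUIMPL
#define gpuMalloc( ptr, bytes ) checkGpu( hipMalloc( ptr, bytes ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( hipMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#endif
// (checkGpu is a hypothetical error-checking wrapper in the spirit of the old checkCuda)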
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. 
Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -292,7 +293,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +301,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +309,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +317,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +334,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +343,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +352,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +360,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +368,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -403,7 +404,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -421,7 +422,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy,
devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? 
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
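// [Editor's sketch, not part of the patch] The hunks below replace the CUDA-only triple-chevron launches, kernel<<<blocks, threads>>>( args... ), with a portable gpuLaunchKernel( kernel, blocks, threads, args... ). A plausible definition (assumed here, not shown in this patch) is a variadic macro:
#if defined __CUDACC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#include "hip/hip_runtime.h"
#define gpuLaunchKernel( kernel, blocks, threads, ... ) hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif
// Usage, as in getMomentaInitial() below:
//   gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() );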
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index a0397e9ecc..bcb73d7f01 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,69 +89,139 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. 
- # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. 
Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of NVCC + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+ CUOPTFLAGS = -lineinfo + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + + CUDATESTFLAGS = -lcuda + + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + +else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) + #=== Configure the HIP compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable HIP builds (issue #505) + # This check is inherited from the CUDA branch above: multi-word CXX values are not supported + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning HIP builds are not supported for multi-word CXX "$(CXX)") + override HIP_HOME=disabled + endif + + # If HIP_HOME is not set, try to set it from the location of hipcc + ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") + endif -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..."
is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists + ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + GPUCC = $(HIP_HOME)/bin/hipcc + + # Should maybe find something equivalent to this in HIP + #USE_NVTX ?=-DUSE_NVTX + + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + + # -DHIP_FAST_MATH equivalent to -use_fast_math in HIP + # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + + else ifneq ($(origin REQUIRE_HIP),undefined) + # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make hipcc visible in PATH)) + else + # No hip. Switch hip compilation off and go to common random numbers in C++ + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -163,9 +233,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif @@ -189,7 +259,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -199,10 +269,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -253,7 +323,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -328,13 +401,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -343,7 +416,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -352,7 +425,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -404,11 +477,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -421,7 +494,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) 
cu_main=$(BUILDDIR)/gcheck.exe
fcu_main=$(BUILDDIR)/fgcheck.exe
else
@@ -452,15 +525,16 @@ $(BUILDDIR)/.build.$(TAG):
	@touch $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@

$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
endif
+# NB: for CUDA builds, the '-x cu' flag dropped from the rule above is now part of CCBUILDRULEFLAGS

# Generic target and build rules: objects from C++ compilation
# (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -469,11 +543,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@

# Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117)
+# Added edge case for HIP compilation
ifeq ($(shell $(CXX) --version | grep ^nvc++),)
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
endif
endif

@@ -489,10 +566,10 @@ ifeq ($(RNDGEN),hasCurand)
$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
endif

-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592)
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC builds with icx2023 (#592)
ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),)
-ifneq ($(NVCC),)
-CUFLAGS += -Xcompiler -Wno-deprecated-builtins
+ifneq ($(GPUCC),)
+GPUFLAGS += -Wno-deprecated-builtins
endif
endif

@@ -500,8 +577,8 @@ endif
# This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -528,7 +605,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -540,11 +617,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -561,16 +638,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
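For illustration, here is a minimal sketch of how the -DMGONGPU_FPTYPE_* flags injected by the FPTYPE=d/f/m block earlier in this makefile are typically consumed on the C++/CUDA/HIP side. This is not a hunk from this patch; the exact placement of the typedefs is an assumption, modelled on the 'using mgOnGpu::fptype' lines of mgOnGpuConfig.h later in this patch series.

// Sketch only: map the makefile's FPTYPE choice onto the internal floating-point type
namespace mgOnGpu
{
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef double fptype; // FPTYPE=d (and the higher-precision half of FPTYPE=m)
#elif defined MGONGPU_FPTYPE_FLOAT
  typedef float fptype; // FPTYPE=f
#endif
}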
#------------------------------------------------------------------------------- @@ -596,17 +673,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -618,7 +695,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -631,7 +708,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -643,12 +720,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -672,14 +749,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
endif

# Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215
@@ -782,9 +859,9 @@ ifeq ($(USECCACHE),1)
	ccache --version | head -1
endif
	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
endif
	@echo ""
	@echo CXX=$(CXX)
@@ -803,7 +880,7 @@ endif

# Target: check (run the C++ test executable)
# [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
check: runTest cmpFcheck cmpFGcheck
else
check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc
index f93c05b0b3..2b956730d4 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

#include "Bridge.h"
#include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

extern "C"
{
@@ -22,7 +22,7 @@ extern "C"
  * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
  * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
  */
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
  using namespace mg5amcGpu;
#else
  using namespace mg5amcCpu;
@@ -46,8 +46,8 @@ extern "C"
  */
  void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
  {
-#ifdef __CUDACC__
-    CudaRuntime::setUp();
+#ifdef MGONGPUCPP_GPUIMPL
+    GpuRuntime::setUp();
#endif
    // Create a process object, read param card and set parameters
    // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
@@ -69,8 +69,8 @@ extern "C"
    Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
    if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" );
    delete pbridge;
-#ifdef __CUDACC__
-    CudaRuntime::tearDown();
+#ifdef MGONGPUCPP_GPUIMPL
+    GpuRuntime::tearDown();
#endif
  }

@@ -100,7 +100,7 @@ extern "C"
  {
    Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
    if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
    // Use the device/GPU implementation in the CUDA library
    // (there is also a host implementation in this library)
    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol );
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc
index 2fb445372d..3743934f41 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

#include "mgOnGpuConfig.h"

@@ -13,7 +13,7 @@

//--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -40,7 +40,7 @@ namespace mg5amcCpu
  private:
    const int m_nevt; // The number of events in each iteration
    int m_iiter;      // The iteration counter (for random number seeding)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
    HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
    HostBufferMomenta m_hstMomenta;      // Memory buffers for momenta
    HostBufferWeights m_hstWeights;      // Memory buffers for sampling weights
@@ -105,7 +105,7 @@ namespace mg5amcCpu

extern "C"
{
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
  using namespace mg5amcGpu;
#else
  using namespace mg5amcCpu;
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc
index 572e28aaea..461ec5c3a5 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
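The fbridgecreate_/fbridgedelete_ hunks above bracket all GPU work between GpuRuntime::setUp() and GpuRuntime::tearDown(). For a standalone C++ main, the GpuRuntime struct added later in this patch offers the same lifecycle as RAII. A minimal sketch of that use (the main function here is hypothetical, not from this patch; GpuRuntime's constructor and destructor semantics are as documented in GpuRuntime.h below):

#include "GpuRuntime.h"

int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  // ctor calls gpuSetDevice(0) now; dtor books the gpuDeviceReset() needed by leak checkers
  mg5amcGpu::GpuRuntime gpuRuntime;
#endif
  // ... create a Bridge, run gpu_sequence()/cpu_sequence(), etc ...
  return 0;
}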
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc index 989aba1fdc..2bd7a9fcf9 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc index 4243e9fcec..6e8657edca 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h index 5a3a5dc76f..3593d9f169 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc index d3d01102fd..de87dcaf64 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h index 6551d8da81..fe7d686938 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h @@ -214,7 +214,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<>
#endif
@@ -242,7 +242,7 @@ namespace Parameters_sm_dependentCouplings
    // End SM implementation - no special handling of vectors of floats as in EFT (#439)
    return out;
  }
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
#pragma GCC diagnostic pop
#pragma nv_diagnostic pop
#endif
@@ -258,7 +258,7 @@ namespace Parameters_sm_independentCouplings

//==========================================================================

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
index 881353abac..390766116b 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.

#ifndef MGONGPUCONFIG_H
#define MGONGPUCONFIG_H 1

@@ -10,13 +10,26 @@
// There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel)
#define MGONGPU_SUPPORTS_MULTICHANNEL 1

+// Is this a GPU (CUDA, HIP) or CPU implementation?
+#ifdef __CUDACC__
+#define MGONGPUCPP_GPUIMPL cuda
+#elif defined __HIPCC__
+#define MGONGPUCPP_GPUIMPL hip
+#else
+#undef MGONGPUCPP_GPUIMPL
+#endif
+
// ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12"
// ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs)

// Choose if curand is supported for generating random numbers
+// For CUDA, by default, it is supported
+// For HIP, by default, it is not supported
// For C++, by default, do not use curand, but allow this macro to be set from outside with e.g.
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +65,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +102,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif 
#endif
@@ -131,7 +153,7 @@ namespace mgOnGpu
// Alignment requirement for using reinterpret_cast with SIMD vectorized code
// (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
// Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
#endif

@@ -142,7 +164,7 @@
using mgOnGpu::fptype;
using mgOnGpu::fptype2;

// C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
#undef MGONGPU_CPPSIMD
#elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
#ifdef MGONGPU_FPTYPE_DOUBLE
@@ -172,9 +194,9 @@ using mgOnGpu::fptype2;
#undef MGONGPU_CPPSIMD
#endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -186,8 +208,8 @@ using mgOnGpu::fptype2;
#define mgDebugFinalise() { /*noop*/ }
#endif /* clang-format on */

-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
#define __global__
#define __host__
#define __device__
diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h
index 0cb2f1db7e..4e7ab03fa2 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
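The pattern this patch applies file by file is always the same: the compiler-specific __CUDACC__ guard is replaced by the backend-neutral MGONGPUCPP_GPUIMPL macro defined in mgOnGpuConfig.h above. In sketch form (names exactly as in the hunks above; the comment placement is illustrative):

#include "mgOnGpuConfig.h" // defines MGONGPUCPP_GPUIMPL under nvcc (CUDA) or hipcc (HIP)

#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // one identical source tree, compiled for the GPU backend...
#else
namespace mg5amcCpu // ...or for the CPU
#endif
{
  // common CUDA/HIP/C++ implementation code goes here
}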
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h index a1cde16a67..6f6cee64d6 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h index 9d3e82b1e3..7904b93c61 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttg.mad/src/rambo.h b/epochX/cudacpp/gg_ttg.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/rambo.h +++ b/epochX/cudacpp/gg_ttg.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
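Together with the empty __host__/__device__ definitions added to mgOnGpuConfig.h earlier in this patch, wrappers like the fpsqrt shown above let a single inline helper compile unchanged under g++, nvcc or hipcc. A hypothetical example (the invariant-mass helper below is illustrative, not code from this patch):

#include "mgOnGpuFptypes.h"

// Compiles as CUDA/HIP device code, and as plain C++ where __host__/__device__ expand to nothing
inline __host__ __device__ fptype
fpmass( const fptype p0, const fptype p1, const fptype p2, const fptype p3 )
{
  return fpsqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 ); // NB: sketch only, assumes a timelike four-momentum
}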
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 49a78d1df6..e8d2db38ce 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004593372344970703  +DEBUG: model prefixing takes 0.00521087646484375  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.020 s +1 processes with 16 diagrams generated in 0.022 s Total: 1 processes with 16 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -186,13 +186,13 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1151]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1152]  DEBUG: multi_channel_map =  None [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1698]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1811]  +DEBUG: diag_to_config =  {} [model_handling.py at line 1700]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1813]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -202,7 +202,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1430]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxg.txt [model_handling.py at line 1324]  -Generated helas calls for 1 subprocesses (16 diagrams) in 0.034 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -210,7 +210,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.275 s +ALOHA: aloha creates 5 routines in 0.313 s VVV1 VVV1 FFV1 @@ -249,6 +249,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. 
DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203] 
quit

-real	0m0.820s
-user	0m0.731s
-sys	0m0.052s
+real	0m0.981s
+user	0m0.792s
+sys	0m0.066s
diff --git a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT
index a134b5fef9..84a883fbb0 100644
--- a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT
+++ b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT
@@ -15,6 +15,7 @@ The full development team currently includes the following authors :
  Stephan Hageboeck (CERN)
  Olivier Mattelaer (Universite Catholique de Louvain, original author)
  Stefan Roiser (CERN, original author)
+  Joergen Teig (CERN)
  Andrea Valassi (CERN, original author)
  Zenny Wettersten (CERN)
See https://github.com/madgraph5/madgraph4gpu for more details. For the full
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h
index 4cafe0c997..c04628dfd1 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

#ifndef BRIDGE_H
#define BRIDGE_H 1

@@ -22,7 +22,7 @@
#include
#include

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -82,7 +82,7 @@ namespace mg5amcCpu
    Bridge& operator=( const Bridge& ) = delete;
    Bridge& operator=( Bridge&& ) = delete;

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
    /**
     * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads
     * (this is needed for BridgeKernel tests rather than for actual production use in Fortran)
     */
@@ -149,7 +149,7 @@ namespace mg5amcCpu
    unsigned int m_nevt; // number of events
    int m_nGoodHel;      // the number of good helicities (-1 initially when they have not yet been calculated)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
    int m_gputhreads; // number of gpu threads (default set from number of events, can be modified)
    int m_gpublocks;  // number of gpu blocks (default set from number of events, can be modified)
    mg5amcGpu::DeviceBuffer m_devMomentaF;
@@ -186,12 +186,12 @@ namespace mg5amcCpu
  // Forward declare transposition methods
  //

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
  template<typename Tin, typename Tout>
  __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL

  template<typename Tin, typename Tout>
  void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );
@@ -208,7 +208,7 @@ namespace mg5amcCpu
  Bridge<FORTRANFPTYPE>::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F )
    : m_nevt( nevtF )
    , m_nGoodHel( -1 )
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
    , m_gputhreads( 256 )                  // default number of gpu threads
    , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads
    , m_devMomentaF( m_nevt )
@@ -232,7 +232,7 @@
  {
    if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" );
    if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
    if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) )
      throw std::runtime_error( "Bridge constructor: nevt should be a multiple
of " + std::to_string( s_gputhreadsmin ) );
    while( m_nevt != m_gpublocks * m_gputhreads )
@@ -250,11 +250,11 @@ namespace mg5amcCpu
      std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
      mg5amcCpu::CPPProcess process( /*verbose=*/false );
      m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
      process.initProc( "../../Cards/param_card.dat" );
    }

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
    template<typename FORTRANFPTYPE>
    void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads )
    {
@@ -268,7 +268,7 @@ namespace mg5amcCpu
    }
#endif

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
    template<typename FORTRANFPTYPE>
    void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta,
                                              const FORTRANFPTYPE* gs,
@@ -283,14 +283,14 @@ namespace mg5amcCpu
      constexpr int neppM = MemoryAccessMomenta::neppM;
      if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
      {
-        checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
+        gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
      }
      else
      {
-        checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
+        gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
        const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
        //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
-        dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+        gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
      }
      if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
      {
@@ -333,7 +333,7 @@ namespace mg5amcCpu
    }
#endif

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
    template<typename FORTRANFPTYPE>
    void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta,
                                              const FORTRANFPTYPE* gs,
@@ -388,7 +388,7 @@ namespace mg5amcCpu
  //  - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
  //

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
  template<typename Tin, typename Tout>
  __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
  {
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc
index cef4cb3c71..90c7f2d3b8 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc
@@ -1,10 +1,11 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
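The gpu_sequence hunk above shows the net effect of the abstraction: checkCuda( cudaMemcpy( ... ) ) becomes gpuMemcpy( ... ) and the <<<...>>> launch becomes gpuLaunchKernel( ... ), with the same code compiling for CUDA or HIP. A self-contained sketch of the same idiom (dev_scale and scaleOnDevice below are hypothetical, not part of this patch; the gpu* macros are those defined in GpuAbstraction.h later in this patch):

#include "GpuAbstraction.h"
#include "GpuRuntime.h" // for checkGpu, used inside the gpu* macros
#include "mgOnGpuConfig.h" // for fptype

// A trivial kernel: one thread per event (sketch only, compiles in GPU builds)
__global__ void dev_scale( fptype* data, const fptype factor, const int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt < nevt ) data[ievt] *= factor;
}

void scaleOnDevice( fptype* hstData, const int nevt, const int gpublocks, const int gputhreads )
{
  fptype* devData = nullptr;
  gpuMalloc( (void**)&devData, nevt * sizeof( fptype ) ); // cudaMalloc under nvcc, hipMalloc under hipcc
  gpuMemcpy( devData, hstData, nevt * sizeof( fptype ), gpuMemcpyHostToDevice );
  gpuLaunchKernel( dev_scale, gpublocks, gputhreads, devData, (fptype)2, nevt ); // expands to dev_scale<<<gpublocks, gputhreads>>>( ... )
  gpuMemcpy( hstData, devData, nevt * sizeof( fptype ), gpuMemcpyDeviceToHost );
  gpuFree( devData );
}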
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -14,7 +15,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..38c477c17a 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
 #include "MemoryBuffers.h"
 #include "RandomNumberKernels.h"
@@ -114,7 +114,7 @@ namespace mg5amcCpu
 /*
 printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() );
 fptype* data = m_rnarray.data();
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 if( m_rnarray.isOnDevice() )
 {
 data = new fptype[m_rnarray.size()]();
@@ -123,7 +123,7 @@ namespace mg5amcCpu
 #endif
 for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ )
 printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 if( m_rnarray.isOnDevice() ) delete[] data;
 #endif
 */
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 #ifndef EventStatistics_H
 #define EventStatistics_H 1
@@ -16,7 +16,7 @@
 #include
 #include
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ...
) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+ if( code != gpuSuccess )
+ {
+ printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+ if( abort ) assert( code == gpuSuccess );
+ }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+ // Instantiate a GpuRuntime at the beginning of the application's main to
+ // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+ // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes!
***
+ struct GpuRuntime final
+ {
+ GpuRuntime( const bool debug = true )
+ : m_debug( debug ) { setUp( m_debug ); }
+ ~GpuRuntime() { tearDown( m_debug ); }
+ GpuRuntime( const GpuRuntime& ) = delete;
+ GpuRuntime( GpuRuntime&& ) = delete;
+ GpuRuntime& operator=( const GpuRuntime& ) = delete;
+ GpuRuntime& operator=( GpuRuntime&& ) = delete;
+ bool m_debug;
+
+ // Set up CUDA application
+ // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+ // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+ static void setUp( const bool debug = true )
+ {
+ // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+ // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+ /*
+ // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+ // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+ // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+ if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+ checkCuda( cudaFree( 0 ) ); // SLOW!
+ */
+ // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+ // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+ if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl;
+ checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ }
+
+ // Tear down CUDA application (call cudaDeviceReset)
+ // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+ // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck
+ // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
+ static void tearDown( const bool debug = true )
+ {
+ if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl;
+ checkGpu( gpuDeviceReset() );
+ }
+ };
+}
+#endif
+
+//--------------------------------------------------------------------------
+
+#endif // MG5AMC_GPURUNTIME_H
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h index fd7734ce42..d2ff326e20 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 #ifndef MADGRAPHTEST_H_
 #define MADGRAPHTEST_H_ 1
@@ -21,7 +21,7 @@
 #include
 #include
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using mg5amcGpu::CPPProcess;
 #else
 using mg5amcCpu::CPPProcess;
@@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam
 // Since we link both the CPU-only and GPU tests into the same executable, we prevent
 // a multiply defined symbol by only compiling this in the non-CUDA phase:
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 /// Compare momenta and matrix elements.
/// This uses an implementation of TestDriverBase to run a madgraph workflow,
@@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME )
 }
 }
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
 #endif /* MADGRAPHTEST_H_ */
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc index 30257195b6..d6d6c4f179 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 #include "MatrixElementKernels.h"
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation
 #include "MemoryAccessMomenta.h"
 #include "MemoryBuffers.h"
@@ -14,7 +14,7 @@
 //============================================================================
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu
 {
@@ -143,7 +143,7 @@ namespace mg5amcCpu
 //============================================================================
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
@@ -202,13 +202,13 @@ namespace mg5amcGpu
 PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
 DeviceBufferHelicityMask devIsGoodHel( ncomb );
 // ... 0d1. Compute good helicity mask on the device
- computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+ gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+ gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
- sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+ gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
- checkCuda( cudaPeekAtLastError() );
+ checkGpu( gpuPeekAtLastError() );
 // ... 0d2. Copy back good helicity mask to the host
 copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
 // ... 0d3.
Copy back good helicity list to constant memory on the device
@@ -219,19 +219,19 @@ namespace mg5amcGpu
 void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
 {
- computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+ gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
 constexpr unsigned int sharedMemSize = 0;
 #else
 constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+ gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
- sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+ gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
- checkCuda( cudaPeekAtLastError() );
- checkCuda( cudaDeviceSynchronize() );
+ checkGpu( gpuPeekAtLastError() );
+ checkGpu( gpuDeviceSynchronize() );
 }
 //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 #ifndef MATRIXELEMENTKERNELS_H
 #define MATRIXELEMENTKERNELS_H 1
@@ -10,7 +10,7 @@
 #include "MemoryBuffers.h"
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -81,7 +81,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating matrix element calculations on a CPU host
 class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents
 {
@@ -130,7 +130,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 // A class encapsulating matrix element calculations on a GPU device
 class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
 {
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h index 0ac4faa3c7..38fade09fb 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
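
The recurring edit in all of these hunks swaps the CUDA-only guard __CUDACC__ for MGONGPUCPP_GPUIMPL, so that one guard can cover both nvcc and hipcc builds. A minimal sketch of the pattern, assuming (this is not quoted from the patch) that the macro is derived elsewhere in the series, e.g. in mgOnGpuConfig.h, roughly as follows:

#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1 // assumption: set for any GPU backend, CUDA or HIP
#endif

// The same source then lands in one of two namespaces, as in the hunks above and below:
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  // identical definitions, compiled once for the GPU backend and once for C++
}
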
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
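
In the MemoryBuffers.h hunks below, direct checkCuda( cuda* ) calls become gpu* macros from GpuAbstraction.h, each of which already wraps its call in checkGpu. A hedged usage sketch (the helper name, buffer type and size are illustrative, not taken from the patch):

#include "GpuAbstraction.h"
#include "GpuRuntime.h" // defines checkGpu/assertGpu, used inside the gpu* macros

#ifdef MGONGPUCPP_GPUIMPL
inline void bufferSketch( const size_t nevt ) // hypothetical helper, not in the patch
{
  double* data = nullptr;
  gpuMalloc( &data, nevt * sizeof( double ) );     // expands to checkGpu( cudaMalloc( ... ) ) under nvcc
  double* host = nullptr;
  gpuMallocHost( &host, nevt * sizeof( double ) ); // pinned host memory (hipHostMalloc under hipcc)
  gpuMemcpy( data, host, nevt * sizeof( double ), gpuMemcpyHostToDevice );
  gpuFreeHost( host );
  gpuFree( data );
}
#endif
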
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index 09575d4a91..b723717621 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -499,7 +500,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -556,7 +557,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -615,7 +616,7 @@ namespace mg5amcCpu 
MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -678,8 +679,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -720,9 +721,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -759,7 +760,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -824,12 +825,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -850,7 +851,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -979,9 +980,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1005,7 +1006,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1026,7 +1027,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int 
helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1040,9 +1041,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1070,7 +1074,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1274,7 +1278,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index 0edca1b52a..ff2cb4ab9a 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
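
The check_sa.cc hunks below rename the CudaRuntime helper to GpuRuntime while keeping its RAII behaviour: device selection in the constructor, device reset booked in the destructor. A minimal sketch of the intended use, with a simplified main that is not quoted from the patch:

#include "GpuRuntime.h"

int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  using namespace mg5amcGpu;
  GpuRuntime gpuRuntime( /*debug=*/true ); // gpuSetDevice(0) here, gpuDeviceReset() at scope exit
#endif
  // ... random numbers, phase space sampling, matrix elements ...
  return 0;
}
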
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. 
Initialise cuda
- // Instantiate a CudaRuntime at the beginnining of the application's main to
- // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
- const std::string cdinKey = "00 CudaInit";
+ // --- 00. Initialise GPU
+ // Instantiate a GpuRuntime at the beginning of the application's main.
+ // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+ const std::string cdinKey = "00 GpuInit";
 timermap.start( cdinKey );
- CudaRuntime cudaRuntime( debug );
+ GpuRuntime gpuRuntime( debug );
 #endif
 // --- 0a. Initialise physics process
@@ -292,7 +293,7 @@ main( int argc, char** argv )
 timermap.start( alloKey );
 // Memory buffers for random numbers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferRndNumMomenta hstRndmom( nevt );
 #else
 PinnedHostBufferRndNumMomenta hstRndmom( nevt );
@@ -300,7 +301,7 @@
 #endif
 // Memory buffers for sampling weights
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferWeights hstWeights( nevt );
 #else
 PinnedHostBufferWeights hstWeights( nevt );
@@ -308,7 +309,7 @@
 #endif
 // Memory buffers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferMomenta hstMomenta( nevt );
 #else
 PinnedHostBufferMomenta hstMomenta( nevt );
@@ -316,7 +317,7 @@
 #endif
 // Memory buffers for Gs
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferGs hstGs( nevt );
 #else
 PinnedHostBufferGs hstGs( nevt );
@@ -333,7 +334,7 @@
 }
 // Memory buffers for matrix elements
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferMatrixElements hstMatrixElements( nevt );
 #else
 PinnedHostBufferMatrixElements hstMatrixElements( nevt );
@@ -342,7 +343,7 @@
 // Memory buffers for random numbers for helicity selection
 // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferRndNumHelicity hstRndHel( nevt );
 #else
 PinnedHostBufferRndNumHelicity hstRndHel( nevt );
@@ -351,7 +352,7 @@
 // Memory buffers for random numbers for color selection
 // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferRndNumColor hstRndCol( nevt );
 #else
 PinnedHostBufferRndNumColor hstRndCol( nevt );
@@ -359,7 +360,7 @@
 #endif
 // Memory buffers for helicity selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferSelectedHelicity hstSelHel( nevt );
 #else
 PinnedHostBufferSelectedHelicity hstSelHel( nevt );
@@ -367,7 +368,7 @@
 #endif
 // Memory buffers for color selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferSelectedColor hstSelCol( nevt );
 #else
 PinnedHostBufferSelectedColor hstSelCol( nevt );
@@ -403,7 +404,7 @@
 #else
 else
 {
- throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
+ throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement)
 }
 #endif
 #else
@@ -421,7 +422,7 @@
 }
 else
 {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 prsk.reset( new RamboSamplingKernelDevice( energy,
devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? 
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
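
The RamboSamplingKernels.cc hunks below replace CUDA triple-chevron launches with the gpuLaunchKernel macro; given the GpuAbstraction.h definitions earlier in this patch, the two forms below should be equivalent under nvcc, with an analogous expansion under hipcc. The kernel and its arguments are copied from the hunk itself:

// new, backend-neutral form
gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() );
// assumed CUDA expansion
getMomentaInitialDevice<<<m_gpublocks, m_gputhreads>>>( m_energy, m_momenta.data() );
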
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index a0397e9ecc..bcb73d7f01 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,69 +89,139 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. 
- # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. 
Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of NVCC + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+  CUOPTFLAGS = -lineinfo
+  GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
+  ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+  ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
+  GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h
+  # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
+  ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+  ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+  ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+  ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+
+  CUBUILDRULEFLAGS = -Xcompiler -fPIC -c
+  CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu
+
+  CUDATESTFLAGS = -lcuda
+
+else ifneq ($(origin REQUIRE_CUDA),undefined)
+  # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443)
+  $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH))
+else
+  # No cuda. Switch cuda compilation off and go to common random numbers in C++
+  $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
+  override GPUCC=
+  override USE_NVTX=
+  override CUINC=
+  override CURANDLIBFLAGS=
+endif
+
+  # Set the host C++ compiler for GPUCC via "-ccbin "
+  # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
+  GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+
+  # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+  ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+    GPUFLAGS += -allow-unsupported-compiler
+  endif
+
+else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc)
+  #=== Configure the HIP compiler
+
+  # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable HIP builds (issue #505)
+  # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below
+  ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
+    $(warning HIP builds are not supported for multi-word CXX "$(CXX)")
+    override HIP_HOME=disabled
+  endif
+
+  # If HIP_HOME is not set, try to set it from the location of GPUCC
+  ifndef HIP_HOME
+    HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH))
+    $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+  endif

-# Set the host C++ compiler for nvcc via "-ccbin "
-# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..."
is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+  # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists
+  ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
+    GPUCC = $(HIP_HOME)/bin/hipcc
+
+    # Should maybe find something equivalent to this in HIP
+    #USE_NVTX ?=-DUSE_NVTX
+
+    HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+    HIPINC = -I$(HIP_HOME)/include/
+
+    # -DHIP_FAST_MATH equivalent to -use_fast_math in HIP
+    # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+    GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+    ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+    GPUFLAGS += -std=c++17
+    # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+
+    CUBUILDRULEFLAGS = -fPIC -c
+    CCBUILDRULEFLAGS = -fPIC -c
+
+  else ifneq ($(origin REQUIRE_HIP),undefined)
+    # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443)
+    $(error No hip installation found (set HIP_HOME or make hipcc visible in PATH))
+  else
+    # No hip. Switch hip compilation off and go to common random numbers in C++
+    $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
+    override GPUCC=
+    override USE_NVTX=
+    override CUINC=
+    override CURANDLIBFLAGS=
+  endif
+
+  # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+  ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+    GPUFLAGS += -allow-unsupported-compiler
+  endif

-# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-CUFLAGS += -allow-unsupported-compiler
endif
+
#-------------------------------------------------------------------------------

#=== Configure ccache for C++ and CUDA builds
@@ -163,9 +233,9 @@
#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
# override AR:=ccache $(AR)
#endif
-ifneq ($(NVCC),)
-  ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
-    override NVCC:=ccache $(NVCC)
+ifneq ($(GPUCC),)
+  ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
+    override GPUCC:=ccache $(GPUCC)
  endif
endif
@@ -189,7 +259,7 @@
# PowerPC-specific CUDA compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le)
-  CUFLAGS+= -Xcompiler -mno-float128
+  GPUFLAGS+= -Xcompiler -mno-float128
endif

#-------------------------------------------------------------------------------
@@ -199,10 +269,10 @@ endif
# Set the default OMPFLAGS choice
ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
override OMPFLAGS = -fopenmp
-###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578)
+###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578)
else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),)
override OMPFLAGS = -fopenmp
-###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578)
+###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578)
else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),)
override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578)
else
@@ -253,7 +323,10 @@ endif
# Set the default RNDGEN (random number generator) choice
ifeq ($(RNDGEN),)
-  ifeq ($(NVCC),)
+  ifeq ($(GPUCC),)
+    override RNDGEN = hasNoCurand
+  # Edge case for HIP compilation
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
    override RNDGEN = hasNoCurand
  else ifeq ($(RNDGEN),)
    override RNDGEN = hasCurand
@@ -328,13 +401,13 @@ CXXFLAGS+= $(AVXFLAGS)
$(info FPTYPE=$(FPTYPE))
ifeq ($(FPTYPE),d)
  CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
-  CUFLAGS  += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
+  GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
else ifeq ($(FPTYPE),f)
  CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
-  CUFLAGS  += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
+  GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
else ifeq ($(FPTYPE),m)
  CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
-  CUFLAGS  += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
+  GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
else
  $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported)
endif
@@ -343,7 +416,7 @@ endif
$(info HELINL=$(HELINL))
ifeq ($(HELINL),1)
  CXXFLAGS += -DMGONGPU_INLINE_HELAMPS
-  CUFLAGS  += -DMGONGPU_INLINE_HELAMPS
+  GPUFLAGS += -DMGONGPU_INLINE_HELAMPS
else ifneq ($(HELINL),0)
  $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported)
endif
@@ -352,7 +425,7 @@ endif
$(info HRDCOD=$(HRDCOD))
ifeq ($(HRDCOD),1)
  CXXFLAGS += -DMGONGPU_HARDCODE_PARAM
-  CUFLAGS  += -DMGONGPU_HARDCODE_PARAM
+  GPUFLAGS += -DMGONGPU_HARDCODE_PARAM
else ifneq ($(HRDCOD),0)
  $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported)
endif
@@ -404,11 +477,11 @@ ifeq ($(UNAME_S),Darwin)
  override CULIBFLAGSRPATH2 =
else
  # RPATH to cuda/cpp libs when linking executables
-  override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH)
-  override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH)
+  override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH)
+  override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH)
  # RPATH to common lib when linking cuda/cpp libs
-  override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN'
-  override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN'
+  override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN'
+  override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN'
endif

# Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac)
@@ -421,7 +494,7 @@
override RUNTIME =

cxx_main=$(BUILDDIR)/check.exe
fcxx_main=$(BUILDDIR)/fcheck.exe

-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_main=$(BUILDDIR)/gcheck.exe
fcu_main=$(BUILDDIR)/fgcheck.exe
else
@@ -452,15 +525,16 @@ $(BUILDDIR)/.build.$(TAG):
	@touch $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@

$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
endif
+# NB: for nvcc, the -x cu flag is now included via CCBUILDRULEFLAGS in the rule above

# Generic target and build rules: objects from C++ compilation
# (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -469,11 +543,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@

# Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117)
+# Added edge case for HIP compilation
ifeq ($(shell $(CXX) --version | grep ^nvc++),)
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
endif
endif
@@ -489,10 +566,10 @@ ifeq ($(RNDGEN),hasCurand)
$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
endif

-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592)
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592)
ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),)
-ifneq ($(NVCC),)
-CUFLAGS += -Xcompiler -Wno-deprecated-builtins
+ifneq ($(GPUCC),)
+GPUFLAGS += -Wno-deprecated-builtins
endif
endif
@@ -500,8 +577,8 @@ endif
# This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -528,7 +605,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -540,11 +617,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -561,16 +638,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
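CommonRandomNumberKernel.o is linked into every executable above, while CurandRandomNumberKernel.o only does real work when RNDGEN=hasCurand; for HIP builds the makefile forces RNDGEN=hasNoCurand, which defines MGONGPU_HAS_NO_CURAND. A minimal sketch of the compile-time selection this enables is shown below; RandomNumberKernelBase, the constructor arguments and the helper name are assumptions for illustration, not the plugin's actual API.

// Sketch only (illustrative names and signatures).
#include "RandomNumberKernels.h"
#include <memory>

inline std::unique_ptr<RandomNumberKernelBase> // assumed base class name
makeRandomNumberKernel( BufferRndNumMomenta& rndmom )
{
#ifdef MGONGPU_HAS_NO_CURAND // set via RNDGEN=hasNoCurand, i.e. HIP or CUDA-less builds
  return std::make_unique<CommonRandomNumberKernel>( rndmom ); // portable host C++ generator
#else
  return std::make_unique<CurandRandomNumberKernel>( rndmom, /*onDevice=*/true ); // curand, CUDA builds only
#endif
}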
#------------------------------------------------------------------------------- @@ -596,17 +673,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -618,7 +695,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -631,7 +708,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -643,12 +720,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -672,14 +749,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
endif

# Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215
@@ -782,9 +859,9 @@ ifeq ($(USECCACHE),1)
	ccache --version | head -1
endif
	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
endif
	@echo ""
	@echo CXX=$(CXX)
@@ -803,7 +880,7 @@ endif

# Target: check (run the C++ test executable)
# [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
check: runTest cmpFcheck cmpFGcheck
else
check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc
index f93c05b0b3..2b956730d4 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

#include "Bridge.h"
#include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

extern "C"
{
@@ -22,7 +22,7 @@ extern "C"
   * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
   * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
   */
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
  using namespace mg5amcGpu;
#else
  using namespace mg5amcCpu;
@@ -46,8 +46,8 @@ extern "C"
   */
  void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
  {
-#ifdef __CUDACC__
-    CudaRuntime::setUp();
+#ifdef MGONGPUCPP_GPUIMPL
+    GpuRuntime::setUp();
#endif
    // Create a process object, read parm card and set parameters
    // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
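The fbridge.cc hunk above swaps the CUDA-only CudaRuntime::setUp/tearDown pair for GpuRuntime equivalents. A minimal sketch of what such a helper can look like is given below; it is an assumption-based illustration, and the real GpuRuntime.h introduced by this patch series may check return codes and differ in detail.

// Sketch only: a backend-neutral replacement for CudaRuntime (illustrative).
#ifdef __CUDACC__
#include <cuda_runtime.h>
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#endif

struct GpuRuntime final
{
  static void setUp() // force context initialisation up front
  {
#ifdef __CUDACC__
    cudaFree( 0 ); // harmless no-op that creates the CUDA context
#elif defined __HIPCC__
    hipFree( 0 ); // same trick for HIP
#endif
  }
  static void tearDown() // clean shutdown, e.g. for leak checkers
  {
#ifdef __CUDACC__
    cudaDeviceReset();
#elif defined __HIPCC__
    hipDeviceReset();
#endif
  }
};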
@@ -69,8 +69,8 @@ extern "C"
    Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
    if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" );
    delete pbridge;
-#ifdef __CUDACC__
-    CudaRuntime::tearDown();
+#ifdef MGONGPUCPP_GPUIMPL
+    GpuRuntime::tearDown();
#endif
  }

@@ -100,7 +100,7 @@ extern "C"
  {
    Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
    if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
    // Use the device/GPU implementation in the CUDA library
    // (there is also a host implementation in this library)
    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol );
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc
index 2fb445372d..3743934f41 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

#include "mgOnGpuConfig.h"

@@ -13,7 +13,7 @@

//--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -40,7 +40,7 @@ namespace mg5amcCpu
  private:
    const int m_nevt; // The number of events in each iteration
    int m_iiter;      // The iteration counter (for random number seeding)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
    HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
    HostBufferMomenta m_hstMomenta;      // Memory buffers for momenta
    HostBufferWeights m_hstWeights;      // Memory buffers for sampling weights
@@ -105,7 +105,7 @@ namespace mg5amcCpu

extern "C"
{
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
  using namespace mg5amcGpu;
#else
  using namespace mg5amcCpu;
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc
index 572e28aaea..461ec5c3a5 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
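Looking back at the fbridge.cc hunks above: all three entry points share one pattern, in which the Fortran side holds an opaque CppObjectInFortran pointer and each C wrapper recovers the typed Bridge with a checked dynamic_cast before using it. A condensed sketch of the pattern follows, with useBridge as a hypothetical stand-in for the real entry points.

// Sketch only: the checked-downcast idiom used by fbridge.cc.
#include "Bridge.h"
#include <stdexcept>

void useBridge( CppObjectInFortran** ppbridge ) // hypothetical name
{
  Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
  if( pbridge == 0 ) throw std::runtime_error( "useBridge: invalid Bridge address" ); // fail fast on a bad handle
  // ... now safe to call e.g. pbridge->gpu_sequence( ... ) as in fbridgesequence_ ...
}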
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc index 989aba1fdc..2bd7a9fcf9 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc index 4243e9fcec..6e8657edca 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h index 5a3a5dc76f..3593d9f169 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc index d3d01102fd..de87dcaf64 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h index 6551d8da81..fe7d686938 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h @@ -214,7 +214,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<>
#endif
@@ -242,7 +242,7 @@ namespace Parameters_sm_dependentCouplings
    // End SM implementation - no special handling of vectors of floats as in EFT (#439)
    return out;
  }
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
#pragma GCC diagnostic pop
#pragma nv_diagnostic pop
#endif
@@ -258,7 +258,7 @@ namespace Parameters_sm_independentCouplings

//==========================================================================

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h
index 6c0c4919e9..d4e37d19b3 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.

#ifndef MGONGPUCONFIG_H
#define MGONGPUCONFIG_H 1

@@ -10,13 +10,26 @@
// There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel)
#undef MGONGPU_SUPPORTS_MULTICHANNEL

+// Is this a GPU (CUDA, HIP) or CPU implementation?
+#ifdef __CUDACC__
+#define MGONGPUCPP_GPUIMPL cuda
+#elif defined __HIPCC__
+#define MGONGPUCPP_GPUIMPL hip
+#else
+#undef MGONGPUCPP_GPUIMPL
+#endif
+
// ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12"
// ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs)

// Choose if curand is supported for generating random numbers
+// For CUDA, by default, it is supported
+// For HIP, by default, it is not supported
// For C++, by default, it is supported, but allow this macro to be set from outside with e.g.
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +65,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +102,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif 
#endif

@@ -131,7 +153,7 @@ namespace mgOnGpu
// Alignment requirement for using reinterpret_cast with SIMD vectorized code
// (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
// Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
#endif

@@ -142,7 +164,7 @@
using mgOnGpu::fptype;
using mgOnGpu::fptype2;

// C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
#undef MGONGPU_CPPSIMD
#elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
#ifdef MGONGPU_FPTYPE_DOUBLE
@@ -172,9 +194,9 @@ using mgOnGpu::fptype2;
#undef MGONGPU_CPPSIMD
#endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -186,8 +208,8 @@ using mgOnGpu::fptype2;
#define mgDebugFinalise() { /*noop*/ }
#endif /* clang-format on */

-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
#define __global__
#define __host__
#define __device__
diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h
index 0cb2f1db7e..4e7ab03fa2 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
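The mgOnGpuCxtypes.h diff below matters for HIP because thrust::complex and cuComplex are CUDA-specific, which leaves cxsmpl as the only complex type available on AMD GPUs. A minimal sketch of what makes a cxsmpl-like type portable is shown here; cxsmpl_sketch is an illustrative name, and the real class is considerably richer (cxmake helpers, mixed precision, operator set).

// Sketch only: a trivially-copyable complex type usable in host and device code.
template<typename FP>
struct cxsmpl_sketch
{
  FP r, i; // real and imaginary parts as plain data members
  constexpr cxsmpl_sketch( const FP r_ = 0, const FP i_ = 0 ) : r( r_ ), i( i_ ) {}
  constexpr FP real() const { return r; }
  constexpr FP imag() const { return i; }
};

// Componentwise product: no library calls, so the same code compiles unchanged
// as plain C++, CUDA device code or HIP device code.
template<typename FP>
constexpr cxsmpl_sketch<FP> operator*( const cxsmpl_sketch<FP> a, const cxsmpl_sketch<FP> b )
{
  return cxsmpl_sketch<FP>( a.r * b.r - a.i * b.i, a.r * b.i + a.i * b.r );
}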
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h index a1cde16a67..6f6cee64d6 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
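mgOnGpuFptypes.h (next) keeps one set of math wrappers whose precision follows the MGONGPU_FPTYPE_* macros set via FPTYPE=d/f/m in cudacpp.mk. A condensed sketch of the idiom is shown here; it assumes the empty __host__/__device__ definitions that mgOnGpuConfig.h provides for plain C++ builds, and the typedef is a simplification of the real header.

// Sketch only, mirroring the header below.
#include <cmath>

#ifdef MGONGPU_FPTYPE_FLOAT
typedef float fptype; // simplified stand-in for the real mgOnGpu::fptype
#else
typedef double fptype;
#endif

inline __host__ __device__ fptype fpsqrt( const fptype& f )
{
#ifdef MGONGPU_FPTYPE_FLOAT
  return sqrtf( f ); // single-precision sqrt (a device intrinsic on GPU)
#else
  return sqrt( f ); // double-precision sqrt
#endif
}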
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h index 9d3e82b1e3..7904b93c61 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttg.sa/src/rambo.h b/epochX/cudacpp/gg_ttg.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
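rambo.h (next) guards its device-only code with "if constexpr( M_ACCESS::isOnDevice() )", so the compiler discards the device branch entirely when instantiating the host variant. A minimal sketch of that policy pattern follows, with illustrative type and function names.

// Sketch only: compile-time host/device branching via an access policy.
struct DeviceAccessSketch { static constexpr bool isOnDevice() { return true; } };
struct HostAccessSketch { static constexpr bool isOnDevice() { return false; } };

template<class M_ACCESS>
__host__ __device__ inline void debugDumpFirstEvent()
{
  if constexpr( M_ACCESS::isOnDevice() )
  {
    // device layout: inspect only event 0 of the current grid (illustrative)
  }
  else
  {
    // host layout: may loop over all events in a SIMD page (illustrative)
  }
}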
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index a14b6d40d3..4f93261b95 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004655122756958008  +DEBUG: model prefixing takes 0.005167722702026367  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.143 s +1 processes with 123 diagrams generated in 0.152 s Total: 1 processes with 123 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -186,15 +186,15 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1151]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1152]  DEBUG: multi_channel_map =  None [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1698]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1811]  +DEBUG: diag_to_config =  {} [model_handling.py at line 1700]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1813]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -204,7 +204,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1430]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxgg.txt [model_handling.py at line 1324]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.380 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.406 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -212,7 +212,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.271 s +ALOHA: aloha creates 5 routines in 0.308 s VVV1 VVV1 FFV1 @@ -254,6 +254,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m1.382s -user 0m1.312s -sys 0m0.060s +real 0m1.524s +user 0m1.397s +sys 0m0.070s diff --git a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h index 4cafe0c997..c04628dfd1 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc index cef4cb3c71..90c7f2d3b8 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
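In the Bridge.h hunks above, raw cudaMemcpy calls become gpuMemcpy and the CUDA triple-chevron launch becomes gpuLaunchKernel, with both macros supplied by the new GpuAbstraction.h added later in this patch. Under nvcc the macro expands straight back to the original syntax, so the rewrite is mechanical; as an illustration (template arguments elided, as in the hunk itself):

// New vendor-neutral call site in Bridge::gpu_sequence
gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads,
                 m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
// ... which under CUDA expands back to the former form:
// dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>(
//   m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );

The same call site compiles unchanged for HIP, where the macro maps onto the hip* runtime instead.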
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -14,7 +15,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..38c477c17a 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... 
) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes!
*** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h index fd7734ce42..d2ff326e20 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc index 30257195b6..d6d6c4f179 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
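The MatrixElementKernels.cc hunks above show the recurring launch idiom of this series: launch through gpuLaunchKernel or gpuLaunchKernelSharedMem, then checkGpu( gpuPeekAtLastError() ) to catch launch failures, plus checkGpu( gpuDeviceSynchronize() ) where results are consumed immediately. A self-contained usage sketch for a CUDA or HIP build (myKernel and its buffer are hypothetical, not part of this patch):

#include "GpuAbstraction.h"
#include "GpuRuntime.h"

__global__ void myKernel( int* out ) { out[threadIdx.x] = threadIdx.x; }

int main()
{
  mg5amcGpu::GpuRuntime runtime;         // ctor calls gpuSetDevice(0), dtor books gpuDeviceReset()
  int* d = nullptr;
  gpuMalloc( &d, 32 * sizeof( int ) );   // checked device allocation
  gpuLaunchKernel( myKernel, 1, 32, d ); // expands to myKernel<<<1, 32>>>( d ) under nvcc
  checkGpu( gpuPeekAtLastError() );      // catch launch-time errors
  checkGpu( gpuDeviceSynchronize() );    // surface asynchronous execution errors
  gpuFree( d );                          // checked device free
  return 0;
}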
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h index 0ac4faa3c7..38fade09fb 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
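MemoryAccessMomenta.h above keeps the AOSOA tuning note under the new guard: on GPUs, neppM is chosen as a power of 2 times the number of fptype values in a 32-byte cacheline so that consecutive threads (events) read adjacent memory, i.e. coalesced loads. A hedged sketch of the indexing this implies, following the momenta[npagM][npar][np4][neppM] layout quoted in Bridge.h earlier (the helper is illustrative, not from the sources):

// Flattened AOSOA index for the momenta buffer, assuming the layout above
inline size_t momentaIndex( size_t ievt, size_t ipar, size_t ip4,
                            size_t npar, size_t np4, size_t neppM )
{
  const size_t ipagM = ievt / neppM; // AOSOA page holding this event
  const size_t ieppM = ievt % neppM; // position of the event within its page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}

Because ieppM runs fastest, threads with consecutive ievt values touch adjacent fptype slots of the same (ipar, ip4) component.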
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index 53ef4c5751..0d88d93225 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -2474,7 +2475,7 @@ namespace mg5amcCpu { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -2531,7 +2532,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ 
START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -2590,7 +2591,7 @@ MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -2685,8 +2686,8 @@ { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -2728,9 +2729,9 @@ // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -2768,7 +2769,7 @@ { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -2833,12 +2834,12 @@ __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -2859,7 +2860,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -2988,9 +2989,9 @@ nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -3014,7 +3015,7 @@ #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for
cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -3035,7 +3036,7 @@ // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in C++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -3049,9 +3050,12 @@ #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -3079,7 +3083,7 @@ // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -3283,7 +3287,7 @@ // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h index b3323a7a84..5fa603d43c 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v.
3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
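The guard renaming above, repeated throughout this patch, is the core of the series: cudacpp sources now test one implementation-agnostic macro, MGONGPUCPP_GPUIMPL, instead of the nvcc-specific __CUDACC__, so the same code paths can be compiled for either a CUDA or a HIP GPU build. A minimal sketch of how such a guard could be derived from the compiler-specific macros (an assumption for illustration; the actual definition lives in the generated headers such as mgOnGpuConfig.h/GpuAbstraction.h and may differ):

// Illustrative sketch only, not the actual header contents.
#if defined __CUDACC__ // nvcc compiles this translation unit: CUDA GPU implementation
#define MGONGPUCPP_GPUIMPL
#elif defined __HIPCC__ // hipcc compiles this translation unit: HIP GPU implementation
#define MGONGPUCPP_GPUIMPL
#endif

Compiler-specific corners (NVTX, curand, the '[Use __NVCC__ ...]' version printout above) keep testing __NVCC__, __CUDACC__ or __HIPCC__ directly.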
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. 
Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -292,7 +293,7 @@ timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +301,7 @@ #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +309,7 @@ #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +317,7 @@ #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +334,7 @@ } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +343,7 @@ // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +352,7 @@ // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +360,7 @@ #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +368,7 @@ #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -403,7 +404,7 @@ #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -421,7 +422,7 @@ } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy,
devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); #endif } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); #endif } @@ -482,7 +483,7 @@ prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE?
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
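The RamboSamplingKernels.cc hunks below replace CUDA's triple-chevron launches, getMomentaInitialDevice<<<m_gpublocks, m_gputhreads>>>( ... ), with a gpuLaunchKernel wrapper, just as the CPPProcess.cc hunks above replaced checkCuda( cudaMemcpyToSymbol( ... ) ) with gpuMemcpyToSymbol and runTest.cc below replaces cudaDeviceReset with gpuDeviceReset. A minimal sketch of the kind of mapping the new GpuAbstraction.h could provide (illustrative assumptions: the checkCuda/checkHip error-checking helpers and the exact macro shapes are not shown in this patch; hipLaunchKernelGGL and HIP_SYMBOL are the standard HIP runtime spellings):

// Illustrative sketch only, not the actual GpuAbstraction.h.
#if defined __CUDACC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkCuda( cudaMemcpyToSymbol( symbol, src, bytes ) )
#define gpuDeviceReset() checkCuda( cudaDeviceReset() )
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkHip( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes ) )
#define gpuDeviceReset() checkHip( hipDeviceReset() )
#define gpuLaunchKernel( kernel, blocks, threads, ... ) hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif

Under nvcc, gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ) then expands back to the original m_gpublocks x m_gputhreads launch.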
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index a0397e9ecc..bcb73d7f01 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,69 +89,139 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. 
- # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. 
Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of NVCC + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+ CUOPTFLAGS = -lineinfo + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + + CUDATESTFLAGS = -lcuda + + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + +else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) + #=== Configure the HIP compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If HIP_HOME is not set, try to set it from the location of GPUCC + ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") + endif -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists + ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + GPUCC = $(HIP_HOME)/bin/hipcc + + # Should maybe find something equivalent to this in HIP + #USE_NVTX ?=-DUSE_NVTX + + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + + # -DHIP_FAST_MATH equivalent to -use_fast_math in HIP + # (But only for single precision, see line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + + else ifneq ($(origin REQUIRE_HIP),undefined) + # If REQUIRE_HIP is set but no hip installation is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + else + # No hip. Switch hip compilation off and go to common random numbers in C++ + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler endif + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -163,9 +233,9 @@ #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif @@ -189,7 +259,7 @@ # PowerPC-specific CUDA compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -199,10 +269,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -253,7 +323,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edge case for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -328,13 +401,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -343,7 +416,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -352,7 +425,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -404,11 +477,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -421,7 +494,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),)
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -452,15 +525,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# (NB: for CUDA builds, the '-x cu' option is now applied via CCBUILDRULEFLAGS in the rule above) # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -469,11 +543,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) +# Added edge case for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -489,10 +566,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -500,8 +577,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -528,7 +605,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -540,11 +617,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -561,16 +638,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -596,17 +673,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -618,7 +695,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -631,7 +708,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -643,12 +720,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -672,14 +749,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -782,9 +859,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -803,7 +880,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc index f93c05b0b3..2b956730d4 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? 
@@ -69,8 +69,8 @@ extern "C" Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ { Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc index 572e28aaea..461ec5c3a5 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc index 989aba1fdc..2bd7a9fcf9 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc index 4243e9fcec..6e8657edca 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h index ee2fcbbde5..f772885631 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc index d3d01102fd..de87dcaf64 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h index 6551d8da81..fe7d686938 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h @@ -214,7 +214,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -242,7 +242,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -258,7 +258,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h index 6c0c4919e9..d4e37d19b3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,13 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +65,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +102,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif 
#endif
@@ -131,7 +153,7 @@ namespace mgOnGpu
 // Alignment requirement for using reinterpret_cast with SIMD vectorized code
 // (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
 // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
 #endif
@@ -142,7 +164,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;
 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -172,9 +194,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif
-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
-#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
+#if defined MGONGPUCPP_GPUIMPL && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -186,8 +208,8 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */
-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
 #define __global__
 #define __host__
 #define __device__
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
index 0cb2f1db7e..4e7ab03fa2 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
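Taken together, the mgOnGpuConfig.h changes reduce to a single compile-time decision tree. The #if ladder below is verbatim from the hunks above; the trailing comments summarise the per-backend choices those hunks spell out:

// Is this a GPU (CUDA, HIP) or CPU implementation?
#ifdef __CUDACC__
#define MGONGPUCPP_GPUIMPL cuda // nvcc build
#elif defined __HIPCC__
#define MGONGPUCPP_GPUIMPL hip // hipcc build
#else
#undef MGONGPUCPP_GPUIMPL // plain C++ build
#endif

// Consequences per backend, as set elsewhere in the file:
// - CUDA: curand available; complex type defaults to thrust::complex.
// - HIP:  MGONGPU_HAS_NO_CURAND is set; cxsmpl is the only complex type.
// - C++:  cxsmpl by default (std::complex optional); SIMD width via MGONGPU_CPPSIMD.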
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h index a1cde16a67..6f6cee64d6 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
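The cxtype typedefs in mgOnGpuCxtypes.h now key on MGONGPUCPP_GPUIMPL as well. Note that the angle-bracket template arguments were lost in the extracted diff text above (e.g. 'typedef thrust::complex cxtype;'); the sketch below restores them from context and should be read as a best-effort reconstruction rather than the verbatim file:

// --- Type definitions (complex type: cxtype), reconstructed sketch ---
#ifdef MGONGPUCPP_GPUIMPL // CUDA or HIP
#if defined MGONGPU_CUCXTYPE_THRUST
typedef thrust::complex<fptype> cxtype; // CUDA only
#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX
#ifdef MGONGPU_FPTYPE_DOUBLE
typedef cuDoubleComplex cxtype; // CUDA only
#else
typedef cuFloatComplex cxtype; // CUDA only
#endif
#else
typedef mgOnGpu::cxsmpl<fptype> cxtype; // the only option on HIP
#endif
#else // C++
#ifdef MGONGPU_CPPCXTYPE_STDCOMPLEX
typedef std::complex<fptype> cxtype;
#else
typedef mgOnGpu::cxsmpl<fptype> cxtype;
#endif
#endif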
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h index 9d3e82b1e3..7904b93c61 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttgg.sa/src/rambo.h b/epochX/cudacpp/gg_ttgg.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
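The mgOnGpuVectors.h hunks are where the new guard matters most for SIMD: in a GPU build each thread handles a single event, so the scalar-or-vector ('_sv') aliases stay scalar and neppV is 1, while the C++ build may alias the same names to SIMD vector types. A condensed sketch of the convention, assembled from the hunks above (the final non-SIMD C++ branch is inferred from context, not shown in the diff):

// Scalar-or-vector aliases: scalar in CUDA/HIP (one event per GPU thread),
// vector or scalar in C++ (one SIMD page of neppV events per call).
#ifdef MGONGPUCPP_GPUIMPL
typedef bool bool_sv;
typedef fptype fptype_sv;
inline __host__ __device__ cxtype cxzero_sv() { return cxtype( 0, 0 ); }
#elif defined MGONGPU_CPPSIMD
typedef bool_v bool_sv;
typedef fptype_v fptype_sv;
inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000
#else
typedef bool bool_sv;
typedef fptype fptype_sv;
inline cxtype cxzero_sv() { return cxtype( 0, 0 ); }
#endif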
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 99d1f8f4a8..5b1f36bf57 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004646778106689453  +DEBUG: model prefixing takes 0.004990100860595703  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.702 s +1 processes with 1240 diagrams generated in 1.812 s Total: 1 processes with 1240 diagrams output madevent CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -175,11 +175,11 @@ INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1592 term in 30s. Introduce 2768 contraction +INFO: Color-Flow passed to 1592 term in 33s. Introduce 2768 contraction DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1028]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1029]  DEBUG: proc_id =  1 [model_handling.py at line 1034]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1286]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1288]  @@ -194,17 +194,17 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 0, 3, 4, 0, 5, 6, 0, 0, 0, 0, 0, 7, 8, 9, 0, 10, 11, 12, 0, 13, 14, 15, 0, 16, 17, 18, 19, 20, 21, 0, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 0, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 0, 67, 68, 69, 70, 71, 72, 73, 74, 75, 0, 76, 77, 78, 79, 80, 81, 82, 83, 84, 0, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 0, 0, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 0, 121, 122, 0, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 0, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 0, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 0, 197, 198, 199, 200, 201, 202, 0, 203, 204, 205, 206, 207, 208, 0, 209, 210, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 0, 226, 227, 0, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 0, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 0, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 0, 302, 303, 304, 305, 306, 307, 0, 308, 309, 310, 311, 312, 313, 0, 314, 315, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 316, 317, 318, 319, 320, 321, 0, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 0, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 0, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 0, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 0, 378, 379, 0, 380, 381, 0, 0, 0, 0, 0, 382, 383, 384, 385, 386, 387, 388, 389, 390, 0, 391, 392, 393, 394, 395, 396, 397, 398, 399, 0, 400, 401, 402, 403, 404, 405, 406, 407, 408, 0, 409, 410, 411, 412, 413, 414, 0, 415, 416, 417, 418, 419, 420, 0, 0, 0, 421, 422, 423, 424, 425, 426, 0, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 0, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 0, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 0, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 0, 483, 484, 0, 485, 486, 0, 0, 0, 0, 0, 487, 488, 489, 490, 491, 492, 493, 494, 495, 0, 496, 497, 498, 499, 500, 501, 502, 503, 504, 0, 505, 506, 507, 508, 509, 510, 511, 512, 513, 0, 514, 515, 516, 517, 518, 519, 0, 520, 521, 522, 523, 524, 525, 0, 0, 0, 526, 527, 528, 529, 530, 531, 0, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 0, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 0, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 0, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 0, 588, 589, 0, 590, 591, 0, 0, 0, 0, 0, 592, 593, 594, 595, 596, 597, 598, 599, 600, 0, 601, 602, 603, 604, 605, 
606, 607, 608, 609, 0, 610, 611, 612, 613, 614, 615, 616, 617, 618, 0, 619, 620, 621, 622, 623, 624, 0, 625, 626, 627, 628, 629, 630, 0, 0, 0, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 0, 664, 665, 666, 667, 668, 669, 0, 670, 671, 672, 673, 674, 675, 0, 0, 0, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 0, 709, 710, 711, 712, 713, 714, 0, 715, 716, 717, 718, 719, 720, 0, 0, 0, 721, 722, 0, 723, 724, 0, 725, 726, 0, 0, 0, 0, 0, 727, 728, 729, 730, 731, 732, 733, 734, 735, 0, 736, 737, 738, 739, 740, 741, 742, 743, 744, 0, 745, 746, 747, 748, 749, 750, 751, 752, 753, 0, 754, 755, 756, 757, 758, 759, 0, 760, 761, 762, 763, 764, 765, 766, 767, 0, 768, 769, 0, 770, 771, 0, 0, 0, 0, 0, 772, 773, 774, 775, 776, 777, 778, 779, 780, 0, 781, 782, 783, 784, 785, 786, 787, 788, 789, 0, 790, 791, 792, 793, 794, 795, 796, 797, 798, 0, 799, 800, 801, 802, 803, 804, 0, 805, 806, 807, 808, 809, 810, 811, 812, 0, 813, 814, 0, 815, 816, 0, 0, 0, 0, 0, 817, 818, 819, 820, 821, 822, 823, 824, 825, 0, 826, 827, 828, 829, 830, 831, 832, 833, 834, 0, 835, 836, 837, 838, 839, 840, 841, 842, 843, 0, 844, 845, 846, 847, 848, 849, 0, 850, 851, 852, 853, 854, 855, 856, 857, 0, 858, 859, 0, 860, 861, 0, 0, 0, 0, 862, 863, 0, 864, 865, 0, 866, 867, 0, 0, 0, 0, 868, 869, 0, 870, 871, 0, 872, 873, 0, 0, 0, 0, 0, 0, 0, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 0, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 0, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 0, 928, 929, 930, 931, 932, 933, 0, 934, 935, 936, 937, 938, 939, 0, 940, 941, 942, 943, 944, 945, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [model_handling.py at line 1152]  DEBUG: multi_channel =  {1: [0], 2: [1], 3: [3], 4: [4], 5: [6], 6: [7], 7: [13], 8: [14], 9: [15], 10: [17], 11: [18], 12: [19], 13: [21], 14: [22], 15: [23], 16: [25], 17: [26], 18: [27], 19: [28], 20: [29], 21: [30], 22: [32], 23: [33], 24: [34], 25: [35], 26: [36], 27: [37], 28: [38], 29: [39], 30: [40], 31: [41], 32: [42], 33: [43], 34: [44], 35: [45], 36: [46], 37: [48], 38: [49], 39: [50], 40: [51], 41: [52], 42: [53], 43: [54], 44: [55], 45: [56], 46: [57], 47: [58], 48: [59], 49: [60], 50: [61], 51: [62], 52: [64], 53: [65], 54: [66], 55: [67], 56: [68], 57: [69], 58: [70], 59: [71], 60: [72], 61: [73], 62: [74], 63: [75], 64: [76], 65: [77], 66: [78], 67: [80], 68: [81], 69: [82], 70: [83], 71: [84], 72: [85], 73: [86], 74: [87], 75: [88], 76: [90], 77: [91], 78: [92], 79: [93], 80: [94], 81: [95], 82: [96], 83: [97], 84: [98], 85: [100], 86: [101], 87: [102], 88: [103], 89: [104], 90: [105], 91: [106], 92: [107], 93: [108], 94: [109], 95: [110], 96: [111], 97: [112], 98: [113], 99: [114], 100: [115], 101: [116], 102: [117], 103: [118], 104: [119], 105: [120], 106: [123], 107: [124], 108: [125], 109: [126], 110: [127], 111: [128], 112: [129], 113: [130], 114: [131], 115: [132], 116: [133], 117: [134], 118: 
[135], 119: [136], 120: [137], 121: [139], 122: [140], 123: [142], 124: [143], 125: [144], 126: [145], 127: [146], 128: [147], 129: [148], 130: [149], 131: [150], 132: [151], 133: [152], 134: [153], 135: [154], 136: [155], 137: [156], 138: [158], 139: [159], 140: [160], 141: [161], 142: [162], 143: [163], 144: [164], 145: [165], 146: [166], 147: [167], 148: [168], 149: [169], 150: [170], 151: [171], 152: [172], 153: [174], 154: [175], 155: [176], 156: [177], 157: [178], 158: [179], 159: [180], 160: [181], 161: [182], 162: [183], 163: [184], 164: [185], 165: [186], 166: [187], 167: [188], 168: [189], 169: [190], 170: [191], 171: [192], 172: [193], 173: [194], 174: [195], 175: [196], 176: [197], 177: [198], 178: [199], 179: [200], 180: [201], 181: [202], 182: [203], 183: [204], 184: [205], 185: [206], 186: [207], 187: [208], 188: [209], 189: [210], 190: [211], 191: [212], 192: [213], 193: [214], 194: [215], 195: [216], 196: [217], 197: [219], 198: [220], 199: [221], 200: [222], 201: [223], 202: [224], 203: [226], 204: [227], 205: [228], 206: [229], 207: [230], 208: [231], 209: [233], 210: [234], 211: [246], 212: [247], 213: [248], 214: [249], 215: [250], 216: [251], 217: [252], 218: [253], 219: [254], 220: [255], 221: [256], 222: [257], 223: [258], 224: [259], 225: [260], 226: [262], 227: [263], 228: [265], 229: [266], 230: [267], 231: [268], 232: [269], 233: [270], 234: [271], 235: [272], 236: [273], 237: [274], 238: [275], 239: [276], 240: [277], 241: [278], 242: [279], 243: [281], 244: [282], 245: [283], 246: [284], 247: [285], 248: [286], 249: [287], 250: [288], 251: [289], 252: [290], 253: [291], 254: [292], 255: [293], 256: [294], 257: [295], 258: [297], 259: [298], 260: [299], 261: [300], 262: [301], 263: [302], 264: [303], 265: [304], 266: [305], 267: [306], 268: [307], 269: [308], 270: [309], 271: [310], 272: [311], 273: [312], 274: [313], 275: [314], 276: [315], 277: [316], 278: [317], 279: [318], 280: [319], 281: [320], 282: [321], 283: [322], 284: [323], 285: [324], 286: [325], 287: [326], 288: [327], 289: [328], 290: [329], 291: [330], 292: [331], 293: [332], 294: [333], 295: [334], 296: [335], 297: [336], 298: [337], 299: [338], 300: [339], 301: [340], 302: [342], 303: [343], 304: [344], 305: [345], 306: [346], 307: [347], 308: [349], 309: [350], 310: [351], 311: [352], 312: [353], 313: [354], 314: [356], 315: [357], 316: [369], 317: [370], 318: [371], 319: [372], 320: [373], 321: [374], 322: [376], 323: [377], 324: [378], 325: [379], 326: [380], 327: [381], 328: [382], 329: [383], 330: [384], 331: [385], 332: [386], 333: [387], 334: [388], 335: [389], 336: [390], 337: [392], 338: [393], 339: [394], 340: [395], 341: [396], 342: [397], 343: [398], 344: [399], 345: [400], 346: [401], 347: [402], 348: [403], 349: [404], 350: [405], 351: [406], 352: [408], 353: [409], 354: [410], 355: [411], 356: [412], 357: [413], 358: [414], 359: [415], 360: [416], 361: [417], 362: [418], 363: [419], 364: [420], 365: [421], 366: [422], 367: [424], 368: [425], 369: [426], 370: [427], 371: [428], 372: [429], 373: [430], 374: [431], 375: [432], 376: [433], 377: [434], 378: [436], 379: [437], 380: [439], 381: [440], 382: [446], 383: [447], 384: [448], 385: [449], 386: [450], 387: [451], 388: [452], 389: [453], 390: [454], 391: [456], 392: [457], 393: [458], 394: [459], 395: [460], 396: [461], 397: [462], 398: [463], 399: [464], 400: [466], 401: [467], 402: [468], 403: [469], 404: [470], 405: [471], 406: [472], 407: [473], 408: [474], 409: [476], 410: [477], 411: [478], 412: [479], 413: [480], 414: 
[481], 415: [483], 416: [484], 417: [485], 418: [486], 419: [487], 420: [488], 421: [492], 422: [493], 423: [494], 424: [495], 425: [496], 426: [497], 427: [499], 428: [500], 429: [501], 430: [502], 431: [503], 432: [504], 433: [505], 434: [506], 435: [507], 436: [508], 437: [509], 438: [510], 439: [511], 440: [512], 441: [513], 442: [515], 443: [516], 444: [517], 445: [518], 446: [519], 447: [520], 448: [521], 449: [522], 450: [523], 451: [524], 452: [525], 453: [526], 454: [527], 455: [528], 456: [529], 457: [531], 458: [532], 459: [533], 460: [534], 461: [535], 462: [536], 463: [537], 464: [538], 465: [539], 466: [540], 467: [541], 468: [542], 469: [543], 470: [544], 471: [545], 472: [547], 473: [548], 474: [549], 475: [550], 476: [551], 477: [552], 478: [553], 479: [554], 480: [555], 481: [556], 482: [557], 483: [559], 484: [560], 485: [562], 486: [563], 487: [569], 488: [570], 489: [571], 490: [572], 491: [573], 492: [574], 493: [575], 494: [576], 495: [577], 496: [579], 497: [580], 498: [581], 499: [582], 500: [583], 501: [584], 502: [585], 503: [586], 504: [587], 505: [589], 506: [590], 507: [591], 508: [592], 509: [593], 510: [594], 511: [595], 512: [596], 513: [597], 514: [599], 515: [600], 516: [601], 517: [602], 518: [603], 519: [604], 520: [606], 521: [607], 522: [608], 523: [609], 524: [610], 525: [611], 526: [615], 527: [616], 528: [617], 529: [618], 530: [619], 531: [620], 532: [622], 533: [623], 534: [624], 535: [625], 536: [626], 537: [627], 538: [628], 539: [629], 540: [630], 541: [631], 542: [632], 543: [633], 544: [634], 545: [635], 546: [636], 547: [638], 548: [639], 549: [640], 550: [641], 551: [642], 552: [643], 553: [644], 554: [645], 555: [646], 556: [647], 557: [648], 558: [649], 559: [650], 560: [651], 561: [652], 562: [654], 563: [655], 564: [656], 565: [657], 566: [658], 567: [659], 568: [660], 569: [661], 570: [662], 571: [663], 572: [664], 573: [665], 574: [666], 575: [667], 576: [668], 577: [670], 578: [671], 579: [672], 580: [673], 581: [674], 582: [675], 583: [676], 584: [677], 585: [678], 586: [679], 587: [680], 588: [682], 589: [683], 590: [685], 591: [686], 592: [692], 593: [693], 594: [694], 595: [695], 596: [696], 597: [697], 598: [698], 599: [699], 600: [700], 601: [702], 602: [703], 603: [704], 604: [705], 605: [706], 606: [707], 607: [708], 608: [709], 609: [710], 610: [712], 611: [713], 612: [714], 613: [715], 614: [716], 615: [717], 616: [718], 617: [719], 618: [720], 619: [722], 620: [723], 621: [724], 622: [725], 623: [726], 624: [727], 625: [729], 626: [730], 627: [731], 628: [732], 629: [733], 630: [734], 631: [738], 632: [739], 633: [740], 634: [741], 635: [742], 636: [743], 637: [744], 638: [745], 639: [746], 640: [747], 641: [748], 642: [749], 643: [750], 644: [751], 645: [752], 646: [753], 647: [754], 648: [755], 649: [756], 650: [757], 651: [758], 652: [759], 653: [760], 654: [761], 655: [762], 656: [763], 657: [764], 658: [765], 659: [766], 660: [767], 661: [768], 662: [769], 663: [770], 664: [772], 665: [773], 666: [774], 667: [775], 668: [776], 669: [777], 670: [779], 671: [780], 672: [781], 673: [782], 674: [783], 675: [784], 676: [788], 677: [789], 678: [790], 679: [791], 680: [792], 681: [793], 682: [794], 683: [795], 684: [796], 685: [797], 686: [798], 687: [799], 688: [800], 689: [801], 690: [802], 691: [803], 692: [804], 693: [805], 694: [806], 695: [807], 696: [808], 697: [809], 698: [810], 699: [811], 700: [812], 701: [813], 702: [814], 703: [815], 704: [816], 705: [817], 706: [818], 707: [819], 708: [820], 709: [822], 710: 
[823], 711: [824], 712: [825], 713: [826], 714: [827], 715: [829], 716: [830], 717: [831], 718: [832], 719: [833], 720: [834], 721: [838], 722: [839], 723: [841], 724: [842], 725: [844], 726: [845], 727: [851], 728: [852], 729: [853], 730: [854], 731: [855], 732: [856], 733: [857], 734: [858], 735: [859], 736: [861], 737: [862], 738: [863], 739: [864], 740: [865], 741: [866], 742: [867], 743: [868], 744: [869], 745: [871], 746: [872], 747: [873], 748: [874], 749: [875], 750: [876], 751: [877], 752: [878], 753: [879], 754: [881], 755: [882], 756: [883], 757: [884], 758: [885], 759: [886], 760: [888], 761: [889], 762: [890], 763: [891], 764: [892], 765: [893], 766: [894], 767: [895], 768: [897], 769: [898], 770: [900], 771: [901], 772: [907], 773: [908], 774: [909], 775: [910], 776: [911], 777: [912], 778: [913], 779: [914], 780: [915], 781: [917], 782: [918], 783: [919], 784: [920], 785: [921], 786: [922], 787: [923], 788: [924], 789: [925], 790: [927], 791: [928], 792: [929], 793: [930], 794: [931], 795: [932], 796: [933], 797: [934], 798: [935], 799: [937], 800: [938], 801: [939], 802: [940], 803: [941], 804: [942], 805: [944], 806: [945], 807: [946], 808: [947], 809: [948], 810: [949], 811: [950], 812: [951], 813: [953], 814: [954], 815: [956], 816: [957], 817: [963], 818: [964], 819: [965], 820: [966], 821: [967], 822: [968], 823: [969], 824: [970], 825: [971], 826: [973], 827: [974], 828: [975], 829: [976], 830: [977], 831: [978], 832: [979], 833: [980], 834: [981], 835: [983], 836: [984], 837: [985], 838: [986], 839: [987], 840: [988], 841: [989], 842: [990], 843: [991], 844: [993], 845: [994], 846: [995], 847: [996], 848: [997], 849: [998], 850: [1000], 851: [1001], 852: [1002], 853: [1003], 854: [1004], 855: [1005], 856: [1006], 857: [1007], 858: [1009], 859: [1010], 860: [1012], 861: [1013], 862: [1018], 863: [1019], 864: [1021], 865: [1022], 866: [1024], 867: [1025], 868: [1030], 869: [1031], 870: [1033], 871: [1034], 872: [1036], 873: [1037], 874: [1045], 875: [1046], 876: [1047], 877: [1048], 878: [1049], 879: [1050], 880: [1051], 881: [1052], 882: [1053], 883: [1054], 884: [1055], 885: [1056], 886: [1057], 887: [1058], 888: [1059], 889: [1060], 890: [1061], 891: [1062], 892: [1064], 893: [1065], 894: [1066], 895: [1067], 896: [1068], 897: [1069], 898: [1070], 899: [1071], 900: [1072], 901: [1073], 902: [1074], 903: [1075], 904: [1076], 905: [1077], 906: [1078], 907: [1079], 908: [1080], 909: [1081], 910: [1083], 911: [1084], 912: [1085], 913: [1086], 914: [1087], 915: [1088], 916: [1089], 917: [1090], 918: [1091], 919: [1092], 920: [1093], 921: [1094], 922: [1095], 923: [1096], 924: [1097], 925: [1098], 926: [1099], 927: [1100], 928: [1102], 929: [1103], 930: [1104], 931: [1105], 932: [1106], 933: [1107], 934: [1109], 935: [1110], 936: [1111], 937: [1112], 938: [1113], 939: [1114], 940: [1116], 941: [1117], 942: [1118], 943: [1119], 944: [1120], 945: [1121]} [model_handling.py at line 1158]  DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [3], 4: [4], 5: [6], 6: [7], 7: [13], 8: [14], 9: [15], 10: [17], 11: [18], 12: [19], 13: [21], 14: [22], 15: [23], 16: [25], 17: [26], 18: [27], 19: [28], 20: [29], 21: [30], 22: [32], 23: [33], 24: [34], 25: [35], 26: [36], 27: [37], 28: [38], 29: [39], 30: [40], 31: [41], 32: [42], 33: [43], 34: [44], 35: [45], 36: [46], 37: [48], 38: [49], 39: [50], 40: [51], 41: [52], 42: [53], 43: [54], 44: [55], 45: [56], 46: [57], 47: [58], 48: [59], 49: [60], 50: [61], 51: [62], 52: [64], 53: [65], 54: [66], 55: [67], 56: [68], 57: [69], 58: [70], 59: 
[71], 60: [72], 61: [73], 62: [74], 63: [75], 64: [76], 65: [77], 66: [78], 67: [80], 68: [81], 69: [82], 70: [83], 71: [84], 72: [85], 73: [86], 74: [87], 75: [88], 76: [90], 77: [91], 78: [92], 79: [93], 80: [94], 81: [95], 82: [96], 83: [97], 84: [98], 85: [100], 86: [101], 87: [102], 88: [103], 89: [104], 90: [105], 91: [106], 92: [107], 93: [108], 94: [109], 95: [110], 96: [111], 97: [112], 98: [113], 99: [114], 100: [115], 101: [116], 102: [117], 103: [118], 104: [119], 105: [120], 106: [123], 107: [124], 108: [125], 109: [126], 110: [127], 111: [128], 112: [129], 113: [130], 114: [131], 115: [132], 116: [133], 117: [134], 118: [135], 119: [136], 120: [137], 121: [139], 122: [140], 123: [142], 124: [143], 125: [144], 126: [145], 127: [146], 128: [147], 129: [148], 130: [149], 131: [150], 132: [151], 133: [152], 134: [153], 135: [154], 136: [155], 137: [156], 138: [158], 139: [159], 140: [160], 141: [161], 142: [162], 143: [163], 144: [164], 145: [165], 146: [166], 147: [167], 148: [168], 149: [169], 150: [170], 151: [171], 152: [172], 153: [174], 154: [175], 155: [176], 156: [177], 157: [178], 158: [179], 159: [180], 160: [181], 161: [182], 162: [183], 163: [184], 164: [185], 165: [186], 166: [187], 167: [188], 168: [189], 169: [190], 170: [191], 171: [192], 172: [193], 173: [194], 174: [195], 175: [196], 176: [197], 177: [198], 178: [199], 179: [200], 180: [201], 181: [202], 182: [203], 183: [204], 184: [205], 185: [206], 186: [207], 187: [208], 188: [209], 189: [210], 190: [211], 191: [212], 192: [213], 193: [214], 194: [215], 195: [216], 196: [217], 197: [219], 198: [220], 199: [221], 200: [222], 201: [223], 202: [224], 203: [226], 204: [227], 205: [228], 206: [229], 207: [230], 208: [231], 209: [233], 210: [234], 211: [246], 212: [247], 213: [248], 214: [249], 215: [250], 216: [251], 217: [252], 218: [253], 219: [254], 220: [255], 221: [256], 222: [257], 223: [258], 224: [259], 225: [260], 226: [262], 227: [263], 228: [265], 229: [266], 230: [267], 231: [268], 232: [269], 233: [270], 234: [271], 235: [272], 236: [273], 237: [274], 238: [275], 239: [276], 240: [277], 241: [278], 242: [279], 243: [281], 244: [282], 245: [283], 246: [284], 247: [285], 248: [286], 249: [287], 250: [288], 251: [289], 252: [290], 253: [291], 254: [292], 255: [293], 256: [294], 257: [295], 258: [297], 259: [298], 260: [299], 261: [300], 262: [301], 263: [302], 264: [303], 265: [304], 266: [305], 267: [306], 268: [307], 269: [308], 270: [309], 271: [310], 272: [311], 273: [312], 274: [313], 275: [314], 276: [315], 277: [316], 278: [317], 279: [318], 280: [319], 281: [320], 282: [321], 283: [322], 284: [323], 285: [324], 286: [325], 287: [326], 288: [327], 289: [328], 290: [329], 291: [330], 292: [331], 293: [332], 294: [333], 295: [334], 296: [335], 297: [336], 298: [337], 299: [338], 300: [339], 301: [340], 302: [342], 303: [343], 304: [344], 305: [345], 306: [346], 307: [347], 308: [349], 309: [350], 310: [351], 311: [352], 312: [353], 313: [354], 314: [356], 315: [357], 316: [369], 317: [370], 318: [371], 319: [372], 320: [373], 321: [374], 322: [376], 323: [377], 324: [378], 325: [379], 326: [380], 327: [381], 328: [382], 329: [383], 330: [384], 331: [385], 332: [386], 333: [387], 334: [388], 335: [389], 336: [390], 337: [392], 338: [393], 339: [394], 340: [395], 341: [396], 342: [397], 343: [398], 344: [399], 345: [400], 346: [401], 347: [402], 348: [403], 349: [404], 350: [405], 351: [406], 352: [408], 353: [409], 354: [410], 355: [411], 356: [412], 357: [413], 358: [414], 359: [415], 360: [416], 
361: [417], 362: [418], 363: [419], 364: [420], 365: [421], 366: [422], 367: [424], 368: [425], 369: [426], 370: [427], 371: [428], 372: [429], 373: [430], 374: [431], 375: [432], 376: [433], 377: [434], 378: [436], 379: [437], 380: [439], 381: [440], 382: [446], 383: [447], 384: [448], 385: [449], 386: [450], 387: [451], 388: [452], 389: [453], 390: [454], 391: [456], 392: [457], 393: [458], 394: [459], 395: [460], 396: [461], 397: [462], 398: [463], 399: [464], 400: [466], 401: [467], 402: [468], 403: [469], 404: [470], 405: [471], 406: [472], 407: [473], 408: [474], 409: [476], 410: [477], 411: [478], 412: [479], 413: [480], 414: [481], 415: [483], 416: [484], 417: [485], 418: [486], 419: [487], 420: [488], 421: [492], 422: [493], 423: [494], 424: [495], 425: [496], 426: [497], 427: [499], 428: [500], 429: [501], 430: [502], 431: [503], 432: [504], 433: [505], 434: [506], 435: [507], 436: [508], 437: [509], 438: [510], 439: [511], 440: [512], 441: [513], 442: [515], 443: [516], 444: [517], 445: [518], 446: [519], 447: [520], 448: [521], 449: [522], 450: [523], 451: [524], 452: [525], 453: [526], 454: [527], 455: [528], 456: [529], 457: [531], 458: [532], 459: [533], 460: [534], 461: [535], 462: [536], 463: [537], 464: [538], 465: [539], 466: [540], 467: [541], 468: [542], 469: [543], 470: [544], 471: [545], 472: [547], 473: [548], 474: [549], 475: [550], 476: [551], 477: [552], 478: [553], 479: [554], 480: [555], 481: [556], 482: [557], 483: [559], 484: [560], 485: [562], 486: [563], 487: [569], 488: [570], 489: [571], 490: [572], 491: [573], 492: [574], 493: [575], 494: [576], 495: [577], 496: [579], 497: [580], 498: [581], 499: [582], 500: [583], 501: [584], 502: [585], 503: [586], 504: [587], 505: [589], 506: [590], 507: [591], 508: [592], 509: [593], 510: [594], 511: [595], 512: [596], 513: [597], 514: [599], 515: [600], 516: [601], 517: [602], 518: [603], 519: [604], 520: [606], 521: [607], 522: [608], 523: [609], 524: [610], 525: [611], 526: [615], 527: [616], 528: [617], 529: [618], 530: [619], 531: [620], 532: [622], 533: [623], 534: [624], 535: [625], 536: [626], 537: [627], 538: [628], 539: [629], 540: [630], 541: [631], 542: [632], 543: [633], 544: [634], 545: [635], 546: [636], 547: [638], 548: [639], 549: [640], 550: [641], 551: [642], 552: [643], 553: [644], 554: [645], 555: [646], 556: [647], 557: [648], 558: [649], 559: [650], 560: [651], 561: [652], 562: [654], 563: [655], 564: [656], 565: [657], 566: [658], 567: [659], 568: [660], 569: [661], 570: [662], 571: [663], 572: [664], 573: [665], 574: [666], 575: [667], 576: [668], 577: [670], 578: [671], 579: [672], 580: [673], 581: [674], 582: [675], 583: [676], 584: [677], 585: [678], 586: [679], 587: [680], 588: [682], 589: [683], 590: [685], 591: [686], 592: [692], 593: [693], 594: [694], 595: [695], 596: [696], 597: [697], 598: [698], 599: [699], 600: [700], 601: [702], 602: [703], 603: [704], 604: [705], 605: [706], 606: [707], 607: [708], 608: [709], 609: [710], 610: [712], 611: [713], 612: [714], 613: [715], 614: [716], 615: [717], 616: [718], 617: [719], 618: [720], 619: [722], 620: [723], 621: [724], 622: [725], 623: [726], 624: [727], 625: [729], 626: [730], 627: [731], 628: [732], 629: [733], 630: [734], 631: [738], 632: [739], 633: [740], 634: [741], 635: [742], 636: [743], 637: [744], 638: [745], 639: [746], 640: [747], 641: [748], 642: [749], 643: [750], 644: [751], 645: [752], 646: [753], 647: [754], 648: [755], 649: [756], 650: [757], 651: [758], 652: [759], 653: [760], 654: [761], 655: [762], 656: [763], 
657: [764], 658: [765], 659: [766], 660: [767], 661: [768], 662: [769], 663: [770], 664: [772], 665: [773], 666: [774], 667: [775], 668: [776], 669: [777], 670: [779], 671: [780], 672: [781], 673: [782], 674: [783], 675: [784], 676: [788], 677: [789], 678: [790], 679: [791], 680: [792], 681: [793], 682: [794], 683: [795], 684: [796], 685: [797], 686: [798], 687: [799], 688: [800], 689: [801], 690: [802], 691: [803], 692: [804], 693: [805], 694: [806], 695: [807], 696: [808], 697: [809], 698: [810], 699: [811], 700: [812], 701: [813], 702: [814], 703: [815], 704: [816], 705: [817], 706: [818], 707: [819], 708: [820], 709: [822], 710: [823], 711: [824], 712: [825], 713: [826], 714: [827], 715: [829], 716: [830], 717: [831], 718: [832], 719: [833], 720: [834], 721: [838], 722: [839], 723: [841], 724: [842], 725: [844], 726: [845], 727: [851], 728: [852], 729: [853], 730: [854], 731: [855], 732: [856], 733: [857], 734: [858], 735: [859], 736: [861], 737: [862], 738: [863], 739: [864], 740: [865], 741: [866], 742: [867], 743: [868], 744: [869], 745: [871], 746: [872], 747: [873], 748: [874], 749: [875], 750: [876], 751: [877], 752: [878], 753: [879], 754: [881], 755: [882], 756: [883], 757: [884], 758: [885], 759: [886], 760: [888], 761: [889], 762: [890], 763: [891], 764: [892], 765: [893], 766: [894], 767: [895], 768: [897], 769: [898], 770: [900], 771: [901], 772: [907], 773: [908], 774: [909], 775: [910], 776: [911], 777: [912], 778: [913], 779: [914], 780: [915], 781: [917], 782: [918], 783: [919], 784: [920], 785: [921], 786: [922], 787: [923], 788: [924], 789: [925], 790: [927], 791: [928], 792: [929], 793: [930], 794: [931], 795: [932], 796: [933], 797: [934], 798: [935], 799: [937], 800: [938], 801: [939], 802: [940], 803: [941], 804: [942], 805: [944], 806: [945], 807: [946], 808: [947], 809: [948], 810: [949], 811: [950], 812: [951], 813: [953], 814: [954], 815: [956], 816: [957], 817: [963], 818: [964], 819: [965], 820: [966], 821: [967], 822: [968], 823: [969], 824: [970], 825: [971], 826: [973], 827: [974], 828: [975], 829: [976], 830: [977], 831: [978], 832: [979], 833: [980], 834: [981], 835: [983], 836: [984], 837: [985], 838: [986], 839: [987], 840: [988], 841: [989], 842: [990], 843: [991], 844: [993], 845: [994], 846: [995], 847: [996], 848: [997], 849: [998], 850: [1000], 851: [1001], 852: [1002], 853: [1003], 854: [1004], 855: [1005], 856: [1006], 857: [1007], 858: [1009], 859: [1010], 860: [1012], 861: [1013], 862: [1018], 863: [1019], 864: [1021], 865: [1022], 866: [1024], 867: [1025], 868: [1030], 869: [1031], 870: [1033], 871: [1034], 872: [1036], 873: [1037], 874: [1045], 875: [1046], 876: [1047], 877: [1048], 878: [1049], 879: [1050], 880: [1051], 881: [1052], 882: [1053], 883: [1054], 884: [1055], 885: [1056], 886: [1057], 887: [1058], 888: [1059], 889: [1060], 890: [1061], 891: [1062], 892: [1064], 893: [1065], 894: [1066], 895: [1067], 896: [1068], 897: [1069], 898: [1070], 899: [1071], 900: [1072], 901: [1073], 902: [1074], 903: [1075], 904: [1076], 905: [1077], 906: [1078], 907: [1079], 908: [1080], 909: [1081], 910: [1083], 911: [1084], 912: [1085], 913: [1086], 914: [1087], 915: [1088], 916: [1089], 917: [1090], 918: [1091], 919: [1092], 920: [1093], 921: [1094], 922: [1095], 923: [1096], 924: [1097], 925: [1098], 926: [1099], 927: [1100], 928: [1102], 929: [1103], 930: [1104], 931: [1105], 932: [1106], 933: [1107], 934: [1109], 935: [1110], 936: [1111], 937: [1112], 938: [1113], 939: [1114], 940: [1116], 941: [1117], 942: [1118], 943: [1119], 944: [1120], 
[... log diff condensed: the tail of the preceding context dump ([model_handling.py at line 1643]) and the 945-entry diag_to_config map are byte-identical before and after this patch; only the source markers shift, the map's from [model_handling.py at line 1698] to [model_handling.py at line 1700] and the vxxxxx() call traces' from lines 1810/1811 to 1812/1813, model_handling.py having grown by two lines. The template line, which repeats before each tuple, is kept once below ...]
+DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812] 
+DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813] 
+DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813] 
+DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1813] 
+DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1813] 
+DEBUG: ('ZERO', 6, 1, 6, 6) [model_handling.py at line 1813] 
INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
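To make the vxxxxx() traces above concrete: each ('ZERO', ...) tuple fills the printf-style template to produce one external-wavefunction call in the generated CPPProcess.cc, where the -1/+1 flag distinguishes initial- from final-state particles. An illustrative substitution (the generated file, not this log, is the authoritative output):

vxxxxx( momenta, m_pars->ZERO, cHel[ihel][0], -1, w_sv[0], 0 ); // from ('ZERO', 0, -1, 0, 0): massless external vector 0, initial state
vxxxxx( momenta, m_pars->ZERO, cHel[ihel][6], +1, w_sv[6], 6 ); // from ('ZERO', 6, 1, 6, 6): massless external vector 6, final state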
DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -221,15 +221,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 5.868 s -Wrote files for 2281 helas calls in 39.490 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.293 s +Wrote files for 2281 helas calls in 43.698 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.272 s +ALOHA: aloha creates 5 routines in 0.302 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -237,7 +237,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.268 s +ALOHA: aloha creates 10 routines in 0.296 s VVV1 VVV1 FFV1 @@ -288,6 +288,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m49.958s -user 0m48.085s -sys 0m0.851s +real 0m54.095s +user 0m53.047s +sys 0m0.756s diff --git a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc index cf4ec946f8..ec923afd6d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = VVVV3_0.o VVVV4P0_1.o VVVV3P0_1.o VVVV1P0_1.o FFV1_1.o FFV1_2.o VVV1P0_1.o VVV1_0.o FFV1_0.o FFV1P0_3.o VVVV1_0.o VVVV4_0.o +ALOHARoutine = FFV1_1.o VVVV4_0.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3_0.o VVVV1_0.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h index 4cafe0c997..c04628dfd1 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
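The Bridge.h hunks that follow introduce the change repeated throughout this patch: every #ifdef __CUDACC__ guard becomes #ifdef MGONGPUCPP_GPUIMPL, decoupling the sources from the CUDA compiler specifically. The patch does not show where MGONGPUCPP_GPUIMPL itself is defined, so the following derivation is only a plausible sketch of the intent (an assumption, not the plugin's actual configuration header):

// Hypothetical (not in this patch): one backend-neutral flag meaning
// "compiling for some GPU", set whichever GPU front-end is active.
#if defined( __CUDACC__ ) || defined( __HIPCC__ )
#define MGONGPUCPP_GPUIMPL 1
#endif

Guarding on the neutral flag lets the same #ifdef select the mg5amcGpu namespace for both CUDA and HIP builds, with mg5amcCpu as the fallback.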
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc index cef4cb3c71..90c7f2d3b8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
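The gpu_sequence hunks above show the new call-site pattern: checkCuda( cudaMemcpy( ... ) ) becomes gpuMemcpy( ... ), and the kernel<<<blocks, threads>>>( ... ) launch syntax becomes gpuLaunchKernel( ... ), both resolved by the macros that GpuAbstraction.h introduces further below. A minimal sketch of the pattern, assuming those macro definitions (scale, copyAndLaunch, hst, dev and n are illustrative names, not part of the patch):

__global__ void scale( double* d, int n ) // toy kernel standing in for dev_transposeMomentaF2C
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) d[i] *= 2.;
}

void copyAndLaunch( const double* hst, double* dev, int n, int blocks, int threads )
{
  gpuMemcpy( dev, hst, n * sizeof( double ), gpuMemcpyHostToDevice ); // expands to checkGpu( cudaMemcpy( ... ) ) or checkGpu( hipMemcpy( ... ) )
  gpuLaunchKernel( scale, blocks, threads, dev, n );                  // expands to scale<<<blocks, threads>>>( dev, n ) on either backend
  checkGpu( gpuPeekAtLastError() );                                   // same error check used in MatrixElementKernels.cc below
}

Because the launch syntax is hidden behind a variadic macro, a single call site compiles unchanged with nvcc or hipcc.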
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -14,7 +15,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..38c477c17a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... 
) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! 
*** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h index fd7734ce42..d2ff326e20 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index 30257195b6..d6d6c4f179 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h index 0ac4faa3c7..38fade09fb 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
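The KernelAccessHelper hunk above leaves the one-thread-per-event index computation untouched and only renames its guard. A compact sketch of the access pattern that the guard selects (processEvent, buffer and nevt are illustrative placeholders, not names from the patch):

#ifdef MGONGPUCPP_GPUIMPL
  // GPU build: each thread derives its own event index from the grid
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  processEvent( buffer, ievt );
#else
  // CPU build: the caller loops over events (possibly in SIMD-vector chunks)
  for( int ievt = 0; ievt < nevt; ievt++ )
    processEvent( buffer, ievt );
#endif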
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
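The MemoryBuffers.h hunks below rewrite the RAII buffer classes so that allocation and release go through the gpu* macros (gpuMallocHost/gpuFreeHost for pinned host memory, gpuMalloc/gpuFree for device memory) instead of the checkCuda-wrapped CUDA calls. A stripped-down sketch of the same idea, assuming the GpuAbstraction.h macros (DeviceArray is an illustrative stand-in for DeviceBufferBase):

template<typename T>
struct DeviceArray
{
  explicit DeviceArray( size_t size ) : m_size( size )
  {
    gpuMalloc( &m_data, bytes() ); // was: checkCuda( cudaMalloc( &m_data, bytes() ) )
  }
  ~DeviceArray() { gpuFree( m_data ); } // was: checkCuda( cudaFree( m_data ) )
  DeviceArray( const DeviceArray& ) = delete;
  DeviceArray& operator=( const DeviceArray& ) = delete;
  T* data() const { return m_data; }
  size_t bytes() const { return m_size * sizeof( T ); }
private:
  T* m_data = nullptr;
  size_t m_size;
};

Since the destructor frees through the same macro layer, the buffers stay leak-free on either backend without any backend-specific code in the class bodies.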
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index 5459588505..caf3f4c49d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v.
3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g g WEIGHTED<=5 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -30018,7 +30019,7 @@ namespace mg5amcCpu { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 
640, 64, 4096, -512 }, { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -30075,7 +30076,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -30134,7 +30135,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -30293,8 +30294,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1, 1 }, { 1, 1, 1, -1, 1, 1, -1 }, { 1, 1, 1, -1, 1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -30337,9 +30338,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -30378,7 +30379,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
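// [Editor's note: MGONGPUCPP_GPUIMPL is defined for any GPU backend (CUDA or HIP), whereas __NVCC__ identifies the nvcc compiler itself; since this block reports which compiler built the file, __NVCC__ is the correct test here]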
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -30443,12 +30444,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -30469,7 +30470,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -30598,9 +30599,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -30624,7 +30625,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -30645,7 +30646,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -30659,9 +30660,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -30689,7 +30693,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -30893,7 +30897,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0];
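// [Editor's note: for g g > t t~ g g g the denominator 1536 = 4 (average over the 2x2 initial-state helicities) x 64 (average over the 8x8 initial-state colors) x 6 (3! permutations of the identical final-state gluons)]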
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index d1dd4d6150..b1f469b1c9 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- 
a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 
1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -292,7 +293,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +301,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +309,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +317,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +334,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +343,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +352,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +360,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +368,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor
hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -403,7 +404,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -421,7 +422,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? 
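// [Editor's note: wrkflwtxt is a compact tag describing the build/run configuration (backend, complex type, random-number source, sampling and ME host/device choices), apparently printed alongside the performance summary; the hunks below add a "HIP:" alternative to each backend-dependent segment]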
#ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index a0397e9ecc..bcb73d7f01 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
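# [Editor's note: OPTFLAGS is shared with the GPU flags configured below, which is why -Ofast/-ffast-math cannot be added here; fast math is instead enabled per backend (-use_fast_math for nvcc, -DHIP_FAST_MATH for hipcc) and explicitly disabled for CrossSectionKernels (see #117).]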
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,69 +89,139 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of NVCC + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
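+ # Example (editor's illustration): 'export MADGRAPH_CUDA_ARCHITECTURE=70,80' embeds device code and PTX for both V100 (sm_70) and A100 (sm_80) in a single build.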
+ MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! + CUOPTFLAGS = -lineinfo + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + + CUDATESTFLAGS = -lcuda + + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + +else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) + #=== Configure the HIP compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable HIP builds (issue #505) + # This is because it is impossible to pass a multi-word CXX down to the GPU build + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning HIP builds are not supported for multi-word CXX "$(CXX)") + override HIP_HOME=disabled + endif + + # If HIP_HOME is not set, try to set it from the location of GPUCC + ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") + endif -# Set the host C++ compiler for nvcc via "-ccbin " -(NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists + ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + GPUCC = $(HIP_HOME)/bin/hipcc + + # Should maybe find something equivalent to this in HIP + #USE_NVTX ?=-DUSE_NVTX + + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + + # -DHIP_FAST_MATH is the HIP equivalent of nvcc's -use_fast_math + # (but only for single precision; see line 208 of https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + + else ifneq ($(origin REQUIRE_HIP),undefined) + # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + else + # No hip.
Switch hip compilation off and go to common random numbers in C++ + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # (NB: -allow-unsupported-compiler is an nvcc-only flag, so no hipcc equivalent of the CUDA branch's ALLOW_UNSUPPORTED_COMPILER_IN_CUDA block is added here) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -163,9 +233,9 @@ #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif @@ -189,7 +259,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -199,10 +269,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -253,7 +323,7 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edge case for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -328,13 +401,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -343,7 +416,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS +=
-DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -352,7 +425,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -404,11 +477,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -421,7 +494,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -452,15 +525,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -469,11 +543,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -489,10 +566,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -500,8 +577,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -528,7 +605,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -540,11 +617,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -561,16 +638,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -596,17 +673,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -618,7 +695,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -631,7 +708,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -643,12 +720,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link 
with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -672,14 +749,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -782,9 +859,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -803,7 +880,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc index f93c05b0b3..2b956730d4 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? 
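The fbridgecreate_/fbridgedelete_ pair shown here ties GPU runtime initialisation to the lifetime of a C++ object hidden behind an opaque Fortran handle. A minimal self-contained sketch of that Fortran-callable lifecycle idiom (the Counter class and counter_* names are hypothetical stand-ins, not the actual Bridge API):

// Sketch of the create/delete idiom used by fbridgecreate_/fbridgedelete_ above.
// 'CppObjectInFortranDemo', 'Counter' and 'counter_*' are illustrative names.
#include <cstdio>
#include <stdexcept>

struct CppObjectInFortranDemo { virtual ~CppObjectInFortranDemo() {} };
struct Counter : CppObjectInFortranDemo { int n = 0; };

extern "C"
{
  // Fortran passes an opaque handle by reference; C++ news/deletes behind it
  void counter_create_( CppObjectInFortranDemo** pphandle )
  {
    // (in the real code, GpuRuntime::setUp() would be called here on GPU builds)
    *pphandle = new Counter();
  }
  void counter_delete_( CppObjectInFortranDemo** pphandle )
  {
    Counter* p = dynamic_cast<Counter*>( *pphandle );
    if( p == 0 ) throw std::runtime_error( "counter_delete_: invalid address" );
    delete p;
    // (in the real code, GpuRuntime::tearDown() would be called here on GPU builds)
  }
}

int main() // emulate the Fortran caller
{
  CppObjectInFortranDemo* handle = 0;
  counter_create_( &handle );
  counter_delete_( &handle );
  printf( "create/delete cycle OK\n" );
  return 0;
}

The dynamic_cast guard mirrors the one in fbridgedelete_: a handle created by a different factory is rejected before anything is deleted.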
@@ -69,8 +69,8 @@ extern "C" Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc index 572e28aaea..461ec5c3a5 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
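Throughout these files the same two-line #ifdef selects which namespace a single translation unit compiles into, so the CUDA/HIP and C++ builds share one source file with no textual duplication. A compilable sketch of the namespace-selection idiom (backend() is an illustrative function, not from the patch):

#include <iostream>

// One source file, two namespaces: the macro decides which one is active.
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  inline const char* backend()
  {
#ifdef MGONGPUCPP_GPUIMPL
    return "GPU (CUDA or HIP)";
#else
    return "CPU";
#endif
  }
}

#ifdef MGONGPUCPP_GPUIMPL
using namespace mg5amcGpu;
#else
using namespace mg5amcCpu;
#endif

int main()
{
  std::cout << "active backend: " << backend() << std::endl;
  return 0;
}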
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc index 989aba1fdc..2bd7a9fcf9 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc index 4243e9fcec..6e8657edca 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h index ee2fcbbde5..f772885631 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc index d3d01102fd..de87dcaf64 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h index 6551d8da81..fe7d686938 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h @@ -214,7 +214,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -242,7 +242,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -258,7 +258,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h index 881353abac..390766116b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,13 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +65,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +102,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif 
#endif @@ -131,7 +153,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -142,7 +164,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -172,9 +194,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -186,8 +208,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h index 0cb2f1db7e..4e7ab03fa2 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
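The mgOnGpuConfig.h changes above funnel every backend decision (curand availability, complex-number type, SIMD width, NSIGHT debug) through one switch derived from the compiler's own predefined macros. A condensed, self-contained sketch of that detection ladder (the DEMO_* names are illustrative, not from the patch):

#include <cstdio>

// Derive one implementation switch from the compiler's predefined macros,
// as mgOnGpuConfig.h does: CUDA and HIP are 'GPU', everything else is C++.
#if defined( __CUDACC__ )
#define DEMO_GPUIMPL_CUDA 1
#elif defined( __HIPCC__ )
#define DEMO_GPUIMPL_HIP 1
#endif

int main()
{
#if defined( DEMO_GPUIMPL_CUDA )
  std::printf( "CUDA build: curand on, thrust::complex by default\n" );
#elif defined( DEMO_GPUIMPL_HIP )
  std::printf( "HIP build: curand off, cxsmpl complex only\n" );
#else
  std::printf( "C++ build: SIMD vectorization possible, cxsmpl by default\n" );
#endif
  return 0;
}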
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h index a1cde16a67..6f6cee64d6 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
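The cxmake overloads in mgOnGpuCxtypes.h above act as a uniform complex-number factory, so kernel code can be written once against cxtype and cxmake whether the underlying type is thrust::complex, cuComplex, std::complex or cxsmpl. A host-only sketch of the overload pattern over one concrete backend (the *_demo names are illustrative):

#include <complex>
#include <iostream>

// Uniform factory over a single concrete backend (std::complex here);
// in the real header the same overload set exists once per backend.
typedef std::complex<double> cxtype_demo;

inline cxtype_demo cxmake_demo( const double r, const double i ) { return cxtype_demo( r, i ); }
inline cxtype_demo cxmake_demo( const std::complex<float>& c ) // float-to-double promotion
{
  return cxmake_demo( c.real(), c.imag() );
}

int main()
{
  const cxtype_demo a = cxmake_demo( 1., 2. );
  const cxtype_demo b = cxmake_demo( std::complex<float>( 3.f, 4.f ) );
  std::cout << a << " " << b << std::endl;
  return 0;
}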
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h index 9d3e82b1e3..7904b93c61 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttggg.mad/src/rambo.h b/epochX/cudacpp/gg_ttggg.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/rambo.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
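The fpsqrt helpers in mgOnGpuFptypes.h above resolve at compile time to the single- or double-precision square root matching MGONGPU_FPTYPE, while the C++ path simply defers to the overloaded std::sqrt. A host-compilable sketch of the same precision dispatch (DEMO_FPTYPE_FLOAT mirrors MGONGPU_FPTYPE_FLOAT and is an assumed flag):

#include <cmath>
#include <cstdio>

// Compile with -DDEMO_FPTYPE_FLOAT to switch precision.
#ifdef DEMO_FPTYPE_FLOAT
typedef float fptype_demo;
#else
typedef double fptype_demo;
#endif

inline fptype_demo fpsqrt_demo( const fptype_demo& f )
{
#ifdef DEMO_FPTYPE_FLOAT
  return sqrtf( f ); // single precision (the device code uses the sqrtf intrinsic)
#else
  return sqrt( f ); // double precision
#endif
}

int main()
{
  std::printf( "fpsqrt(2) = %.7f (sizeof fptype = %zu)\n", (double)fpsqrt_demo( 2 ), sizeof( fptype_demo ) );
  return 0;
}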
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 4105134487..abe8c1ab15 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004633426666259766  +DEBUG: model prefixing takes 0.005353450775146484  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.703 s +1 processes with 1240 diagrams generated in 1.788 s Total: 1 processes with 1240 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -186,17 +186,17 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1151]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1152]  DEBUG: multi_channel_map =  None [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1698]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 6, 1, 6, 6) [model_handling.py at line 1811]  +DEBUG: diag_to_config =  {} [model_handling.py at line 1700]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 6, 1, 6, 6) [model_handling.py at line 1813]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -206,7 +206,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1430]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxggg.txt [model_handling.py at line 1324]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 5.972 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.320 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -214,7 +214,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.296 s +ALOHA: aloha creates 5 routines in 0.331 s VVV1 VVV1 FFV1 @@ -256,6 +256,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/ DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m11.811s -user 0m11.639s -sys 0m0.116s +real 0m12.606s +user 0m12.392s +sys 0m0.114s diff --git a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h index 4cafe0c997..c04628dfd1 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { @@ -333,7 +333,7 @@ } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename Tin, typename Tout> __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc index cef4cb3c71..90c7f2d3b8 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
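The mechanical core of the gpu_sequence change above is that the CUDA-only triple-chevron launch is hidden behind the variadic gpuLaunchKernel macro, which expands back to kernel<<<blocks, threads>>>( args... ) under both nvcc and hipcc. A toy example of that macro, compilable with nvcc (saxpy_demo and the managed-memory setup are illustrative, not from the patch):

// Compile with nvcc; under hipcc the same pattern applies with the HIP runtime.
#include <cstdio>

// Toy mirror of the gpuLaunchKernel macro from GpuAbstraction.h (see below)
#define gpuLaunchKernelDemo( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )

__global__ void saxpy_demo( int n, float a, const float* x, float* y )
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if( i < n ) y[i] = a * x[i] + y[i];
}

int main()
{
  const int n = 1024;
  float *x, *y;
  cudaMallocManaged( &x, n * sizeof( float ) );
  cudaMallocManaged( &y, n * sizeof( float ) );
  for( int i = 0; i < n; i++ ) { x[i] = 1.f; y[i] = 2.f; }
  gpuLaunchKernelDemo( saxpy_demo, n / 256, 256, n, 3.f, x, y ); // was: saxpy_demo<<<n / 256, 256>>>( ... )
  cudaDeviceSynchronize();
  printf( "y[0] = %f (expect 5)\n", y[0] );
  cudaFree( x );
  cudaFree( y );
  return 0;
}

Keeping the launch configuration as ordinary macro arguments is what lets one call site serve both runtimes without a dialect-specific chevron expression in portable code.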
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -14,7 +15,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..38c477c17a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include <cassert> + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... 
) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include <iostream> + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! 
*** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h index fd7734ce42..d2ff326e20 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc index 30257195b6..d6d6c4f179 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper<T> } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h index 0ac4faa3c7..38fade09fb 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
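[Editor's note] The neppM comment above describes the AOSOA ("array of structs of arrays") momenta layout that MemoryAccessMomenta.h encodes: with neppM events per "page", consecutive GPU threads (consecutive ievt) read adjacent fptype values for the same particle and four-momentum component, so global-memory accesses coalesce. A minimal sketch of the indexing arithmetic, assuming a buffer shaped as momenta[npagM][npar][np4][neppM] as the comments describe (the function and variable names here are illustrative, not the plugin's actual API):

#include <cstddef>
// Illustrative AOSOA index computation: for fixed (ipar, ip4), events ievt
// and ievt+1 within one page map to adjacent memory locations, which is
// what makes momenta reads coalesced on the GPU.
inline std::size_t aosoaIndex( std::size_t ievt, std::size_t ipar, std::size_t ip4,
                               std::size_t npar, std::size_t np4, std::size_t neppM )
{
  const std::size_t ipagM = ievt / neppM; // page ("outer event") index
  const std::size_t ieppM = ievt % neppM; // event index within the page
  return ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM;
}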
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
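[Editor's note] The hunks above and below repeat one mechanical change: every #ifdef __CUDACC__ that selects between the mg5amcGpu and mg5amcCpu namespaces becomes #ifdef MGONGPUCPP_GPUIMPL, so the same guard also fires for HIP builds. A minimal sketch of the pattern (the function name is illustrative):

#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU build (CUDA or HIP)
#else
namespace mg5amcCpu // C++/SIMD build
#endif
{
  // One translation unit compiles into either namespace, so the GPU and the
  // CPU build of the same source can be linked into a single executable
  // without multiply-defined symbols.
  void exampleKernelWrapper() {}
}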
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include <sstream> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template<typename T> class PinnedHostBufferBase : public BufferBase<T> { @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase<T>( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template<typename T> class DeviceBufferBase : public BufferBase<T> { @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase<T>( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template<typename T, size_t sizePerEvent, bool ismisaligned> class HostBuffer : public HostBufferBase<T, ismisaligned>, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template<typename T, size_t sizePerEvent> class PinnedHostBuffer : public PinnedHostBufferBase<T>, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer<fptype, sizePerEventRndNumMomenta, HostBufferALIGNED> HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer<fptype, sizePerEventOneFp, HostBufferALIGNED> HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer<fptype, sizePerEventGs, HostBufferALIGNED> HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer<fptype, sizePerEventNumerators, HostBufferALIGNED> HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer<fptype, sizePerEventDenominators, HostBufferALIGNED> HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer<fptype, sizePerEventCouplings, HostBufferALIGNED> HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer<fptype, sizePerEventMomenta, HostBufferALIGNED> HostBufferMomenta; //typedef HostBuffer<fptype, sizePerEventMomenta, HostBufferMISALIGNED> HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer<fptype, sizePerEventWeights, HostBufferALIGNED> HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer<fptype, sizePerEventMatrixElements, HostBufferALIGNED> HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase<bool> BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase<bool, HostBufferALIGNED> HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer<fptype, sizePerEventWavefunctions, HostBufferALIGNED> HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer<fptype, sizePerEventRndNumHelicity, HostBufferALIGNED> HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer<fptype, sizePerEventRndNumColor, HostBufferALIGNED> HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer<int, sizePerEventSelectedHelicity, HostBufferALIGNED> HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer<int, sizePerEventSelectedColor, HostBufferALIGNED> HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index b7a16f1170..a2f1fc1dc2 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g g WEIGHTED<=5 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -31908,7 +31909,7 @@ namespace mg5amcCpu { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 
640, 64, 4096, -512 }, { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -31965,7 +31966,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -32024,7 +32025,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -32183,8 +32184,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1, 1 }, { 1, 1, 1, -1, 1, 1, -1 }, { 1, 1, 1, -1, 1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -32227,9 +32228,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -32268,7 +32269,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -32333,12 +32334,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -32359,7 +32360,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -32488,9 +32489,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -32514,7 +32515,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -32535,7 +32536,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -32549,9 +32550,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -32579,7 +32583,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -32783,7 +32787,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h index d1dd4d6150..b1f469b1c9 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc 
b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 
1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -292,7 +293,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +301,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +309,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +317,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +334,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +343,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +352,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +360,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +368,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor 
hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -403,7 +404,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -421,7 +422,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? 
#ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index a0397e9ecc..bcb73d7f01 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,69 +89,139 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of NVCC + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
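+  # For example (hypothetical values), to embed device code for both V100 and A100
+  # in a single build:
+  #   export MADGRAPH_CUDA_ARCHITECTURE=70,80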
+ MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! + CUOPTFLAGS = -lineinfo + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + + CUDATESTFLAGS = -lcuda + + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported)
+ GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+
+ # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+ ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+ GPUFLAGS += -allow-unsupported-compiler
+ endif
+
+else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc)
+ #=== Configure the HIP compiler
+
+ # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable HIP builds (issue #505)
+ # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below
+ ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
+ $(warning HIP builds are not supported for multi-word CXX "$(CXX)")
+ override CUDA_HOME=disabled
+ endif
+
+ # If HIP_HOME is not set, try to set it from the location of GPUCC
+ ifndef HIP_HOME
+ HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH))
+ $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+ endif
-# Set the host C++ compiler for nvcc via "-ccbin "
-# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+ # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists
+ ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
+ GPUCC = $(HIP_HOME)/bin/hipcc
+
+ # Should maybe find something equivalent to this in HIP
+ #USE_NVTX ?=-DUSE_NVTX
+
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+ HIPINC = -I$(HIP_HOME)/include/
+
+ # -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP
+ # (But only for single precision; see line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+ GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+ ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+ GPUFLAGS += -std=c++17
+ # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+
+ CUBUILDRULEFLAGS = -fPIC -c
+ CCBUILDRULEFLAGS = -fPIC -c
+
+ else ifneq ($(origin REQUIRE_HIP),undefined)
+ # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443)
+ $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
+ else
+ # No hip. 
Switch hip compilation off and go to common random numbers in C++ + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -163,9 +233,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif @@ -189,7 +259,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -199,10 +269,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -253,7 +323,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -328,13 +401,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -343,7 +416,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += 
-DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -352,7 +425,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -404,11 +477,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -421,7 +494,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -452,15 +525,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -469,11 +543,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -489,10 +566,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -500,8 +577,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -528,7 +605,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -540,11 +617,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -561,16 +638,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -596,17 +673,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -618,7 +695,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -631,7 +708,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -643,12 +720,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link 
with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -672,14 +749,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -782,9 +859,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -803,7 +880,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc index f93c05b0b3..2b956730d4 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? 
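// The GpuRuntime::setUp()/tearDown() pair used above takes over the one-time device
// initialisation and cleanup that CudaRuntime previously handled for CUDA only.
// A rough sketch of the expected shape (an assumption for illustration; the real
// class lives in GpuRuntime.h, which is not shown in this patch):
//
//   struct GpuRuntime final
//   {
//     static void setUp() { gpuFree( 0 ); }        // force early context creation
//     static void tearDown() { gpuDeviceReset(); } // helps GPU leak checkers
//   };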
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc index 572e28aaea..461ec5c3a5 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc index 989aba1fdc..2bd7a9fcf9 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc index 4243e9fcec..6e8657edca 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h index ee2fcbbde5..f772885631 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc index d3d01102fd..de87dcaf64 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h index 6551d8da81..fe7d686938 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h @@ -214,7 +214,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -242,7 +242,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -258,7 +258,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h index 6c0c4919e9..d4e37d19b3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,13 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +65,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +102,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif 
#endif @@ -131,7 +153,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -142,7 +164,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -172,9 +194,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -186,8 +208,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h index 0cb2f1db7e..4e7ab03fa2 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
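// Net effect of the backend choices encoded in mgOnGpuConfig.h above on the cxtype
// typedef defined further below (illustrative summary):
//   CUDA : cxtype = thrust::complex (default; cuComplex and cxsmpl as alternatives)
//   HIP  : cxtype = cxsmpl (the only option, as thrust and cuComplex are CUDA-only)
//   C++  : cxtype = cxsmpl (default; std::complex as the alternative)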
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h index a1cde16a67..6f6cee64d6 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
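// The cxmake overloads above give every backend a uniform complex constructor, so
// process code can be written once for all targets; e.g. (illustrative):
//   cxtype z = cxmake( 1., 0. ); // thrust::complex under the CUDA default, cxsmpl under the HIP/C++ defaults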
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h index 9d3e82b1e3..7904b93c61 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttggg.sa/src/rambo.h b/epochX/cudacpp/gg_ttggg.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
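// In the scalar-or-vector scheme of mgOnGpuVectors.h above, fptype_sv resolves to a
// plain fptype in CUDA/HIP builds (one event per GPU thread, neppV=1) and to the SIMD
// vector type fptype_v holding neppV events in vectorized C++ builds, so the same
// kernel source compiles unchanged for both targets.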
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 1505c1a4b4..bab8b8f779 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~; generate g q > t t~ q INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004947662353515625  +DEBUG: model prefixing takes 0.00509190559387207  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -169,7 +169,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.071 s +8 processes with 40 diagrams generated in 0.075 s Total: 8 processes with 40 diagrams output madevent CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -199,7 +199,7 @@ INFO: Creating files in directory P1_gu_ttxu DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1028]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1029]  DEBUG: proc_id =  1 [model_handling.py at line 1034]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1286]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1288]  @@ -214,9 +214,9 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 3, 4, 5] [model_handling.py at line 1152]  DEBUG: multi_channel =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4]} [model_handling.py at line 1158]  DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4]} [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1698]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  +DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1700]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -237,7 +237,7 @@ INFO: Creating files in directory P1_gux_ttxux DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1028]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1029]  DEBUG: proc_id =  1 [model_handling.py at line 1034]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1286]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1288]  @@ -252,9 +252,9 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 3, 4, 5] [model_handling.py at line 1152]  DEBUG: multi_channel =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4]} [model_handling.py at line 1158]  DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4]} [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1698]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  +DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1700]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -271,17 +271,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -Generated helas calls for 2 subprocesses (10 diagrams) in 0.028 s -Wrote files for 32 helas calls in 0.213 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.029 s +Wrote files for 32 helas calls in 0.223 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.126 s +ALOHA: aloha creates 2 routines in 0.138 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.112 s +ALOHA: aloha creates 4 routines in 0.124 s FFV1 FFV1 FFV1 @@ -325,6 +325,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.861s -user 0m1.623s -sys 0m0.202s +real 0m2.313s +user 0m1.757s +sys 0m0.177s diff --git a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_file.inc index 0c895f2b2c..4457933199 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV1_1.o FFV1_2.o VVV1_0.o FFV1_0.o FFV1P0_3.o +ALOHARoutine = FFV1_1.o FFV1_0.o VVV1_0.o FFV1_2.o FFV1P0_3.o diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h index 4cafe0c997..c04628dfd1 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc index cef4cb3c71..90c7f2d3b8 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
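The Bridge.h hunks above are representative of the whole patch: each raw CUDA call (checkCuda( cudaMemcpy( ... ) ), kernel<<<blocks, threads>>> launches) is rewritten against the vendor-neutral gpu* macros defined in the new GpuAbstraction.h further down in this patch. A minimal standalone sketch of the same idiom, not part of the patch: the kernel 'scale' and all sizes are invented, and MGONGPUCPP_GPUIMPL is assumed to be defined by the GPU build, as for any GPU build of the plugin.

#include "GpuRuntime.h" // also pulls in GpuAbstraction.h

// Invented kernel: doubles each of the n entries of d.
__global__ void scale( double* d, const int n )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) d[i] *= 2.;
}

int main()
{
  mg5amcGpu::GpuRuntime runtime; // RAII: gpuSetDevice(0) now, gpuDeviceReset() in the dtor
  const int n = 512;
  double* hst = nullptr;
  double* dev = nullptr;
  gpuMallocHost( &hst, n * sizeof( double ) ); // cudaMallocHost or hipHostMalloc
  gpuMalloc( &dev, n * sizeof( double ) );     // cudaMalloc or hipMalloc
  for( int i = 0; i < n; i++ ) hst[i] = 1.;
  gpuMemcpy( dev, hst, n * sizeof( double ), gpuMemcpyHostToDevice );
  gpuLaunchKernel( scale, 2, 256, dev, n );    // scale<<<2, 256>>>( dev, n ) on CUDA
  checkGpu( gpuPeekAtLastError() );
  checkGpu( gpuDeviceSynchronize() );
  gpuMemcpy( hst, dev, n * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuFree( dev );     // cudaFree or hipFree
  gpuFreeHost( hst ); // cudaFreeHost or hipHostFree
  return 0;
}

On a HIP build the same source expands to the hip* equivalents instead, which is the point of the abstraction layer.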
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -14,7 +15,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..38c477c17a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) +
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ...
) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes!
*** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h index fd7734ce42..d2ff326e20 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc index 30257195b6..d6d6c4f179 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h index 0ac4faa3c7..38fade09fb 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
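The MemoryBuffers.h hunks that follow port every pinned-host and device buffer to the same macros: gpuMallocHost/gpuFreeHost and gpuMalloc/gpuFree in the constructors and destructors, gpuMemcpy in the copy helpers. A minimal sketch of how these RAII buffers pair with the copy helpers on a GPU build, not part of the patch: the MatrixElements typedef names follow the pattern used elsewhere in this file, and 'nevt' is illustrative.

#include "MemoryBuffers.h"

// Invented example: allocation, transfer and cleanup stay behind the RAII
// buffer classes and copy helpers, with no direct cudaMalloc/hipMalloc calls.
void exampleReadback()
{
  const size_t nevt = 16;                                   // illustrative size
  mg5amcGpu::PinnedHostBufferMatrixElements hstMEs( nevt ); // ctor calls gpuMallocHost
  mg5amcGpu::DeviceBufferMatrixElements devMEs( nevt );     // ctor calls gpuMalloc
  // ... kernels fill devMEs on the device ...
  mg5amcGpu::copyHostFromDevice( hstMEs, devMEs );          // gpuMemcpy DeviceToHost
  // destructors call gpuFreeHost / gpuFree when the buffers go out of scope
}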
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 3a9b2fddaf..3732ec6679 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ d WEIGHTED<=3 @1 // Process: g s > t t~ s WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -243,7 +244,7 @@ namespace mg5amcCpu // Wavefunction(s) for diagram number 1 vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); -#if not( defined __CUDACC__ and defined MGONGPU_TEST_DIVERGENCE ) +#if not( defined MGONGPUCPP_GPUIMPL and defined MGONGPU_TEST_DIVERGENCE ) imzxxx( momenta, cHel[ihel][1], +1, w_fp[1], 1 ); // NB: imzxxx only uses pz #else if( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 == 0 ) @@ -348,7 +349,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly 
normalized #475 struct TriangularNormalizedColorMatrix { @@ -405,7 +406,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -464,7 +465,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -527,8 +528,8 @@ namespace mg5amcCpu { 1, -1, 1, 1, 1 }, { 1, -1, 1, -1, -1 }, { 1, -1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -569,9 +570,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -608,7 +609,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
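A few hunks above, the checkCuda( cudaMemcpyToSymbol( ... ) ) calls for cHel and cIPD become vendor-neutral gpuMemcpyToSymbol( ... ) calls, which the new GpuAbstraction.h header maps onto the CUDA or HIP runtime at compile time. As a rough sketch only (the real header may differ; the checkGpu error helper shown here is an illustrative assumption, not code from this patch), such a mapping could look like:

#include <cassert>
#include <cstdio>
#if defined __CUDACC__ // nvcc build: map gpu* onto the CUDA runtime
inline void checkGpu( cudaError_t code ) // hypothetical error-check helper
{
  if( code != cudaSuccess ) { printf( "GPU error: %s\n", cudaGetErrorString( code ) ); assert( false ); }
}
#define gpuMemcpyToSymbol( symbol, src, count ) checkGpu( cudaMemcpyToSymbol( symbol, src, count ) )
#elif defined __HIPCC__ // hipcc build: map gpu* onto the HIP runtime
#include <hip/hip_runtime.h>
inline void checkGpu( hipError_t code ) // hypothetical error-check helper
{
  if( code != hipSuccess ) { printf( "GPU error: %s\n", hipGetErrorString( code ) ); assert( false ); }
}
#define gpuMemcpyToSymbol( symbol, src, count ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, count ) )
#endif

Either way, gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) then compiles unchanged on both GPU backends.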
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -673,12 +674,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -699,7 +700,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -828,9 +829,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -854,7 +855,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -875,7 +876,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -889,9 +890,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -919,7 +923,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1123,7 +1127,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index 9554285817..ee747a8ae4 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc +++ 
b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 
1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -292,7 +293,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +301,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +309,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +317,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +334,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +343,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +352,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +360,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +368,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor 
hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -403,7 +404,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -421,7 +422,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? 
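The CUD:/HIP:/CPP: tags assembled next record at compile time which backend produced the executable. The same three-way branch can be exercised standalone; a minimal sketch using only the standard compiler macros:

#include <iostream>
int main()
{
  // Mirror the branches used to build wrkflwtxt and the Process= banner
#ifdef __CUDACC__
  std::cout << "CUD: compiled by nvcc as CUDA code" << std::endl;
#elif defined __HIPCC__
  std::cout << "HIP: compiled by hipcc as HIP code" << std::endl;
#else
  std::cout << "CPP: compiled as plain C++ code" << std::endl;
#endif
  return 0;
}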
#ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 408fbb52f9..af91413156 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ d~ WEIGHTED<=3 @1 // Process: g s~ > t t~ s~ WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ 
-203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, -1 }, { 1, 1, 1, -1, 1 }, { 1, 1, 1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
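Note the nesting added around "#pragma nv_diagnostic pop" above: the outer #ifdef MGONGPUCPP_GPUIMPL selects any GPU build, while the inner #ifdef __CUDACC__ keeps nvcc-only pragmas hidden from hipcc. Schematically (the suppressed diagnostic number below is a hypothetical example, not taken from this patch):

#ifdef MGONGPUCPP_GPUIMPL // any GPU backend (CUDA or HIP)
#ifdef __CUDACC__ // nvcc only: hipcc does not understand nv_* pragmas
#pragma nv_diagnostic push
#pragma nv_diag_suppress 177 // e.g. "variable declared but never referenced"
#endif
// ... device code shared by the CUDA and HIP builds ...
#ifdef __CUDACC__
#pragma nv_diagnostic pop
#endif
#endif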
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -821,9 +822,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -847,7 +848,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -868,7 +869,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -882,9 +883,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -912,7 +916,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1116,7 +1120,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index 1d1e130ec2..53bb5ccd94 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- 
a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 
1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -292,7 +293,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +301,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +309,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +317,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +334,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +343,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +352,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +360,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +368,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor 
hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -403,7 +404,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -421,7 +422,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? 
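The "00 GpuInit" step above wraps GPU initialisation in an RAII GpuRuntime object. A minimal sketch of what such a helper could look like on the CUDA side, based only on the comments in this patch (set the device in the constructor, book a reset in the destructor; the exact class layout is an assumption, and the HIP branch would call hipSetDevice/hipDeviceReset instead):

#include <cstdio>
struct GpuRuntime
{
  GpuRuntime( const bool debug = false ) : m_debug( debug )
  {
    if( m_debug ) printf( "GpuRuntime: calling cudaSetDevice(0)\n" );
    cudaSetDevice( 0 ); // select the first visible device for the whole run
  }
  ~GpuRuntime()
  {
    if( m_debug ) printf( "GpuRuntime: calling cudaDeviceReset()\n" );
    cudaDeviceReset(); // release device state when main() unwinds
  }
  const bool m_debug;
};

Instantiating it once at the top of main() ties device setup and teardown to the scope of the whole run.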
#ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index a0397e9ecc..bcb73d7f01 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
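As the RandomNumberKernels.h comment above notes, mgOnGpuConfig.h supplies a definition of __global__ for builds where MGONGPUCPP_GPUIMPL is not defined. A minimal sketch of that host-build fallback (the exact set of keywords covered is an assumption):

#ifndef MGONGPUCPP_GPUIMPL // host-only build: no CUDA/HIP attribute keywords
#ifndef __global__
#define __global__ // kernel qualifiers become no-ops for the C++ compiler
#endif
#ifndef __device__
#define __device__
#endif
#ifndef __constant__
#define __constant__
#endif
#endif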
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,69 +89,139 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of NVCC + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
+ MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! + CUOPTFLAGS = -lineinfo + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + + CUDATESTFLAGS = -lcuda + + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported)
+  GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+
+  # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+  ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+    GPUFLAGS += -allow-unsupported-compiler
+  endif
+
+else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc)
+  #=== Configure the HIP compiler
+
+  # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable HIP builds (issue #505)
+  # This is because it is impossible to pass this to "GPUFLAGS += -ccbin <host-compiler>" below
+  ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <host-compiler>" from outside
+    $(warning HIP builds are not supported for multi-word CXX "$(CXX)")
+    override HIP_HOME=disabled
+  endif
+
+  # If HIP_HOME is not set, try to set it from the location of GPUCC
+  ifndef HIP_HOME
+    HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH))
+    $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+  endif
 
-# Set the host C++ compiler for nvcc via "-ccbin <host-compiler>"
-# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+  # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists
+  ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
+    GPUCC = $(HIP_HOME)/bin/hipcc
+
+    # Should maybe find something equivalent to this (NVTX) in HIP
+    #USE_NVTX ?=-DUSE_NVTX
+
+    HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+    HIPINC = -I$(HIP_HOME)/include/
+
+    # -DHIP_FAST_MATH is the HIP equivalent of -use_fast_math in CUDA
+    # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+    GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+    ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+    GPUFLAGS += -std=c++17
+    # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+
+    CUBUILDRULEFLAGS = -fPIC -c
+    CCBUILDRULEFLAGS = -fPIC -c
+
+  else ifneq ($(origin REQUIRE_HIP),undefined)
+    # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443)
+    $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
+  else
+    # No hip.
Switch hip compilation off and go to common random numbers in C++ + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -163,9 +233,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif @@ -189,7 +259,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -199,10 +269,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -253,7 +323,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -328,13 +401,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -343,7 +416,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += 
-DMGONGPU_INLINE_HELAMPS
 else ifneq ($(HELINL),0)
 $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported)
 endif
 
@@ -352,7 +425,7 @@ endif
 $(info HRDCOD=$(HRDCOD))
 ifeq ($(HRDCOD),1)
 CXXFLAGS += -DMGONGPU_HARDCODE_PARAM
-CUFLAGS += -DMGONGPU_HARDCODE_PARAM
+GPUFLAGS += -DMGONGPU_HARDCODE_PARAM
 else ifneq ($(HRDCOD),0)
 $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported)
 endif
 
@@ -404,11 +477,11 @@ ifeq ($(UNAME_S),Darwin)
 override CULIBFLAGSRPATH2 =
 else
 # RPATH to cuda/cpp libs when linking executables
-override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH)
-override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH)
+override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH)
+override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH)
 # RPATH to common lib when linking cuda/cpp libs
-override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN'
-override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN'
+override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN'
+override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN'
 endif
 
 # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac)
@@ -421,7 +494,7 @@ override RUNTIME =
 
 cxx_main=$(BUILDDIR)/check.exe
 fcxx_main=$(BUILDDIR)/fcheck.exe
 
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 cu_main=$(BUILDDIR)/gcheck.exe
 fcu_main=$(BUILDDIR)/fgcheck.exe
 else
@@ -452,15 +525,16 @@ $(BUILDDIR)/.build.$(TAG):
 	@touch $(BUILDDIR)/.build.$(TAG)
 
 # Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@
 
 $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
 endif
+# NB: the '-x cu' flag (compile .cc sources as CUDA) dropped from the rule above is now included via CCBUILDRULEFLAGS for nvcc builds
 
 # Generic target and build rules: objects from C++ compilation
 # (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -469,11 +543,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@
 
 # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117)
+# Added an edge case for HIP compilation (hipcc takes -fno-fast-math directly, without -Xcompiler)
 ifeq ($(shell $(CXX) --version | grep ^nvc++),)
 $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
 $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
 endif
 endif
 
@@ -489,10 +566,10 @@ ifeq ($(RNDGEN),hasCurand)
 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
 endif
 
-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592)
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_...
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -500,8 +577,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -528,7 +605,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -540,11 +617,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -561,16 +638,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -596,17 +673,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -618,7 +695,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -631,7 +708,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -643,12 +720,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link 
with GPUCC (undefined reference to `__svml_cos4_l9')
 else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc
 endif
 
@@ -672,14 +749,14 @@ $(testmain): LIBFLAGS += -lgomp
 endif
 endif
 
-ifeq ($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
 $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
 	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
 else # link both runTest.o and runTest_cu.o
 $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
 endif
 
 # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215
@@ -782,9 +859,9 @@ ifeq ($(USECCACHE),1)
 	ccache --version | head -1
 endif
 	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
 endif
 	@echo ""
 	@echo CXX=$(CXX)
@@ -803,7 +880,7 @@ endif
 # Target: check (run the C++ test executable)
 # [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 check: runTest cmpFcheck cmpFGcheck
 else
 check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc
index f93c05b0b3..2b956730d4 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
 
 extern "C"
 {
@@ -22,7 +22,7 @@ extern "C"
   * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
   * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
   */
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   using namespace mg5amcGpu;
#else
   using namespace mg5amcCpu;
@@ -46,8 +46,8 @@ extern "C"
   */
  void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
  {
-#ifdef __CUDACC__
-    CudaRuntime::setUp();
+#ifdef MGONGPUCPP_GPUIMPL
+    GpuRuntime::setUp();
#endif
    // Create a process object, read param card and set parameters
    // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
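// (Annotation, not part of the patch: an illustrative sketch of the same
// lifecycle seen from C++, assuming the Bridge API declared in Bridge.h;
// the names follow the calls visible in this file and in the hunks below.
//   Bridge<double> bridge( nevt, npar, np4 );  // what fbridgecreate_ allocates; GpuRuntime::setUp() is called first on GPU builds
//   bridge.gpu_sequence( momenta, gs, rndhel, rndcol, channelId, mes, selhel, selcol );  // or cpu_sequence(...) on CPU builds
//   // on deletion, fbridgedelete_ frees the Bridge and calls GpuRuntime::tearDown() on GPU builds
// The only behavioural change in this file is CudaRuntime -> GpuRuntime.)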
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc index 572e28aaea..461ec5c3a5 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc index 989aba1fdc..2bd7a9fcf9 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc index 4243e9fcec..6e8657edca 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h index e15ce959e9..9b0bfb10ee 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc index 7255e49119..459dae9e99 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h index c935779eb3..db5520aa96 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h @@ -211,7 +211,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<>
 #endif
@@ -238,7 +238,7 @@ namespace Parameters_sm_dependentCouplings
     // End SM implementation - no special handling of vectors of floats as in EFT (#439)
     return out;
   }
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 #pragma GCC diagnostic pop
 #pragma nv_diagnostic pop
 #endif
@@ -254,7 +254,7 @@ namespace Parameters_sm_independentCouplings
 
 //==========================================================================
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h
index 881353abac..390766116b 100644
--- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MGONGPUCONFIG_H
 #define MGONGPUCONFIG_H 1
 
@@ -10,13 +10,26 @@
 // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel)
 #define MGONGPU_SUPPORTS_MULTICHANNEL 1
 
+// Is this a GPU (CUDA, HIP) or CPU implementation?
+#ifdef __CUDACC__
+#define MGONGPUCPP_GPUIMPL cuda
+#elif defined __HIPCC__
+#define MGONGPUCPP_GPUIMPL hip
+#else
+#undef MGONGPUCPP_GPUIMPL
+#endif
+
 // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12"
 // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs)
 
 // Choose if curand is supported for generating random numbers
+// For CUDA, by default, it is supported
+// For HIP, by default, it is not supported
 // For C++, by default, do not use curand, but allow this macro to be set from outside with e.g.
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +65,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +102,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif 
#endif
 
@@ -131,7 +153,7 @@ namespace mgOnGpu
 // Alignment requirement for using reinterpret_cast with SIMD vectorized code
 // (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
 // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
 #endif
 
@@ -142,7 +164,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;
 
 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -172,9 +194,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif
 
-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
 #if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -186,8 +208,8 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */
 
-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
 #define __global__
 #define __host__
 #define __device__
diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h
index 0cb2f1db7e..4e7ab03fa2 100644
--- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
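// (Annotation, not part of the patch: the selection pattern that the new
// MGONGPUCPP_GPUIMPL macro enables, exactly as used throughout these files.
//   #ifdef MGONGPUCPP_GPUIMPL  // defined for CUDA (via __CUDACC__) or HIP (via __HIPCC__)
//   namespace mg5amcGpu { /* device implementation */ }
//   #else                      // CPU build
//   namespace mg5amcCpu { /* host implementation */ }
//   #endif
// CUDA-only features, e.g. the nsight debug macros above, still test __CUDACC__ directly.)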
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h index a1cde16a67..6f6cee64d6 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
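// (Annotation, not part of the patch: a summary of how the complex type cxtype
// now resolves per backend, following the MGONGPU_CUCXTYPE_* and
// MGONGPU_CPPCXTYPE_* macros in mgOnGpuConfig.h above.
//   CUDA: thrust::complex (default), cuComplex, or cxsmpl
//   HIP:  cxsmpl (the only option)
//   C++:  cxsmpl (default) or std::complex)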
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h index 9d3e82b1e3..7904b93c61 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gq_ttq.mad/src/rambo.h b/epochX/cudacpp/gq_ttq.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/rambo.h +++ b/epochX/cudacpp/gq_ttq.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
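// (Annotation, not part of the patch: the fptype_sv / cxtype_sv
// "scalar-or-vector" aliases in mgOnGpuVectors.h above keep a single source for
// both backends: under MGONGPUCPP_GPUIMPL they are plain scalars with neppV=1,
// one event per GPU thread, while C++ builds may map them to SIMD vectors of
// neppV events; this is why the GPU branch undefines MGONGPU_CPPSIMD.)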
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index b973f7da7e..bfaddc5def 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~; generate g q > t t~ q INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0045931339263916016  +DEBUG: model prefixing takes 0.005072832107543945  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -169,7 +169,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.070 s +8 processes with 40 diagrams generated in 0.075 s Total: 8 processes with 40 diagrams output standalone_cudacpp CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_SA_OUTPUT @@ -208,9 +208,9 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1151]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1152]  DEBUG: multi_channel_map =  None [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1698]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  +DEBUG: diag_to_config =  {} [model_handling.py at line 1700]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -239,9 +239,9 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1151]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1152]  DEBUG: multi_channel_map =  None [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1698]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  +DEBUG: diag_to_config =  {} [model_handling.py at line 1700]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -251,12 +251,12 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1430]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gux_ttxux.txt [model_handling.py at line 1324]  -Generated helas calls for 2 subprocesses (10 diagrams) in 0.027 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.029 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.124 s +ALOHA: aloha creates 2 routines in 0.137 s FFV1 FFV1 FFV1 @@ -291,6 +291,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gq_ttq/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.705s -user 0m0.639s -sys 0m0.059s +real 0m0.791s +user 0m0.673s +sys 0m0.063s diff --git a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h index 4cafe0c997..c04628dfd1 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc index cef4cb3c71..90c7f2d3b8 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
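The Bridge.h hunks above route all memory copies and kernel launches through the new gpuMemcpy / gpuLaunchKernel macros (defined in GpuAbstraction.h further below) instead of calling the CUDA runtime directly. For reference, a minimal host-side sketch of the momenta transposition that dev_transposeMomentaF2C performs on the device, matching the AOSOA layout momenta[npagM][npar][np4][neppM] with nevt = npagM * neppM quoted in the hunk (the helper name and the assumed Fortran-side AOS layout in[ievt][ipar][ip4] are mine, not from the patch):

inline void transposeMomentaF2C( const double* in, double* out,
                                 int nevt, int npar, int np4, int neppM )
{
  for( int ievt = 0; ievt < nevt; ievt++ )
    for( int ipar = 0; ipar < npar; ipar++ )
      for( int ip4 = 0; ip4 < np4; ip4++ )
      {
        const int ipagM = ievt / neppM; // event "page"
        const int ieppM = ievt % neppM; // event within the page
        out[( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM] =
          in[( ievt * npar + ipar ) * np4 + ip4];
      }
}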
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -14,7 +15,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..38c477c17a 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... 
) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! 
*** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h index fd7734ce42..d2ff326e20 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc index 30257195b6..d6d6c4f179 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h index 0ac4faa3c7..38fade09fb 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
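The MemoryAccessMomenta.h hunk above keeps neppM as a compile-time constant because it determines memory coalescing on GPUs. A toy kernel illustrating the point (illustrative only; the kernel and buffer names are mine, and the AOSOA layout momenta[npagM][npar][np4][neppM] is the one described in the Bridge.h hunks earlier):

__global__ void readMomentaCoalesced( const double* momenta, double* out,
                                      int nevt, int npar, int np4, int neppM )
{
  // one thread per event, as in KernelAccessHelper: consecutive threads in a
  // warp have consecutive ieppM, hence read consecutive fptypes of the same
  // (ipar, ip4) slot - this is what makes the global memory access coalesced
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt >= nevt ) return;
  const int ipagM = ievt / neppM;
  const int ieppM = ievt % neppM;
  out[ievt] = momenta[( ( ipagM * npar + 0 ) * np4 + 0 ) * neppM + ieppM]; // first component of particle 0
}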
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
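The MemoryBuffers.h hunks that follow port the host/pinned/device buffer classes to the same gpuMallocHost / gpuMalloc / gpuFree macros. As a reference for the pattern, a simplified RAII sketch of the device-buffer idea (the class name is mine; the real DeviceBufferBase sits in a BufferBase hierarchy, as the hunks below show, and the gpu* macros are those from GpuAbstraction.h / GpuRuntime.h above):

template<typename T>
class ToyDeviceBuffer
{
public:
  explicit ToyDeviceBuffer( const size_t size ) : m_size( size )
  {
    gpuMalloc( &m_data, bytes() ); // expands to checkGpu( cudaMalloc/hipMalloc( ... ) )
  }
  ~ToyDeviceBuffer() { gpuFree( m_data ); } // release in the destructor: no leaks
  ToyDeviceBuffer( const ToyDeviceBuffer& ) = delete; // non-copyable, like BufferBase
  ToyDeviceBuffer& operator=( const ToyDeviceBuffer& ) = delete;
  T* data() { return m_data; }
  size_t bytes() const { return m_size * sizeof( T ); }
private:
  const size_t m_size;
  T* m_data = nullptr;
};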
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc index abcb3c9654..0c59a4bcd3 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ d WEIGHTED<=3 @1 // Process: g s > t t~ s WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -243,7 +244,7 @@ namespace mg5amcCpu // Wavefunction(s) for diagram number 1 vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); -#if not( defined __CUDACC__ and defined MGONGPU_TEST_DIVERGENCE ) +#if not( defined MGONGPUCPP_GPUIMPL and defined MGONGPU_TEST_DIVERGENCE ) imzxxx( momenta, cHel[ihel][1], +1, w_fp[1], 1 ); // NB: imzxxx only uses pz #else if( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 == 0 ) @@ -343,7 +344,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly 
normalized #475 struct TriangularNormalizedColorMatrix { @@ -400,7 +401,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -459,7 +460,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -522,8 +523,8 @@ namespace mg5amcCpu { 1, -1, 1, 1, 1 }, { 1, -1, 1, -1, -1 }, { 1, -1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -564,9 +565,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -603,7 +604,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -668,12 +669,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -694,7 +695,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -823,9 +824,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -849,7 +850,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -870,7 +871,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -884,9 +885,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -914,7 +918,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h index 9554285817..ee747a8ae4 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc index 
f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 
1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -292,7 +293,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +301,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +309,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +317,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +334,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +343,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +352,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +360,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +368,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor
hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -403,7 +404,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -421,7 +422,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? 
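// A minimal sketch of how the MGONGPUCPP_GPUIMPL switch used throughout this patch can be derived (an illustrative assumption: the real definition is expected to live in GpuAbstraction.h, which is not spelled out in these hunks):
#if defined __CUDACC__ || defined __HIPCC__ // either GPU compiler is active
#define MGONGPUCPP_GPUIMPL 1 // one switch meaning "some GPU implementation is being built"
#endif
// By contrast, the chain below still tests __CUDACC__ and __HIPCC__ directly, because the printed tag must name the specific backend.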
#ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index 12f74f99ea..c828a1d13b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ d~ WEIGHTED<=3 @1 // Process: g s~ > t t~ s~ WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both 
dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -336,7 +337,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -393,7 +394,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -452,7 +453,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -515,8 +516,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, -1 }, { 1, 1, 1, -1, 1 }, { 1, 1, 1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -557,9 +558,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -596,7 +597,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -661,12 +662,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -687,7 +688,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -816,9 +817,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -842,7 +843,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -863,7 +864,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -877,9 +878,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -907,7 +911,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1111,7 +1115,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h index 1d1e130ec2..53bb5ccd94 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc 
b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 
1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -292,7 +293,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +301,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +309,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +317,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +334,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +343,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +352,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +360,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +368,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor
hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -403,7 +404,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -421,7 +422,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? 
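// A sketch of the RAII pattern behind the "00 GpuInit" step above (an illustrative assumption: the real class comes from the new GpuRuntime.h symlink and is not spelled out in this patch):
struct GpuRuntimeSketch
{
  GpuRuntimeSketch() { cudaSetDevice( 0 ); }  // claim device 0 as soon as main starts
  ~GpuRuntimeSketch() { cudaDeviceReset(); }  // book the device reset for when main exits
};
// For HIP the analogous calls would be hipSetDevice( 0 ) and hipDeviceReset().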
#ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
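// A sketch of the wrappers that this patch substitutes for the CUDA-only idioms in the hunks above (illustrative assumptions: the real macros are expected in GpuAbstraction.h, which is added as a symlink and not spelled out here):
#ifdef __CUDACC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkCuda( cudaMemcpyToSymbol( symbol, src, bytes ) )
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes )
#define gpuLaunchKernel( kernel, blocks, threads, ... ) hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif
// Either way the call sites read identically, e.g. gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() );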
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index a0397e9ecc..bcb73d7f01 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
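# The compiler detection introduced below locates nvcc/hipcc and resolves any symlink chain by hand before matching on the binary name. As an illustration (hypothetical paths): with /usr/local/bin/nvcc -> /opt/cuda/bin/nvcc, the loop
#   compiler="`which nvcc 2>/dev/null`"
#   while [ -L "$compiler" ]; do compiler=`readlink "$compiler"`; done
# ends with compiler=/opt/cuda/bin/nvcc, so the 'findstring nvcc' test is applied to the real executable path rather than to the symlink.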
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,69 +89,139 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of NVCC + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
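# As a usage example (hypothetical invocation, values taken from the comment above): a build embedding device code for both V100 and A100 can be driven with
#   MADGRAPH_CUDA_ARCHITECTURE=70,80 make
# which the CUARCHFLAGS foreach below expands into one pair of -gencode options per listed architecture.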
+ MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! + CUOPTFLAGS = -lineinfo + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + + CUDATESTFLAGS = -lcuda + + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + +else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) + #=== Configure the HIP compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable HIP builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning HIP builds are not supported for multi-word CXX "$(CXX)") + override HIP_HOME=disabled + endif + + # If HIP_HOME is not set, try to set it from the location of hipcc + ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") + endif -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists + ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + GPUCC = $(HIP_HOME)/bin/hipcc + + # Should maybe find something equivalent to this in HIP + #USE_NVTX ?=-DUSE_NVTX + + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + + # -DHIP_FAST_MATH is the HIP equivalent of -use_fast_math + # (but only for single precision, see line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + + else ifneq ($(origin REQUIRE_HIP),undefined) + # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + else + # No hip.
Switch hip compilation off and go to common random numbers in C++
+ $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
+ override GPUCC=
+ override USE_NVTX=
+ override CUINC=
+ override CURANDLIBFLAGS=
+ endif
+
+ # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+ ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+ GPUFLAGS += -allow-unsupported-compiler
+ endif
-# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-CUFLAGS += -allow-unsupported-compiler
endif
+
#-------------------------------------------------------------------------------
#=== Configure ccache for C++ and CUDA builds
@@ -163,9 +233,9 @@ endif
#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
# override AR:=ccache $(AR)
#endif
-ifneq ($(NVCC),)
- ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
- override NVCC:=ccache $(NVCC)
+ifneq ($(GPUCC),)
+ ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
+ override GPUCC:=ccache $(GPUCC)
endif
endif
@@ -189,7 +259,7 @@ endif
# PowerPC-specific CUDA compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le)
- CUFLAGS+= -Xcompiler -mno-float128
+ GPUFLAGS+= -Xcompiler -mno-float128
endif
#-------------------------------------------------------------------------------
@@ -199,10 +269,10 @@ endif
# Set the default OMPFLAGS choice
ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
override OMPFLAGS = -fopenmp
-###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578)
+###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578)
else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),)
override OMPFLAGS = -fopenmp
-###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578)
+###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578)
else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),)
override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578)
else
@@ -253,7 +323,10 @@ endif
# Set the default RNDGEN (random number generator) choice
ifeq ($(RNDGEN),)
- ifeq ($(NVCC),)
+ ifeq ($(GPUCC),)
+ override RNDGEN = hasNoCurand
+ # Edge case for HIP compilation (no curand with hipcc)
+ else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
override RNDGEN = hasNoCurand
else ifeq ($(RNDGEN),)
override RNDGEN = hasCurand
@@ -328,13 +401,13 @@ CXXFLAGS+= $(AVXFLAGS)
$(info FPTYPE=$(FPTYPE))
ifeq ($(FPTYPE),d)
CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
- CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
+ GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
else ifeq ($(FPTYPE),f)
CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
- CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
+ GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
else ifeq ($(FPTYPE),m)
CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
- CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
+ GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
else
$(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported)
endif
@@ -343,7 +416,7 @@ endif
$(info HELINL=$(HELINL))
ifeq ($(HELINL),1)
CXXFLAGS += -DMGONGPU_INLINE_HELAMPS
- CUFLAGS += -DMGONGPU_INLINE_HELAMPS
+ GPUFLAGS += 
-DMGONGPU_INLINE_HELAMPS
else ifneq ($(HELINL),0)
$(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported)
endif
@@ -352,7 +425,7 @@ endif
$(info HRDCOD=$(HRDCOD))
ifeq ($(HRDCOD),1)
CXXFLAGS += -DMGONGPU_HARDCODE_PARAM
- CUFLAGS += -DMGONGPU_HARDCODE_PARAM
+ GPUFLAGS += -DMGONGPU_HARDCODE_PARAM
else ifneq ($(HRDCOD),0)
$(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported)
endif
@@ -404,11 +477,11 @@ ifeq ($(UNAME_S),Darwin)
override CULIBFLAGSRPATH2 =
else
# RPATH to cuda/cpp libs when linking executables
- override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH)
- override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH)
+ override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH)
+ override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH)
# RPATH to common lib when linking cuda/cpp libs
- override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN'
- override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN'
+ override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN'
+ override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN'
endif
# Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac)
@@ -421,7 +494,7 @@ override RUNTIME =
cxx_main=$(BUILDDIR)/check.exe
fcxx_main=$(BUILDDIR)/fcheck.exe
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_main=$(BUILDDIR)/gcheck.exe
fcu_main=$(BUILDDIR)/fgcheck.exe
else
@@ -452,15 +525,16 @@ $(BUILDDIR)/.build.$(TAG):
@touch $(BUILDDIR)/.build.$(TAG)
# Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
- $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+ $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@
$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
- $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+ $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
endif
+# NB: for CUDA builds, the "-x cu" flag from the rule above is now included via $(CCBUILDRULEFLAGS)
# Generic target and build rules: objects from C++ compilation
# (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -469,11 +543,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
$(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@
# Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117)
+# Added an edge case for HIP compilation (hipcc takes -fno-fast-math directly, without -Xcompiler)
ifeq ($(shell $(CXX) --version | grep ^nvc++),)
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+ $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+ $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
endif
endif
@@ -489,10 +566,10 @@ ifeq ($(RNDGEN),hasCurand)
$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
endif
-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -500,8 +577,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -528,7 +605,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -540,11 +617,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -561,16 +638,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -596,17 +673,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -618,7 +695,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -631,7 +708,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -643,12 +720,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link 
with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -672,14 +749,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -782,9 +859,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -803,7 +880,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc index f93c05b0b3..2b956730d4 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? 
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc index 572e28aaea..461ec5c3a5 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc index 989aba1fdc..2bd7a9fcf9 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc index 4243e9fcec..6e8657edca 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h index e15ce959e9..9b0bfb10ee 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc index 7255e49119..459dae9e99 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h index c935779eb3..db5520aa96 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h @@ -211,7 +211,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -238,7 +238,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -254,7 +254,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h index 6c0c4919e9..d4e37d19b3 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,13 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +65,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +102,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif 
#endif @@ -131,7 +153,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -142,7 +164,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -172,9 +194,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -186,8 +208,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h index 0cb2f1db7e..4e7ab03fa2 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h index a1cde16a67..6f6cee64d6 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h index 9d3e82b1e3..7904b93c61 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gq_ttq.sa/src/rambo.h b/epochX/cudacpp/gq_ttq.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/rambo.h +++ b/epochX/cudacpp/gq_ttq.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 33f009f727..2b0816a6e3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -127,7 +127,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1 INFO: Process has 1 diagrams -1 processes with 1 diagrams generated in 0.003 s +1 processes with 1 diagrams generated in 0.004 s Total: 1 processes with 1 diagrams output standalone_cudacpp CODEGEN_cudacpp_heft_gg_h Load PLUGIN.CUDACPP_SA_OUTPUT @@ -158,11 +158,11 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1151]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1152]  DEBUG: multi_channel_map =  None [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1698]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  +DEBUG: diag_to_config =  {} [model_handling.py at line 1700]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -176,7 +176,7 @@ Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines -ALOHA: aloha creates 1 routines in 0.053 s +ALOHA: aloha creates 1 routines in 0.058 s VVS3 FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h/src/. @@ -207,6 +207,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h/src DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.806s -user 0m0.424s -sys 0m0.052s +real 0m0.612s +user 0m0.439s +sys 0m0.057s diff --git a/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT b/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT +++ b/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h index 4cafe0c997..c04628dfd1 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc index cef4cb3c71..90c7f2d3b8 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -14,7 +15,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..38c477c17a 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
 #include "MemoryBuffers.h"
 #include "RandomNumberKernels.h"
@@ -114,7 +114,7 @@ namespace mg5amcCpu
 /*
 printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() );
 fptype* data = m_rnarray.data();
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 if( m_rnarray.isOnDevice() )
 {
 data = new fptype[m_rnarray.size()]();
@@ -123,7 +123,7 @@ namespace mg5amcCpu
 #endif
 for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ )
 printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 if( m_rnarray.isOnDevice() ) delete[] data;
 #endif
 */
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h
index 48b51e0a49..b425a5bade 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #ifndef EventStatistics_H
 #define EventStatistics_H 1
@@ -16,7 +16,7 @@
 #include
 #include

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h
new file mode 100644
index 0000000000..6a7d9c05c0
--- /dev/null
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h
@@ -0,0 +1,71 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuRuntime.h
new file mode 100644
index 0000000000..93579ef08b
--- /dev/null
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+ if( code != gpuSuccess )
+ {
+ printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+ if( abort ) assert( code == gpuSuccess );
+ }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+ // Instantiate a GpuRuntime at the beginning of the application's main to
+ // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+ // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes!
*** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h index fd7734ce42..d2ff326e20 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow,
@@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME )
 }
 }

-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL

 #endif /* MADGRAPHTEST_H_ */
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc
index 30257195b6..d6d6c4f179 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc
@@ -1,12 +1,12 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #include "MatrixElementKernels.h"

 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation
 #include "MemoryAccessMomenta.h"
 #include "MemoryBuffers.h"

@@ -14,7 +14,7 @@

 //============================================================================

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu
 {

@@ -143,7 +143,7 @@ namespace mg5amcCpu

 //============================================================================

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {

@@ -202,13 +202,13 @@ namespace mg5amcGpu
 PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
 DeviceBufferHelicityMask devIsGoodHel( ncomb );
 // ... 0d1. Compute good helicity mask on the device
- computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+ gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+ gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
- sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+ gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
- checkCuda( cudaPeekAtLastError() );
+ checkGpu( gpuPeekAtLastError() );
 // ... 0d2. Copy back good helicity mask to the host
 copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
 // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -219,19 +219,19 @@ namespace mg5amcGpu

 void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
 {
- computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+ gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
 constexpr unsigned int sharedMemSize = 0;
 #else
 constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+ gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
- sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+ gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
- checkCuda( cudaPeekAtLastError() );
- checkCuda( cudaDeviceSynchronize() );
+ checkGpu( gpuPeekAtLastError() );
+ checkGpu( gpuDeviceSynchronize() );
 }

 //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h
index 23e84757a2..72bd8f195b 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #ifndef MATRIXELEMENTKERNELS_H
 #define MATRIXELEMENTKERNELS_H 1
@@ -10,7 +10,7 @@

 #include "MemoryBuffers.h"

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -81,7 +81,7 @@ namespace mg5amcCpu

 //--------------------------------------------------------------------------

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating matrix element calculations on a CPU host
 class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents
 {
@@ -130,7 +130,7 @@ namespace mg5amcCpu

 //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 // A class encapsulating matrix element calculations on a GPU device
 class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
 {
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h
index c82a6c7635..db73e4e064 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h index 0ac4faa3c7..38fade09fb 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
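
[Note on the new guard: every hunk in this series swaps the CUDA-only __CUDACC__ guard for the MGONGPUCPP_GPUIMPL umbrella guard, which is meant to be true for any GPU backend. The patch itself does not show where that macro is set; a minimal sketch of the intended definition, assuming it lives in a shared configuration header such as mgOnGpuConfig.h, is:

// Sketch only (assumed location): one umbrella guard instead of per-vendor checks,
// set whenever the translation unit is compiled by a CUDA or HIP compiler
#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1
#endif

With this single guard, each file selects the GPU namespace (mg5amcGpu) and code path identically for both vendors, while __CUDACC__/__HIPCC__ remain available for the few genuinely vendor-specific spots.]
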
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h index 8109470148..78004e66cc 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
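
[The MemoryBuffers.h hunks below replace each raw checkCuda( cudaMalloc(...) )/checkCuda( cudaFree(...) ) pair with the vendor-neutral gpuMalloc/gpuFree macros inside the existing RAII buffer classes. As a minimal sketch of that idiom (not the plugin's actual BufferBase hierarchy; DeviceArray is an illustrative name):

// Illustrative only: allocation in the constructor, release in the destructor,
// built on the new gpu* macros from GpuAbstraction.h
template<typename T>
struct DeviceArray
{
  T* data = nullptr;
  explicit DeviceArray( const size_t n ) { gpuMalloc( &data, n * sizeof( T ) ); }
  ~DeviceArray() { gpuFree( data ); }
  DeviceArray( const DeviceArray& ) = delete; // single owner per device buffer
};

The same pattern with gpuMallocHost/gpuFreeHost gives pinned host buffers, which is exactly what the PinnedHostBufferBase hunks below do.]
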
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_heft.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc index e0c90c3bc7..c3fcba5970 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_heft.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu //__device__ const fptype* cIPD = nullptr; // unused as nparam=0 __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //__device__ __constant__ fptype* cIPD = nullptr; // unused as nparam=0 __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -268,7 +269,7 @@ namespace mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 2 } }; // 2-D array[1][1] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -325,7 +326,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined 
MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -384,7 +385,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -419,8 +420,8 @@ namespace mg5amcCpu { -1, 1, 0 }, { 1, -1, 0 }, { 1, 1, 0 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -459,9 +460,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory //const fptype tIPD[0] = { ... }; // nparam=0 //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - //checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 0 * sizeof( fptype ) ) ); // nparam=0 - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + //gpuMemcpyToSymbol( cIPD, tIPD, 0 * sizeof( fptype ) ); // nparam=0 + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else //memcpy( cIPD, tIPD, 0 * sizeof( fptype ) ); // nparam=0 //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -495,7 +496,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -560,12 +561,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -586,7 +587,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -715,9 +716,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -741,7 +742,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -762,7 +763,7 @@ 
namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -776,9 +777,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -806,7 +810,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1010,7 +1014,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h index 1210ee05bc..3febdd5abe 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuAbstraction.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
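
[The check_sa.cc hunks below rename the CudaRuntime RAII helper to GpuRuntime. For orientation, the intended usage is simply a scoped instance at the top of main; this is a sketch under that assumption, not a verbatim excerpt:

#include "GpuRuntime.h"
int main( int argc, char** argv )
{
#ifdef MGONGPUCPP_GPUIMPL
  mg5amcGpu::GpuRuntime gpuRuntime( /*debug=*/true ); // gpuSetDevice(0) now, gpuDeviceReset() on scope exit
#endif
  // ... allocate buffers, generate momenta, compute matrix elements ...
  return 0;
}
]
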
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. 
Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -292,7 +293,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +301,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +309,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +317,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +334,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +343,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +352,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +360,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +368,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -403,7 +404,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -421,7 +422,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, 
devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? 
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk index a0397e9ecc..bcb73d7f01 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,69 +89,139 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of NVCC + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
+ MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! + CUOPTFLAGS = -lineinfo + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + + CUDATESTFLAGS = -lcuda + + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported)
+  GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+
+  # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+  ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+    GPUFLAGS += -allow-unsupported-compiler
+  endif
+
+else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc)
+  #=== Configure the HIP compiler
+
+  # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable HIP builds (issue #505)
+  # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below
+  ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
+    $(warning HIP builds are not supported for multi-word CXX "$(CXX)")
+    override HIP_HOME=disabled
+  endif
+
+  # If HIP_HOME is not set, try to set it from the location of GPUCC
+  ifndef HIP_HOME
+    HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH))
+    $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+  endif
-# Set the host C++ compiler for nvcc via "-ccbin "
-# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+  # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists
+  ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
+    GPUCC = $(HIP_HOME)/bin/hipcc
+
+    # Should maybe find something equivalent to this in HIP
+    #USE_NVTX ?=-DUSE_NVTX
+
+    HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+    HIPINC = -I$(HIP_HOME)/include/
+
+    # -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP
+    # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+    GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+    ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+    GPUFLAGS += -std=c++17
+    # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+
+    CUBUILDRULEFLAGS = -fPIC -c
+    CCBUILDRULEFLAGS = -fPIC -c
+
+  else ifneq ($(origin REQUIRE_HIP),undefined)
+    # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443)
+    $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
+  else
+    # No hip.
Switch hip compilation off and go to common random numbers in C++ + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -163,9 +233,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif @@ -189,7 +259,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -199,10 +269,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -253,7 +323,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -328,13 +401,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -343,7 +416,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += 
-DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -352,7 +425,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -404,11 +477,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -421,7 +494,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -452,15 +525,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -469,11 +543,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -489,10 +566,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -500,8 +577,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -528,7 +605,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -540,11 +617,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -561,16 +638,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -596,17 +673,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -618,7 +695,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -631,7 +708,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -643,12 +720,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link 
with GPUCC (undefined reference to `__svml_cos4_l9')
 else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc
 endif
@@ -672,14 +749,14 @@ $(testmain): LIBFLAGS += -lgomp
 endif
 endif

-ifeq ($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
 $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
 	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
 else # link both runTest.o and runTest_cu.o
 $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
 endif

 # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215
@@ -782,9 +859,9 @@ ifeq ($(USECCACHE),1)
 	ccache --version | head -1
 endif
 	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
 endif
 	@echo ""
 	@echo CXX=$(CXX)
@@ -803,7 +880,7 @@ endif

 # Target: check (run the C++ test executable)
 # [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 check: runTest cmpFcheck cmpFGcheck
 else
 check: runTest cmpFcheck
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc
index f93c05b0b3..2b956730d4 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

 extern "C"
 {
@@ -22,7 +22,7 @@ extern "C"
  * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
  * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
  */
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using namespace mg5amcGpu;
 #else
 using namespace mg5amcCpu;
@@ -46,8 +46,8 @@ extern "C"
  */
 void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
 {
-#ifdef __CUDACC__
-  CudaRuntime::setUp();
+#ifdef MGONGPUCPP_GPUIMPL
+  GpuRuntime::setUp();
 #endif
   // Create a process object, read parm card and set parameters
   // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
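fbridgecreate_ and fbridgedelete_ now bracket the GPU session through GpuRuntime::setUp() and GpuRuntime::tearDown(). The GpuRuntime class itself is not shown in these hunks; a minimal sketch of the idea (an assumed shape, not the verbatim plugin code; only cudaSetDevice/cudaDeviceReset and their hip* counterparts are real API):

  // Assumed sketch of GpuRuntime: one type fronting both runtimes.
  #if defined( __CUDACC__ )
  #include <cuda_runtime.h>
  #elif defined( __HIPCC__ )
  #include <hip/hip_runtime.h>
  #endif
  struct GpuRuntime
  {
    static void setUp()
    {
  #if defined( __CUDACC__ )
      cudaSetDevice( 0 ); // the first CUDA call creates the device context
  #elif defined( __HIPCC__ )
      hipSetDevice( 0 ); // same role on AMD GPUs
  #endif
    }
    static void tearDown()
    {
  #if defined( __CUDACC__ )
      cudaDeviceReset(); // e.g. needed by cuda-memcheck --leak-check full
  #elif defined( __HIPCC__ )
      hipDeviceReset();
  #endif
    }
  };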
@@ -69,8 +69,8 @@ extern "C"
   Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
   if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" );
   delete pbridge;
-#ifdef __CUDACC__
-  CudaRuntime::tearDown();
+#ifdef MGONGPUCPP_GPUIMPL
+  GpuRuntime::tearDown();
 #endif
 }

@@ -100,7 +100,7 @@ extern "C"
 {
   Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
   if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // Use the device/GPU implementation in the CUDA library
   // (there is also a host implementation in this library)
   pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol );
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc
index 2fb445372d..3743934f41 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #include "mgOnGpuConfig.h"

@@ -13,7 +13,7 @@

 //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -40,7 +40,7 @@ namespace mg5amcCpu
 private:
   const int m_nevt; // The number of events in each iteration
   int m_iiter; // The iteration counter (for random number seeding)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
   HostBufferMomenta m_hstMomenta; // Memory buffers for momenta
   HostBufferWeights m_hstWeights; // Memory buffers for sampling weights
@@ -105,7 +105,7 @@ namespace mg5amcCpu
 extern "C"
 {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   using namespace mg5amcGpu;
 #else
   using namespace mg5amcCpu;
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc
index 572e28aaea..461ec5c3a5 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
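For orientation, this is the calling sequence that the Fortran side is expected to follow through the C wrappers above (a hypothetical C++ driver standing in for the Fortran program; the buffer sizes are arbitrary and CppObjectInFortran is treated here as an opaque handle):

  // Hypothetical driver mimicking the Fortran call sequence (illustration only).
  extern "C"
  {
    struct CppObjectInFortran; // opaque handle, as seen from Fortran
    void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F );
    void fbridgedelete_( CppObjectInFortran** ppbridge );
  }
  int main()
  {
    CppObjectInFortran* bridge = nullptr;
    const int nevt = 16, npar = 4, np4 = 4;
    fbridgecreate_( &bridge, &nevt, &npar, &np4 ); // calls GpuRuntime::setUp() in GPU builds
    // ... fbridgesequence_( ... ) would compute the matrix elements here ...
    fbridgedelete_( &bridge ); // calls GpuRuntime::tearDown() in GPU builds
    return 0;
  }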
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc index 989aba1fdc..2bd7a9fcf9 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc index a66e595176..ac9c95d539 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h index 5e79643e1c..c8cb1ed767 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc index fcf1b455b9..3741400a1c 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h index 6d56738204..689ae12a60 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h @@ -222,7 +222,7 @@ namespace Parameters_heft_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -273,7 +273,7 @@ namespace Parameters_heft_dependentCouplings // End non-SM (e.g. 
EFT) implementation - special handling of vectors of floats (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -289,7 +289,7 @@ namespace Parameters_heft_independentCouplings //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h index 6c0c4919e9..d4e37d19b3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,13 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +65,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +102,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif 
#endif

@@ -131,7 +153,7 @@ namespace mgOnGpu
   // Alignment requirement for using reinterpret_cast with SIMD vectorized code
   // (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
   // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
 #endif

@@ -142,7 +164,7 @@
 using mgOnGpu::fptype;
 using mgOnGpu::fptype2;

 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -172,9 +194,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
 #if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -186,8 +208,8 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */

-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
 #define __global__
 #define __host__
 #define __device__
diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h
index 0cb2f1db7e..4e7ab03fa2 100644
--- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
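Taken together, the mgOnGpuConfig.h changes above mean that downstream code needs a single guard instead of paired __CUDACC__/__HIPCC__ checks. A minimal translation unit illustrating the three build modes (a sketch; it assumes mgOnGpuConfig.h is on the include path):

  // Compile with nvcc, hipcc or a plain C++ compiler: one guard covers all three.
  #include "mgOnGpuConfig.h"
  #include <iostream>
  int main()
  {
  #ifdef MGONGPUCPP_GPUIMPL
    std::cout << "GPU build (CUDA or HIP)" << std::endl;
  #else
    std::cout << "CPU build (C++, possibly SIMD)" << std::endl;
  #endif
    return 0;
  }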
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h index a1cde16a67..6f6cee64d6 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
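Since HIP has neither thrust::complex nor cuComplex in this setup, cxsmpl becomes the only complex type available there. The idea behind cxsmpl, in a deliberately reduced sketch (the real template in mgOnGpuCxtypes.h defines the full operator set and device annotations):

  // Reduced sketch of a cxsmpl-like type: a trivially-copyable complex
  // that compiles identically for CUDA, HIP and C++ builds.
  template<typename FP>
  struct cxsmpl_sketch
  {
    FP r, i;
    constexpr cxsmpl_sketch( FP r_, FP i_ = FP( 0 ) ) : r( r_ ), i( i_ ) {}
    constexpr FP real() const { return r; }
    constexpr FP imag() const { return i; }
  };
  template<typename FP>
  constexpr cxsmpl_sketch<FP> operator*( const cxsmpl_sketch<FP>& a, const cxsmpl_sketch<FP>& b )
  {
    return cxsmpl_sketch<FP>( a.r * b.r - a.i * b.i, a.r * b.i + a.i * b.r );
  }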
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h index 9d3e82b1e3..7904b93c61 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/heft_gg_h.sa/src/rambo.h b/epochX/cudacpp/heft_gg_h.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/rambo.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
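// The scalar-or-vector typedefs above (bool_sv, fptype_sv, cxtype_sv) are what
// let one kernel body serve both backends: on the GPU each thread handles a
// single event, while a SIMD C++ build processes neppV events in lockstep.
// A short sketch of the pattern (the function and its name are illustrative
// assumptions; the typedefs and the cxternary/cxzero_sv helpers come from
// mgOnGpuVectors.h as shown above, where cxternary is assumed to also have a
// vector overload in SIMD builds):
#include "mgOnGpuVectors.h"

__host__ __device__ inline cxtype_sv
maskCoupling_sv( const bool_sv& mask, const cxtype_sv& g )
{
  // On the GPU, mask is a plain bool and this is an if/else; in a SIMD build
  // it is a per-lane mask and cxternary blends the neppV lanes branch-free.
  return cxternary( mask, g, cxzero_sv() );
}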
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) From 1050176e78c04fc35feea3c8963fed7739441feb Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 19 Jul 2023 09:11:18 +0200 Subject: [PATCH 386/509] [jthip] rerun 78 tput alltees (for CUDA/C++) after including HIP, all looks ok STARTED AT Tue Jul 18 07:35:57 PM CEST 2023 ./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean ENDED(1) AT Tue Jul 18 10:55:48 PM CEST 2023 [Status=0] ./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean ENDED(2) AT Tue Jul 18 11:19:50 PM CEST 2023 [Status=0] ./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg -ggttggg -flt -bridge -makeclean ENDED(3) AT Tue Jul 18 11:28:30 PM CEST 2023 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst ENDED(4) AT Tue Jul 18 11:31:25 PM CEST 2023 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -curhst ENDED(5) AT Tue Jul 18 11:34:18 PM CEST 2023 [Status=0] Some minor performance changes also due to itscrd80 vs itscrd90 --- .../log_eemumu_mad_d_inl0_hrd0.txt | 100 ++++++------- .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 100 ++++++------- .../log_eemumu_mad_d_inl0_hrd0_common.txt | 100 ++++++------- .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 100 ++++++------- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 100 ++++++------- .../log_eemumu_mad_d_inl0_hrd1.txt | 100 ++++++------- .../log_eemumu_mad_d_inl1_hrd0.txt | 100 ++++++------- .../log_eemumu_mad_d_inl1_hrd1.txt | 100 ++++++------- .../log_eemumu_mad_f_inl0_hrd0.txt | 104 +++++++------- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 106 +++++++------- .../log_eemumu_mad_f_inl0_hrd0_common.txt | 104 +++++++------- .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 104 +++++++------- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 106 +++++++------- .../log_eemumu_mad_f_inl0_hrd1.txt | 104 +++++++------- .../log_eemumu_mad_f_inl1_hrd0.txt | 104 +++++++------- .../log_eemumu_mad_f_inl1_hrd1.txt | 104 +++++++------- .../log_eemumu_mad_m_inl0_hrd0.txt | 100 ++++++------- .../log_eemumu_mad_m_inl0_hrd1.txt | 100 ++++++------- .../log_ggtt_mad_d_inl0_hrd0.txt | 102 ++++++------- .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 102 ++++++------- .../log_ggtt_mad_d_inl0_hrd0_common.txt | 102 ++++++------- .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 102 ++++++------- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 102 ++++++------- .../log_ggtt_mad_d_inl0_hrd1.txt | 100 ++++++------- .../log_ggtt_mad_d_inl1_hrd0.txt | 100 ++++++------- .../log_ggtt_mad_d_inl1_hrd1.txt | 102 ++++++------- .../log_ggtt_mad_f_inl0_hrd0.txt | 108 +++++++------- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 110 +++++++------- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 108 +++++++------- .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 108 
+++++++------- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 110 +++++++------- .../log_ggtt_mad_f_inl0_hrd1.txt | 104 +++++++------- .../log_ggtt_mad_f_inl1_hrd0.txt | 104 +++++++------- .../log_ggtt_mad_f_inl1_hrd1.txt | 106 +++++++------- .../log_ggtt_mad_m_inl0_hrd0.txt | 102 ++++++------- .../log_ggtt_mad_m_inl0_hrd1.txt | 102 ++++++------- .../log_ggttg_mad_d_inl0_hrd0.txt | 118 +++++++-------- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 118 +++++++-------- .../log_ggttg_mad_d_inl0_hrd1.txt | 120 ++++++++-------- .../log_ggttg_mad_f_inl0_hrd0.txt | 124 ++++++++-------- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 126 ++++++++-------- .../log_ggttg_mad_f_inl0_hrd1.txt | 124 ++++++++-------- .../log_ggttg_mad_m_inl0_hrd0.txt | 124 ++++++++-------- .../log_ggttg_mad_m_inl0_hrd1.txt | 122 ++++++++-------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 120 ++++++++-------- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 120 ++++++++-------- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 120 ++++++++-------- .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 120 ++++++++-------- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 120 ++++++++-------- .../log_ggttgg_mad_d_inl0_hrd1.txt | 122 ++++++++-------- .../log_ggttgg_mad_d_inl1_hrd0.txt | 118 +++++++-------- .../log_ggttgg_mad_d_inl1_hrd1.txt | 118 +++++++-------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 124 ++++++++-------- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 126 ++++++++-------- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 124 ++++++++-------- .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt | 124 ++++++++-------- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 126 ++++++++-------- .../log_ggttgg_mad_f_inl0_hrd1.txt | 124 ++++++++-------- .../log_ggttgg_mad_f_inl1_hrd0.txt | 122 ++++++++-------- .../log_ggttgg_mad_f_inl1_hrd1.txt | 122 ++++++++-------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 124 ++++++++-------- .../log_ggttgg_mad_m_inl0_hrd1.txt | 126 ++++++++-------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 120 ++++++++-------- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 120 ++++++++-------- .../log_ggttggg_mad_d_inl0_hrd1.txt | 124 ++++++++-------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 128 ++++++++--------- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 128 ++++++++--------- .../log_ggttggg_mad_f_inl0_hrd1.txt | 134 +++++++++--------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 120 ++++++++-------- .../log_ggttggg_mad_m_inl0_hrd1.txt | 124 ++++++++-------- .../log_gqttq_mad_d_inl0_hrd0.txt | 118 +++++++-------- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 118 +++++++-------- .../log_gqttq_mad_d_inl0_hrd1.txt | 118 +++++++-------- .../log_gqttq_mad_f_inl0_hrd0.txt | 122 ++++++++-------- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 124 ++++++++-------- .../log_gqttq_mad_f_inl0_hrd1.txt | 124 ++++++++-------- .../log_gqttq_mad_m_inl0_hrd0.txt | 120 ++++++++-------- .../log_gqttq_mad_m_inl0_hrd1.txt | 118 +++++++-------- 78 files changed, 4423 insertions(+), 4423 deletions(-) diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 2f9c234dd5..97fdf2746a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-06-16_22:48:21 +DATE: 2023-07-18_22:37:20 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.277441e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.165288e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.736817e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.475387e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.820978e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.778893e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.697989 sec - 2,734,672,671 cycles # 2.891 GHz - 3,867,376,774 instructions # 1.41 insn per cycle - 1.006665473 seconds time elapsed +TOTAL : 0.855531 sec + 2,833,192,783 cycles # 2.924 GHz + 4,325,389,575 instructions # 1.53 insn per cycle + 1.187306501 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.201835e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.490501e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.490501e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.195619e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.419858e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.419858e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.636481 sec - 17,424,894,515 cycles # 3.090 GHz - 41,067,496,174 instructions # 2.36 insn per cycle - 5.642861047 seconds time elapsed +TOTAL : 5.668235 sec + 17,201,910,416 cycles # 3.033 GHz + 40,424,836,344 instructions # 2.35 insn per cycle + 5.676367615 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 
1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.047757e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.152885e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.152885e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.110803e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.016058e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.016058e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.473381 sec - 10,707,811,255 cycles # 3.080 GHz - 25,328,572,543 instructions # 2.37 insn per cycle - 3.485749163 seconds time elapsed +TOTAL : 3.385940 sec + 10,308,474,989 cycles # 3.040 GHz + 24,683,556,153 instructions # 2.39 insn per cycle + 3.396019895 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1283) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.975289e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.963650e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.963650e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.235785e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.797359e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.797359e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.514299 sec - 7,529,377,193 cycles # 2.995 GHz - 14,324,765,141 instructions # 1.90 insn per cycle - 2.525874425 seconds time elapsed +TOTAL : 2.343102 sec + 6,894,343,443 cycles # 2.936 GHz + 13,677,655,108 instructions # 1.98 insn per cycle + 2.351884504 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.078510e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.389306e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.389306e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.391389e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.266576e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.266576e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.437364 sec - 7,311,719,824 cycles # 2.995 GHz - 14,031,232,859 instructions # 1.92 insn per cycle - 2.448705799 seconds time elapsed +TOTAL : 2.252146 sec + 6,641,349,095 cycles # 2.944 GHz + 13,370,765,818 instructions # 2.01 insn per cycle + 2.260544049 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.939663e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.739935e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.739935e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.179961e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.588825e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.588825e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.537461 sec - 6,538,056,350 cycles # 2.572 GHz - 10,814,168,036 instructions # 1.65 insn per cycle - 2.543993268 seconds time elapsed +TOTAL : 2.381395 sec + 5,905,623,572 cycles # 2.474 GHz + 10,160,843,578 instructions # 1.72 insn per cycle + 2.390440152 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 289) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 413524a714..328f839ecf 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -35,26 +35,26 @@ 
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-06-16_23:18:49 +DATE: 2023-07-18_23:22:26 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.138466e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.761625e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.761625e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.629780e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.551524e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.551524e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.435025 sec - 8,057,144,700 cycles # 2.996 GHz - 13,634,238,127 instructions # 1.69 insn per cycle - 2.748108130 seconds time elapsed +TOTAL : 2.253250 sec + 7,452,417,366 cycles # 2.981 GHz + 13,168,688,030 instructions # 1.77 insn per cycle + 2.555215575 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,19 +72,19 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.146185e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.405096e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.405096e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.150979e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.358888e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.358888e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.106864 sec - 18,682,647,016 cycles # 3.057 GHz - 41,378,608,262 instructions # 2.21 insn per cycle - 6.114253847 seconds time elapsed +TOTAL : 6.071511 sec + 18,441,155,303 cycles # 3.036 GHz + 40,655,259,371 instructions # 2.20 insn per cycle + 6.078353672 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -99,19 +99,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.923353e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.849109e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.849109e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.945018e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.698668e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.698668e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.885345 sec - 11,937,212,794 cycles # 3.069 GHz - 26,176,863,335 instructions # 2.19 insn per cycle - 3.901575111 seconds time elapsed +TOTAL : 3.846123 sec + 11,536,385,327 cycles # 2.995 GHz + 25,528,557,805 instructions # 2.21 insn per cycle + 3.852649513 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1283) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -126,19 +126,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.738552e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.036567e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.036567e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.922595e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.906457e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.906457e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.911902 sec - 8,807,623,027 cycles # 3.018 GHz - 15,689,801,673 instructions # 1.78 insn per cycle - 2.919670444 seconds time elapsed +TOTAL : 2.761542 sec + 8,147,443,517 cycles # 2.944 GHz + 15,038,071,882 instructions # 1.85 insn per cycle + 2.768392811 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -153,19 +153,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.823116e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.286837e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.286837e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.016753e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.161698e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.161698e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.843015 sec - 8,509,599,499 cycles # 2.988 GHz - 15,397,182,071 instructions # 1.81 insn per cycle - 2.850349634 seconds time elapsed +TOTAL : 2.698772 sec + 7,885,285,527 cycles # 2.915 GHz + 14,731,068,956 instructions # 1.87 insn per cycle + 2.705638196 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -180,19 +180,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.546825e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.539933e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.539933e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.767252e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.506464e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.506464e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.102839 sec - 7,964,585,861 cycles # 2.565 GHz - 11,966,072,331 instructions # 1.50 insn per cycle - 3.117079564 seconds time elapsed +TOTAL : 2.899290 sec + 7,240,703,756 cycles # 2.492 GHz + 11,306,846,019 instructions # 1.56 insn per cycle + 2.906107924 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 289) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index f307a0f66d..8fcaa402a4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-06-16_23:31:11 +DATE: 2023-07-18_23:34:26 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.936041e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.298657e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.700721e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.662259e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.218708e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.733644e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.341897 sec - 4,726,189,877 cycles # 2.972 GHz - 6,969,421,973 instructions # 1.47 insn per cycle - 1.646460673 seconds time elapsed +TOTAL : 1.334692 sec + 4,639,793,652 cycles # 2.967 GHz + 7,129,897,248 instructions # 1.54 insn per cycle + 1.621833847 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.210740e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.496812e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.496812e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.191263e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.421752e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.421752e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.955030 sec - 18,512,766,741 cycles # 3.107 GHz - 41,194,403,666 instructions # 2.23 insn per cycle - 5.961394817 seconds time elapsed +TOTAL : 6.042662 sec + 18,302,456,063 cycles # 3.028 GHz + 40,527,734,047 instructions # 2.21 insn per cycle + 6.047705174 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -89,19 
+89,19 @@ Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.047034e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.131905e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.131905e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.100324e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.996340e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.996340e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.837203 sec - 11,777,572,402 cycles # 3.067 GHz - 25,355,656,397 instructions # 2.15 insn per cycle - 3.848753687 seconds time elapsed +TOTAL : 3.755609 sec + 11,392,550,668 cycles # 3.030 GHz + 24,688,239,473 instructions # 2.17 insn per cycle + 3.760743822 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1283) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.018909e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.963202e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.963202e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.232926e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.836224e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.836224e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.834606 sec - 8,590,828,376 cycles # 3.027 GHz - 14,249,676,242 instructions # 1.66 insn per cycle - 2.846836909 seconds time elapsed +TOTAL : 2.705588 sec + 7,975,874,619 cycles # 2.944 GHz + 13,582,310,666 instructions # 1.70 insn per cycle + 2.710967931 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.112124e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.390077e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.390077e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.347476e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.167763e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.167763e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.779947 sec - 8,413,366,977 cycles # 3.021 GHz - 13,755,530,918 instructions # 1.63 insn per cycle - 2.792283352 seconds time elapsed +TOTAL : 2.646547 sec + 7,739,527,075 cycles # 2.920 GHz + 13,072,506,268 instructions # 1.69 insn per cycle + 2.652061733 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.941539e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.752941e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.752941e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.143547e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.489365e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.489365e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.901859 sec - 7,689,553,685 cycles # 2.645 GHz - 10,538,700,131 instructions # 1.37 insn per cycle - 2.908219333 seconds time elapsed +TOTAL : 2.766665 sec + 7,042,639,216 cycles # 2.542 GHz + 9,862,282,848 instructions # 1.40 insn per cycle + 2.771823029 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 289) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index c1df1cffcd..f6754ccae5 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -35,22 +35,22 @@ 
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-06-16_23:28:12 +DATE: 2023-07-18_23:31:33 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.961494e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.330926e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.727946e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.662856e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.245166e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.781222e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.991651 sec - 3,630,538,921 cycles # 2.949 GHz - 6,739,620,599 instructions # 1.86 insn per cycle - 1.288650450 seconds time elapsed +TOTAL : 0.978310 sec + 3,590,915,972 cycles # 2.969 GHz + 7,100,949,878 instructions # 1.98 insn per cycle + 1.265914090 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.207504e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.492388e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.492388e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.197592e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.423169e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.423169e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.609999 sec - 17,377,959,437 cycles # 3.096 GHz - 41,067,120,156 instructions # 2.36 insn per cycle - 5.616601968 seconds time elapsed +TOTAL : 5.658752 sec + 17,157,501,803 cycles # 3.031 GHz + 40,423,247,048 instructions # 2.36 insn per cycle + 5.663798783 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.061159e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.155823e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.155823e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.094129e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.984471e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.984471e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.452926 sec - 10,661,248,447 cycles # 3.085 GHz - 25,328,629,608 instructions # 2.38 insn per cycle - 3.465344366 seconds time elapsed +TOTAL : 3.411637 sec + 10,290,438,965 cycles # 3.012 GHz + 24,682,963,306 instructions # 2.40 insn per cycle + 3.416785888 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1283) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.992039e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.943088e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.943088e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.128020e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.666773e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.666773e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.500278 sec - 7,490,696,881 cycles # 2.992 GHz - 14,324,115,086 instructions # 1.91 insn per cycle - 2.512588472 seconds time elapsed +TOTAL : 2.417697 sec + 6,930,904,304 cycles # 2.863 GHz + 13,677,888,049 instructions # 1.97 insn per cycle + 2.422886649 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.0277089312025782e-08 OK (relative 
difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.099847e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.437862e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.437862e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.358143e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.166519e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.166519e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.420313 sec - 7,280,482,659 cycles # 3.003 GHz - 14,031,142,533 instructions # 1.93 insn per cycle - 2.426602182 seconds time elapsed +TOTAL : 2.270607 sec + 6,657,046,197 cycles # 2.926 GHz + 13,381,817,915 instructions # 2.01 insn per cycle + 2.276064035 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.913808e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.700903e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.700903e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.165935e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.563823e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.563823e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.559114 sec - 6,555,384,580 cycles # 2.558 GHz - 10,814,650,468 instructions # 1.65 insn per cycle - 2.565400113 seconds time elapsed +TOTAL : 2.391095 sec + 5,900,007,770 cycles # 2.464 GHz + 10,160,901,024 instructions # 1.72 insn per cycle + 2.396646480 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 289) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 6e1b117ddd..2691433432 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ 
b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -35,23 +35,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-06-16_23:25:12 +DATE: 2023-07-18_23:28:38 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.189825e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.289841e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.647085e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.016534e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.173006e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.612395e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.074835 sec - 6,914,545,522 cycles # 2.988 GHz - 11,919,156,975 instructions # 1.72 insn per cycle - 2.372897157 seconds time elapsed +TOTAL : 1.898785 sec + 6,257,438,480 cycles # 2.941 GHz + 11,386,458,619 instructions # 1.82 insn per cycle + 2.187295422 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 @@ -65,19 +65,19 @@ Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.202902e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.488452e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.488452e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.194169e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.419402e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.419402e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.632592 sec - 17,383,031,697 cycles # 3.085 GHz - 41,067,827,351 instructions # 2.36 insn per cycle - 5.638706459 seconds time elapsed +TOTAL : 5.674598 sec + 17,158,940,169 cycles # 3.023 GHz + 40,423,706,090 instructions # 2.36 insn per cycle + 5.679802244 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -91,19 +91,19 @@ Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.068552e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.168923e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.168923e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.101127e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.993080e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.993080e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.443621 sec - 10,653,431,550 cycles # 3.090 GHz - 25,328,207,820 instructions # 2.38 insn per cycle - 3.459263800 seconds time elapsed +TOTAL : 3.399462 sec + 10,311,897,646 cycles # 3.031 GHz + 24,682,892,190 instructions # 2.39 insn per cycle + 3.404932887 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1283) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -117,19 +117,19 @@ Relative difference = 1.0277102699700292e-08 OK (relative difference <= 
5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.993581e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.951966e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.951966e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.251403e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.859944e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.859944e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.497810 sec - 7,472,614,101 cycles # 2.988 GHz - 14,326,081,552 instructions # 1.92 insn per cycle - 2.513276024 seconds time elapsed +TOTAL : 2.331865 sec + 6,864,953,180 cycles # 2.939 GHz + 13,677,506,771 instructions # 1.99 insn per cycle + 2.336973372 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -143,19 +143,19 @@ Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.123387e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.389805e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.389805e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.394665e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.273811e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.273811e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.406381 sec - 7,220,335,602 cycles # 2.995 GHz - 14,031,141,989 instructions # 1.94 insn per cycle - 2.412823702 seconds time elapsed +TOTAL : 2.250085 sec + 6,619,521,580 cycles # 2.937 GHz + 13,370,614,673 instructions # 2.02 insn per cycle + 2.255242606 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -169,19 +169,19 @@ Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe 
-p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.914294e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.704937e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.704937e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.140322e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.487494e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.487494e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.563138 sec - 6,557,308,012 cycles # 2.558 GHz - 10,815,146,087 instructions # 1.65 insn per cycle - 2.575443013 seconds time elapsed +TOTAL : 2.405074 sec + 5,909,857,512 cycles # 2.453 GHz + 10,161,683,132 instructions # 1.72 insn per cycle + 2.410495582 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 289) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index bd06bd6ba5..4d453aecee 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-06-16_22:48:51 +DATE: 2023-07-18_22:37:49 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.643941e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.489939e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.087323e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.573344e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.403737e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.093659e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.690482 sec - 2,734,951,372 cycles # 2.921 GHz - 3,869,487,763 instructions # 1.41 insn per cycle - 1.001478108 seconds time elapsed +TOTAL : 0.706305 sec + 2,778,015,010 cycles # 2.947 GHz + 4,218,331,761 instructions # 1.52 insn per cycle + 1.008727837 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 118 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 1.027708011645137e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.201121e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.486695e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.486695e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.200514e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.426217e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.426217e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.640974 sec - 17,377,915,107 cycles # 3.080 GHz - 41,019,735,572 instructions # 2.36 insn per cycle - 5.647089750 seconds time elapsed +TOTAL : 5.643058 sec + 17,154,898,222 cycles # 3.038 GHz + 40,369,035,860 instructions # 2.35 insn per cycle + 5.649770482 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 362) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 
1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.055579e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.158330e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.158330e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.091169e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.980266e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.980266e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.461985 sec - 10,655,685,663 cycles # 3.075 GHz - 25,289,974,301 instructions # 2.37 insn per cycle - 3.474152122 seconds time elapsed +TOTAL : 3.416079 sec + 10,310,582,032 cycles # 3.014 GHz + 24,644,581,740 instructions # 2.39 insn per cycle + 3.422889711 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1270) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.953920e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.887932e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.887932e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.143094e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.628947e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.628947e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.529478 sec - 7,488,504,684 cycles # 2.955 GHz - 14,297,973,959 instructions # 1.91 insn per cycle - 2.535527552 seconds time elapsed +TOTAL : 2.411210 sec + 6,890,682,089 cycles # 2.851 GHz + 13,652,338,335 instructions # 1.98 insn per cycle + 2.418474908 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1043) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.0277088906338675e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.063577e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.335894e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.335894e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.360833e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.207381e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.207381e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.447618 sec - 7,308,425,300 cycles # 2.980 GHz - 14,017,785,626 instructions # 1.92 insn per cycle - 2.453901778 seconds time elapsed +TOTAL : 2.268507 sec + 6,671,924,337 cycles # 2.934 GHz + 13,357,349,040 instructions # 2.00 insn per cycle + 2.275507029 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1004) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 1.0277088906338675e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.005001e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.053828e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.053828e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.271910e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.888847e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.888847e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.492808 sec - 6,433,917,466 cycles # 2.577 GHz - 10,696,732,836 instructions # 1.66 insn per cycle - 2.504690032 seconds time elapsed +TOTAL : 2.325808 sec + 5,802,426,408 cycles # 2.490 GHz + 10,040,622,291 instructions # 1.73 insn per cycle + 2.332346097 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 267) (512y: 0) (512z: 663) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 4005df7354..565e7311af 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: 
Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-06-16_23:08:58 +DATE: 2023-07-18_23:12:58 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.934597e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.349196e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.751072e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.619901e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.240858e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.771999e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.703650 sec - 2,783,628,482 cycles # 2.937 GHz - 3,817,522,930 instructions # 1.37 insn per cycle - 1.006043909 seconds time elapsed +TOTAL : 0.686369 sec + 2,716,938,611 cycles # 2.937 GHz + 4,035,248,213 instructions # 1.49 insn per cycle + 0.984077819 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.627042e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.555043e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.555043e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.794717e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.454295e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.454295e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.809092 sec - 8,529,730,783 cycles # 3.037 GHz - 17,314,493,447 instructions # 2.03 insn per cycle - 2.815992048 seconds time elapsed +TOTAL : 2.659170 sec + 7,973,174,973 cycles # 2.994 GHz + 16,667,372,696 instructions # 2.09 insn per cycle + 2.664809953 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 206) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe @@ -89,19 
+89,19 @@ Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.394827e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.052489e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.052489e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.727236e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.881600e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.881600e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.257313 sec - 6,891,555,509 cycles # 3.048 GHz - 13,420,618,567 instructions # 1.95 insn per cycle - 2.263346481 seconds time elapsed +TOTAL : 2.088301 sec + 6,353,709,417 cycles # 3.036 GHz + 12,770,191,691 instructions # 2.01 insn per cycle + 2.093883449 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 809) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.953793e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.186747e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.186747e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.497330e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.177920e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.177920e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.994156 sec - 5,944,600,029 cycles # 2.973 GHz - 10,446,540,517 instructions # 1.76 insn per cycle - 2.006757717 seconds time elapsed +TOTAL : 1.795633 sec + 5,363,007,772 cycles # 2.979 GHz + 9,799,409,716 instructions # 1.83 insn per cycle + 1.801392342 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 460) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.165108e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.323250e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.323250e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.631669e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.299199e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.299199e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.906328 sec - 5,747,679,195 cycles # 3.008 GHz - 10,324,227,604 instructions # 1.80 insn per cycle - 1.912748843 seconds time elapsed +TOTAL : 1.756078 sec + 5,218,021,073 cycles # 2.964 GHz + 9,661,166,589 instructions # 1.85 insn per cycle + 1.761517181 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 435) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.709510e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.834439e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.834439e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.226821e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.881135e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.881135e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.101712 sec - 5,645,784,770 cycles # 2.680 GHz - 9,348,170,561 instructions # 1.66 insn per cycle - 2.114432099 seconds time elapsed +TOTAL : 1.889106 sec + 4,991,885,143 cycles # 2.638 GHz + 8,694,348,991 instructions # 1.74 insn per cycle + 1.894628100 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 221) (512y: 0) (512z: 276) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index bc4e48c353..09c8b4ac31 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to 
be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-06-16_23:09:22 +DATE: 2023-07-18_23:13:20 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.022137e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.034901e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.081788e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.724285e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.937249e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.086399e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.701380 sec - 2,726,534,804 cycles # 2.886 GHz - 3,786,125,853 instructions # 1.39 insn per cycle - 1.004213223 seconds time elapsed +TOTAL : 0.674219 sec + 2,690,679,134 cycles # 2.960 GHz + 4,101,520,560 instructions # 1.52 insn per cycle + 0.968706738 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 118 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 1.027708011645137e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.510910e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.966131e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.966131e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.820226e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.697359e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.697359e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.194238 sec - 6,733,882,413 cycles # 3.062 GHz - 13,573,888,806 instructions # 2.02 insn per cycle - 2.200295951 seconds time elapsed +TOTAL : 2.052669 sec + 6,223,903,279 cycles # 3.025 GHz + 12,927,789,626 instructions # 2.08 insn per cycle + 2.058229844 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 176) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe @@ -89,19 +89,19 @@ 
Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.963639e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.196983e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.196983e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.347316e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.157252e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.157252e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.987100 sec - 6,108,038,272 cycles # 3.066 GHz - 11,421,394,512 instructions # 1.87 insn per cycle - 1.998910641 seconds time elapsed +TOTAL : 1.858729 sec + 5,557,210,569 cycles # 2.981 GHz + 10,775,548,998 instructions # 1.94 insn per cycle + 1.864742495 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 609) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.184988e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.401255e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.401255e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.783880e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.401463e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.401463e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.904581 sec - 5,674,452,757 cycles # 2.972 GHz - 9,756,418,512 instructions # 1.72 insn per cycle - 1.916141098 seconds time elapsed +TOTAL : 1.713600 sec + 5,108,676,399 cycles # 2.973 GHz + 9,109,861,348 instructions # 1.78 insn per cycle + 1.719071566 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 365) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.0277088906338675e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.421355e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.712475e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.712475e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.121890e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.674799e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.674799e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.818113 sec - 5,526,907,542 cycles # 3.034 GHz - 9,745,448,132 instructions # 1.76 insn per cycle - 1.824604908 seconds time elapsed +TOTAL : 1.623052 sec + 4,855,321,150 cycles # 2.984 GHz + 9,083,995,305 instructions # 1.87 insn per cycle + 1.628469471 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 356) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 1.0277088906338675e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.924922e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.119550e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.119550e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.385628e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.089039e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.089039e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.000527 sec - 5,531,314,603 cycles # 2.758 GHz - 9,060,806,562 instructions # 1.64 insn per cycle - 2.012991860 seconds time elapsed +TOTAL : 1.835801 sec + 4,894,193,139 cycles # 2.659 GHz + 8,406,937,984 instructions # 1.72 insn per cycle + 1.841151608 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 189) (512y: 0) (512z: 227) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 998d7298b5..a5ef853643 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to 
be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-06-16_22:49:20 +DATE: 2023-07-18_22:38:16 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=2, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.622816e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.472400e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.786994e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.098638e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.184721e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.780918e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0 -TOTAL : 0.583321 sec - 2,430,104,692 cycles # 2.932 GHz - 3,363,264,752 instructions # 1.38 insn per cycle - 0.886216701 seconds time elapsed +TOTAL : 0.589536 sec + 2,402,233,552 cycles # 2.943 GHz + 3,672,855,124 instructions # 1.53 insn per cycle + 0.878253042 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 96 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,24 +58,24 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112026909366E-002 -Relative difference = 7.173898182689807e-06 +Avg ME (F77/CUDA) = 1.2828112108763889E-002 +Relative difference = 7.180279099086847e-06 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.264259e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.523188e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.523188e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.216373e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.458660e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.458660e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 5.337690 sec - 16,489,915,021 cycles # 
3.088 GHz - 40,104,655,673 instructions # 2.43 insn per cycle - 5.343808523 seconds time elapsed +TOTAL : 5.536943 sec + 16,849,697,016 cycles # 3.042 GHz + 40,090,354,034 instructions # 2.38 insn per cycle + 5.543711092 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 368) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 1.500049293219082e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.262321e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.200988e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.200988e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.140810e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.847975e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.847975e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.286472 sec - 7,103,303,250 cycles # 3.101 GHz - 16,746,623,366 instructions # 2.36 insn per cycle - 2.292241843 seconds time elapsed +TOTAL : 2.367678 sec + 7,192,962,805 cycles # 3.034 GHz + 16,727,490,778 instructions # 2.33 insn per cycle + 2.374359353 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.8113554068418534e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.581033e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.234851e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.234851e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.589522e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.217101e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.217101e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.728168 sec - 5,223,225,548 cycles # 3.015 GHz - 10,646,468,952 instructions # 2.04 insn per cycle - 1.739591338 seconds time elapsed +TOTAL : 1.730753 sec + 5,165,936,826 cycles # 2.979 GHz + 10,630,035,742 instructions # 2.06 insn per cycle + 1.737116155 seconds time 
elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1140) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.727128e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.320782e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.320782e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.709413e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.299300e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.299300e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.685516 sec - 5,127,500,119 cycles # 3.035 GHz - 10,500,102,407 instructions # 2.05 insn per cycle - 1.698440130 seconds time elapsed +TOTAL : 1.696753 sec + 5,075,364,220 cycles # 2.984 GHz + 10,482,055,984 instructions # 2.07 insn per cycle + 1.703463309 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1092) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.545044e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.174805e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.174805e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.507095e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.144049e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.144049e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.747234 sec - 4,737,364,431 cycles # 2.712 GHz - 8,948,917,615 instructions # 1.89 insn per cycle - 1.753348775 seconds time elapsed +TOTAL : 1.759221 sec + 4,700,239,186 cycles # 2.667 GHz + 8,928,497,530 instructions # 1.90 insn per cycle + 1.766112379 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 408) (512y: 0) (512z: 710) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index eea602cb6a..91a6918e52 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -35,9 +35,9 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-06-16_23:19:22 +DATE: 2023-07-18_23:22:57 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -51,17 +51,17 @@ WARNING! flagging abnormal ME for ievt=66427 WARNING! flagging abnormal ME for ievt=465318 WARNING! flagging abnormal ME for ievt=458848 WARNING! flagging abnormal ME for ievt=247522 -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=7, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.151373e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.051120e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.051120e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371709e-02 +- 3.270386e-06 ) GeV^0 -TOTAL : 1.716284 sec - 5,788,689,598 cycles # 2.965 GHz - 10,066,532,076 instructions # 1.74 insn per cycle - 2.010969778 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.181850e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.493425e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.493425e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371709e-02 +- 3.270385e-06 ) GeV^0 +TOTAL : 1.690119 sec + 5,683,014,469 cycles # 2.962 GHz + 10,259,149,546 instructions # 1.81 insn per cycle + 1.975831258 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -73,8 +73,8 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112026909366E-002 -Relative difference = 7.173898182689807e-06 +Avg ME (F77/CUDA) = 1.2828112108763889E-002 +Relative difference = 7.180279099086847e-06 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= @@ -85,19 +85,19 @@ WARNING! flagging abnormal ME for ievt=152898 WARNING! flagging abnormal ME for ievt=66427 WARNING! flagging abnormal ME for ievt=164749 WARNING! flagging abnormal ME for ievt=247522 -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.231634e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.480435e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.480435e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.184677e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.415360e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.415360e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 5.578840 sec - 17,225,374,898 cycles # 3.086 GHz - 40,276,553,142 instructions # 2.34 insn per cycle - 5.585821624 seconds time elapsed +TOTAL : 5.784130 sec + 17,482,262,914 cycles # 3.020 GHz + 40,239,314,516 instructions # 2.30 insn per cycle + 5.790365098 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 368) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -118,19 +118,19 @@ WARNING! flagging abnormal ME for ievt=152898 WARNING! flagging abnormal ME for ievt=66427 WARNING! flagging abnormal ME for ievt=164749 WARNING! 
flagging abnormal ME for ievt=247522 -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.937007e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.345782e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.345782e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.934835e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.216064e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.216064e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.639278 sec - 7,955,090,673 cycles # 3.008 GHz - 18,082,112,251 instructions # 2.27 insn per cycle - 2.646477973 seconds time elapsed +TOTAL : 2.629008 sec + 7,923,607,448 cycles # 3.009 GHz + 18,063,565,877 instructions # 2.28 insn per cycle + 2.635314179 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -149,19 +149,19 @@ WARNING! flagging abnormal ME for ievt=53874 WARNING! flagging abnormal ME for ievt=66427 WARNING! flagging abnormal ME for ievt=164749 WARNING! flagging abnormal ME for ievt=247522 -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.189753e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.014027e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.014027e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.187074e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.897049e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.897049e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.979778 sec - 5,995,131,291 cycles # 3.020 GHz - 11,763,127,657 instructions # 1.96 insn per cycle - 1.992584680 seconds time elapsed +TOTAL : 1.984011 sec + 5,882,038,038 cycles # 2.956 GHz + 11,750,113,647 instructions # 2.00 insn per cycle + 1.990516269 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1140) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -180,19 +180,19 @@ WARNING! flagging abnormal ME for ievt=53874 WARNING! flagging abnormal ME for ievt=66427 WARNING! flagging abnormal ME for ievt=164749 WARNING! 
flagging abnormal ME for ievt=247522 -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.319958e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.080076e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.080076e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.292699e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.044668e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.044668e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.934129 sec - 5,894,246,896 cycles # 3.038 GHz - 11,620,616,390 instructions # 1.97 insn per cycle - 1.941396703 seconds time elapsed +TOTAL : 1.947459 sec + 5,799,609,939 cycles # 2.970 GHz + 11,601,900,460 instructions # 2.00 insn per cycle + 1.953545312 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1092) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -211,19 +211,19 @@ WARNING! flagging abnormal ME for ievt=53874 WARNING! flagging abnormal ME for ievt=66427 WARNING! flagging abnormal ME for ievt=164749 WARNING! flagging abnormal ME for ievt=247522 -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.141992e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.375055e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.375055e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.075696e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.092017e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.092017e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.999503 sec - 5,512,078,856 cycles # 2.749 GHz - 10,155,647,564 instructions # 1.84 insn per cycle - 2.016503539 seconds time elapsed +TOTAL : 2.023993 sec + 5,468,734,972 cycles # 2.695 GHz + 10,135,139,968 instructions # 1.85 insn per cycle + 2.029970771 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 408) (512y: 0) (512z: 710) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 800a4b8c86..bbb093b510 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-06-16_23:31:42 +DATE: 2023-07-18_23:34:56 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.394086e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.330155e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.707146e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.336297e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.304968e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.746760e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.184633 sec - 4,183,254,985 cycles # 2.960 GHz - 6,293,699,635 instructions # 1.50 insn per cycle - 1.472409896 seconds time elapsed +TOTAL : 1.172878 sec + 4,104,924,338 cycles # 2.941 GHz + 6,526,677,484 instructions # 1.59 insn per cycle + 1.452215524 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 96 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,24 +58,24 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112026909366E-002 -Relative difference = 7.173898182689807e-06 +Avg ME (F77/CUDA) = 1.2828112108763889E-002 +Relative difference = 7.180279099086847e-06 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.262596e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.522021e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.522021e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.207988e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.449886e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.449886e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270265e-06 ) GeV^0 -TOTAL : 5.659502 sec - 17,477,401,058 
cycles # 3.088 GHz - 40,266,459,883 instructions # 2.30 insn per cycle - 5.665157466 seconds time elapsed +TOTAL : 5.909710 sec + 17,837,688,263 cycles # 3.016 GHz + 40,273,572,845 instructions # 2.26 insn per cycle + 5.915116440 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 368) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 1.500049293219082e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.236448e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.142806e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.142806e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.046412e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.668762e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.668762e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270265e-06 ) GeV^0 -TOTAL : 2.618721 sec - 8,103,544,055 cycles # 3.089 GHz - 16,832,855,817 instructions # 2.08 insn per cycle - 2.624569274 seconds time elapsed +TOTAL : 2.778407 sec + 8,186,545,146 cycles # 2.942 GHz + 16,814,006,667 instructions # 2.05 insn per cycle + 2.783352640 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.8113554068418534e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.539437e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.230738e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.230738e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.494077e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.185801e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.185801e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270110e-06 ) GeV^0 -TOTAL : 2.059577 sec - 6,252,474,158 cycles # 3.029 GHz - 10,562,743,863 instructions # 1.69 insn per cycle - 2.071952790 seconds time elapsed +TOTAL : 2.084718 sec + 6,169,848,657 cycles # 2.955 GHz + 10,542,439,223 instructions # 1.71 insn per cycle + 
2.090056514 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1140) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.617687e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.289170e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.289170e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.635871e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.276211e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276211e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270110e-06 ) GeV^0 -TOTAL : 2.052194 sec - 6,160,094,141 cycles # 2.996 GHz - 10,211,164,443 instructions # 1.66 insn per cycle - 2.069184232 seconds time elapsed +TOTAL : 2.048669 sec + 6,121,871,406 cycles # 2.982 GHz + 10,187,558,774 instructions # 1.66 insn per cycle + 2.053957847 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1092) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.509722e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.158916e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.158916e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.477590e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.132710e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.132710e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371884e-02 +- 3.270111e-06 ) GeV^0 -TOTAL : 2.079386 sec - 5,756,219,942 cycles # 2.763 GHz - 8,663,173,729 instructions # 1.51 insn per cycle - 2.085435176 seconds time elapsed +TOTAL : 2.101013 sec + 5,715,581,929 cycles # 2.715 GHz + 8,638,754,232 instructions # 1.51 insn per cycle + 2.105949730 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 408) (512y: 0) (512z: 710) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index 440dcefee1..8ef12c26ca 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-06-16_23:28:42 +DATE: 2023-07-18_23:32:01 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=2, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.402219e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.355346e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.784726e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.334789e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.313989e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.795467e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0 -TOTAL : 0.867299 sec - 3,217,543,414 cycles # 2.926 GHz - 6,137,693,119 instructions # 1.91 insn per cycle - 1.157361320 seconds time elapsed +TOTAL : 0.849666 sec + 3,175,772,104 cycles # 2.963 GHz + 6,432,890,363 instructions # 2.03 insn per cycle + 1.129537023 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 96 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,24 +58,24 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112026909366E-002 -Relative difference = 7.173898182689807e-06 +Avg ME (F77/CUDA) = 1.2828112108763889E-002 +Relative difference = 7.180279099086847e-06 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.257010e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.513861e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.513861e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.193842e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.431726e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.431726e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 5.368758 sec - 16,502,939,001 cycles # 3.072 GHz - 40,105,024,525 instructions # 2.43 insn per cycle - 5.374937445 seconds time elapsed +TOTAL : 5.641756 sec + 16,854,623,313 cycles # 2.986 GHz + 40,092,655,634 instructions # 2.38 insn per cycle + 5.647077748 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 368) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 1.500049293219082e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.242270e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.146410e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.146410e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.125387e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.853813e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.853813e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.303063 sec - 7,094,904,869 cycles # 3.075 GHz - 16,746,800,277 instructions # 2.36 insn per cycle - 2.314822596 seconds time elapsed +TOTAL : 2.379891 sec + 7,204,257,236 cycles # 3.021 GHz + 16,730,694,588 instructions # 2.32 insn per cycle + 2.385210316 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.8113554068418534e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) 
= ( 4.586979e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.231913e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.231913e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.519803e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.207170e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.207170e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.807105 sec - 5,428,964,238 cycles # 2.996 GHz - 10,692,005,584 instructions # 1.97 insn per cycle - 1.822790793 seconds time elapsed +TOTAL : 1.750842 sec + 5,196,697,084 cycles # 2.961 GHz + 10,630,011,228 instructions # 2.05 insn per cycle + 1.755747802 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1140) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.726711e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.329947e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.329947e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.650477e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.286008e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.286008e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.687490 sec - 5,097,137,843 cycles # 3.011 GHz - 10,500,104,402 instructions # 2.06 insn per cycle - 1.704174033 seconds time elapsed +TOTAL : 1.711767 sec + 5,070,226,849 cycles # 2.955 GHz + 10,472,873,926 instructions # 2.07 insn per cycle + 1.716818170 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1092) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.531072e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.167602e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.167602e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.489878e+06 ) sec^-1 +EvtsPerSec[MatrixElems] 
(3) = ( 1.142603e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.142603e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.748105 sec - 4,711,439,440 cycles # 2.688 GHz - 8,948,315,074 instructions # 1.90 insn per cycle - 1.760555512 seconds time elapsed +TOTAL : 1.761077 sec + 4,702,427,717 cycles # 2.664 GHz + 8,928,587,643 instructions # 1.90 insn per cycle + 1.766311452 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 408) (512y: 0) (512z: 710) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index 6541a30c4f..aac2c3c1e7 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -35,23 +35,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-06-16_23:25:42 +DATE: 2023-07-18_23:29:07 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=7, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.106073e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.321114e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.631808e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371709e-02 +- 3.270386e-06 ) GeV^0 -TOTAL : 1.491290 sec - 5,169,582,957 cycles # 2.984 GHz - 9,050,754,175 instructions # 1.75 insn per cycle - 1.789532913 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.089458e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.263743e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.592799e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371709e-02 +- 3.270385e-06 ) GeV^0 +TOTAL : 1.472978 sec + 5,051,901,601 cycles # 2.981 GHz + 9,151,755,294 instructions # 1.81 insn per cycle + 1.751826648 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 96 @@ -60,24 +60,24 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112026909366E-002 -Relative difference = 7.173898182689807e-06 +Avg ME (F77/CUDA) = 1.2828112108763889E-002 +Relative difference = 7.180279099086847e-06 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.266280e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.526041e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.526041e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.205288e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.445391e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.445391e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 5.330040 sec - 16,494,480,958 cycles # 3.092 GHz - 40,104,668,257 instructions # 2.43 insn per cycle - 5.336402582 seconds time elapsed +TOTAL : 5.588347 sec + 16,838,368,532 cycles # 3.013 GHz + 40,090,617,591 instructions # 2.38 insn per cycle + 5.593133320 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 368) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -91,19 +91,19 @@ Relative difference = 1.500049293219082e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.242421e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.129823e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.129823e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.133712e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.821935e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.821935e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.301776 sec - 7,083,962,972 cycles # 3.073 GHz - 16,746,671,271 instructions # 2.36 insn per cycle - 2.307495453 seconds time elapsed +TOTAL : 2.373647 sec + 7,180,844,252 cycles # 3.020 GHz + 16,730,848,835 instructions # 2.33 insn per cycle + 2.378841839 seconds time elapsed 
=Symbols in CPPProcess.o= (~sse4: 1363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -117,19 +117,19 @@ Relative difference = 3.8113554068418534e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.610071e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.237721e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.237721e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.550598e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.196746e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.196746e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.717055 sec - 5,214,546,292 cycles # 3.028 GHz - 10,646,475,199 instructions # 2.04 insn per cycle - 1.729853327 seconds time elapsed +TOTAL : 1.739609 sec + 5,166,190,440 cycles # 2.964 GHz + 10,630,180,346 instructions # 2.06 insn per cycle + 1.744861725 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1140) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -143,19 +143,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.669054e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.286554e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.286554e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.673596e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.294703e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.294703e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.706696 sec - 5,116,265,885 cycles # 2.990 GHz - 10,500,169,607 instructions # 2.05 insn per cycle - 1.718833211 seconds time elapsed +TOTAL : 1.707901 sec + 5,075,943,253 cycles # 2.966 GHz + 10,482,206,390 instructions # 2.07 insn per cycle + 1.713064431 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1092) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -169,19 +169,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.528217e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.171325e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.171325e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.492142e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.134522e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.134522e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.745381 sec - 4,755,213,854 cycles # 2.718 GHz - 8,948,305,241 instructions # 1.88 insn per cycle - 1.757628494 seconds time elapsed +TOTAL : 1.762628 sec + 4,694,313,362 cycles # 2.657 GHz + 8,928,042,352 instructions # 1.90 insn per cycle + 1.767978454 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 408) (512y: 0) (512z: 710) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index 278cecd3e4..9790783d8d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-06-16_22:49:45 +DATE: 2023-07-18_22:38:41 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=2, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.626450e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.505770e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.909932e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.100412e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.209842e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.918242e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0 -TOTAL : 0.584101 sec - 2,373,467,870 cycles # 2.893 GHz - 3,334,548,453 instructions # 1.40 insn per cycle - 0.877449920 seconds time elapsed +TOTAL : 0.587563 sec + 2,397,239,442 cycles # 2.943 GHz + 3,693,800,031 instructions # 1.54 insn per cycle + 0.877655792 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 80 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,24 +58,24 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112026909366E-002 -Relative difference = 7.173898182689807e-06 +Avg ME (F77/CUDA) = 1.2828112108763889E-002 +Relative difference = 7.180279099086847e-06 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.259578e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.517908e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.517908e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.202926e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.442967e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.442967e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 5.356459 sec - 16,505,080,474 cycles # 3.080 GHz - 
40,054,237,440 instructions # 2.43 insn per cycle - 5.362931581 seconds time elapsed +TOTAL : 5.597963 sec + 16,833,526,961 cycles # 3.004 GHz + 40,039,819,397 instructions # 2.38 insn per cycle + 5.604856115 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 351) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 1.500049293219082e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.233010e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.140192e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.140192e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.149831e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.849943e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.849943e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.305928 sec - 7,093,977,561 cycles # 3.071 GHz - 16,670,395,724 instructions # 2.35 insn per cycle - 2.317849748 seconds time elapsed +TOTAL : 2.363324 sec + 7,193,211,431 cycles # 3.038 GHz + 16,654,791,239 instructions # 2.32 insn per cycle + 2.369899698 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1338) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.8113554068418534e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.492331e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.196200e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.196200e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.535444e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.206818e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.206818e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.759548 sec - 5,233,152,516 cycles # 2.967 GHz - 10,634,018,962 instructions # 2.03 insn per cycle - 1.771464615 seconds time elapsed +TOTAL : 1.745111 sec + 5,193,548,855 cycles # 2.969 GHz + 10,616,753,512 instructions # 2.04 insn per cycle + 1.751486110 seconds time elapsed 
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1110) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.707001e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.313255e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.313255e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.705086e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.300457e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.300457e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.696242 sec - 5,111,285,292 cycles # 3.005 GHz - 10,493,325,522 instructions # 2.05 insn per cycle - 1.711751099 seconds time elapsed +TOTAL : 1.696618 sec + 5,074,104,058 cycles # 2.982 GHz + 10,475,028,207 instructions # 2.06 insn per cycle + 1.703324719 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1062) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.642010e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.241108e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.241108e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.539230e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.200229e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.200229e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.712084 sec - 4,653,252,953 cycles # 2.711 GHz - 8,877,573,860 instructions # 1.91 insn per cycle - 1.718230554 seconds time elapsed +TOTAL : 1.753059 sec + 4,625,386,034 cycles # 2.633 GHz + 8,857,420,373 instructions # 1.91 insn per cycle + 1.759570802 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 377) (512y: 0) (512z: 678) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 212456d513..5092bfd385 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-06-16_23:09:44 +DATE: 2023-07-18_23:13:41 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=2, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.391928e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.360941e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.806903e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.320916e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.314255e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.785092e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0 -TOTAL : 0.595523 sec - 2,413,044,519 cycles # 2.909 GHz - 3,371,780,308 instructions # 1.40 insn per cycle - 0.887688074 seconds time elapsed +TOTAL : 0.575146 sec + 2,379,081,623 cycles # 2.948 GHz + 3,617,069,651 instructions # 1.52 insn per cycle + 0.863850288 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 96 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,24 +58,24 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112026909366E-002 -Relative difference = 7.173898182689807e-06 +Avg ME (F77/CUDA) = 1.2828112108763889E-002 +Relative difference = 7.180279099086847e-06 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, 
zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.031021e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.106428e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.106428e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.990930e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.047464e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.047464e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.437957 sec - 7,408,174,050 cycles # 3.033 GHz - 16,633,646,919 instructions # 2.25 insn per cycle - 2.444238664 seconds time elapsed +TOTAL : 2.470211 sec + 7,431,999,027 cycles # 3.003 GHz + 16,617,479,109 instructions # 2.24 insn per cycle + 2.475635291 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 226) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 1.4858695011109669e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.503700e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.305405e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.305405e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.220042e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.202891e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.202891e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 1.757246 sec - 5,418,067,590 cycles # 3.076 GHz - 11,183,088,134 instructions # 2.06 insn per cycle - 1.763156753 seconds time elapsed +TOTAL : 1.872741 sec + 5,421,090,473 cycles # 2.889 GHz + 11,166,873,964 instructions # 2.06 insn per cycle + 1.878025127 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 532) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.8113554068418534e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.726065e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.682697e+07 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.682697e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.696033e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.535675e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.535675e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.451397 sec - 4,426,469,159 cycles # 3.040 GHz - 8,688,961,662 instructions # 1.96 insn per cycle - 1.457509989 seconds time elapsed +TOTAL : 1.461547 sec + 4,388,837,969 cycles # 2.993 GHz + 8,671,935,595 instructions # 1.98 insn per cycle + 1.467055991 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 530) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.791450e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.780450e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.780450e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.767451e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.731200e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.731200e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.446231 sec - 4,397,730,279 cycles # 3.031 GHz - 8,635,389,369 instructions # 1.96 insn per cycle - 1.458743810 seconds time elapsed +TOTAL : 1.453990 sec + 4,366,268,857 cycles # 2.994 GHz + 8,617,347,584 instructions # 1.97 insn per cycle + 1.459292578 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 502) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.422680e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.137968e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.137968e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.480160e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.113430e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.113430e+07 ) sec^-1 MeanMatrixElemValue 
= ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.522551 sec - 4,240,504,189 cycles # 2.777 GHz - 8,218,517,462 instructions # 1.94 insn per cycle - 1.543890893 seconds time elapsed +TOTAL : 1.506867 sec + 4,219,327,935 cycles # 2.791 GHz + 8,194,521,601 instructions # 1.94 insn per cycle + 1.512223325 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 345) (512y: 0) (512z: 301) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index ac5f47f7f2..673b2dc9eb 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-06-16_23:10:05 +DATE: 2023-07-18_23:14:01 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=2, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.391214e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.376725e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.873588e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.327036e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.329373e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.853831e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0 -TOTAL : 0.594001 sec - 2,406,505,129 cycles # 2.898 GHz - 3,366,385,454 instructions # 1.40 insn per cycle - 0.891567175 seconds time elapsed +TOTAL : 0.574675 sec + 2,378,492,152 cycles # 2.944 GHz + 3,593,429,174 instructions # 1.51 insn per cycle + 0.866047495 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 80 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,24 +58,24 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112026909366E-002 -Relative difference = 7.173898182689807e-06 +Avg ME (F77/CUDA) = 1.2828112108763889E-002 +Relative difference = 7.180279099086847e-06 OK (relative difference <= 5E-3) 
========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.071244e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.178096e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.178096e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.964570e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.800048e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.800048e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 1.909389 sec - 5,830,536,600 cycles # 3.046 GHz - 12,919,583,380 instructions # 2.22 insn per cycle - 1.916035180 seconds time elapsed +TOTAL : 1.958996 sec + 5,876,137,740 cycles # 2.993 GHz + 12,902,833,170 instructions # 2.20 insn per cycle + 1.964145548 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 196) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 1.3015322037054697e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.045068e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.916048e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.916048e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.963534e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.843882e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.843882e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 1.609251 sec - 4,943,298,493 cycles # 3.063 GHz - 9,983,440,182 instructions # 2.02 insn per cycle - 1.615247999 seconds time elapsed +TOTAL : 1.635684 sec + 4,951,868,064 cycles # 3.019 GHz + 9,966,972,442 instructions # 2.01 insn per cycle + 1.641293343 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 391) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.8113554068418534e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= -Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.077418e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.611229e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.611229e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.019861e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.403402e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.403402e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.386065 sec - 4,257,966,284 cycles # 3.061 GHz - 8,332,432,046 instructions # 1.96 insn per cycle - 1.401917759 seconds time elapsed +TOTAL : 1.401927 sec + 4,211,127,106 cycles # 2.994 GHz + 8,315,678,328 instructions # 1.97 insn per cycle + 1.407507913 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 418) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.185758e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.872361e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.872361e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.094647e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.677968e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.677968e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.368255 sec - 4,217,890,128 cycles # 3.071 GHz - 8,344,168,177 instructions # 1.98 insn per cycle - 1.374282023 seconds time elapsed +TOTAL : 1.391847 sec + 4,178,481,727 cycles # 2.992 GHz + 8,319,910,388 instructions # 1.99 insn per cycle + 1.397372095 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 404) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT 
(NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.831994e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.619622e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.619622e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.660417e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.435550e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.435550e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.433154 sec - 4,169,968,523 cycles # 2.901 GHz - 8,053,769,834 instructions # 1.93 insn per cycle - 1.439092794 seconds time elapsed +TOTAL : 1.473902 sec + 4,166,139,184 cycles # 2.818 GHz + 8,033,518,917 instructions # 1.93 insn per cycle + 1.479050546 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 297) (512y: 0) (512z: 234) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 6dfac9c1ed..842f26a638 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-06-16_22:50:10 +DATE: 2023-07-18_22:39:06 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.376494e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.953518e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.671712e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.506055e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.821545e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.796974e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.688402 sec - 2,700,826,195 cycles # 2.905 GHz - 3,806,980,667 instructions # 1.41 insn per cycle - 0.989702055 seconds time elapsed +TOTAL : 0.705545 sec + 2,790,742,182 cycles # 2.893 GHz + 4,222,733,657 instructions # 1.51 insn per cycle + 1.029186888 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 7.671454200650844e-09 OK (relative difference <= 5E-3) 
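Every block in these logs quotes throughput as "EvtsPerSec[...] (n) = ( value ) sec^-1", which is the quantity the profiling scripts aggregate. A hypothetical extractor (the regex and function name are assumptions, not part of this patch) could pull those figures out of a saved log:

    import re

    THROUGHPUT_RE = re.compile(
        r"EvtsPerSec\[(?P<kind>[^\]]+)\]\s*\(\w+\)\s*=\s*"
        r"\(\s*(?P<value>[\d.eE+-]+)\s*\)\s*sec\^-1"
    )

    def throughputs(log_text):
        # Return (kind, events-per-second) pairs in order of appearance.
        return [(m["kind"], float(m["value"]))
                for m in THROUGHPUT_RE.finditer(log_text)]

    sample = "EvtsPerSec[MatrixElems] (3) = ( 2.731200e+07 ) sec^-1"
    print(throughputs(sample))  # [('MatrixElems', 27312000.0)]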
========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.180803e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.453359e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.453359e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.177223e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.395160e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.395160e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.728686 sec - 17,708,432,158 cycles # 3.090 GHz - 41,244,089,604 instructions # 2.33 insn per cycle - 5.734765056 seconds time elapsed +TOTAL : 5.748578 sec + 17,433,563,503 cycles # 3.031 GHz + 40,599,667,455 instructions # 2.33 insn per cycle + 5.755064250 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 377) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.063645e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.188199e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.188199e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.109821e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.048932e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.048932e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.450990 sec - 10,583,383,925 cycles # 3.064 GHz - 25,489,452,356 instructions # 2.41 insn per cycle - 3.462562119 seconds time elapsed +TOTAL : 3.390395 sec + 10,287,311,971 cycles # 3.030 GHz + 24,844,772,489 instructions # 2.42 insn per cycle + 3.397350143 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1316) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.037721e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.147402e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.147402e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.271948e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.939610e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.939610e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.460588 sec - 7,441,510,421 cycles # 3.018 GHz - 14,282,092,763 instructions # 1.92 insn per cycle - 2.472351528 seconds time elapsed +TOTAL : 2.324350 sec + 6,838,266,308 cycles # 2.937 GHz + 13,636,393,663 instructions # 1.99 insn per cycle + 2.331441003 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1222) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.158847e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.607374e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.607374e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.432176e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.420412e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.420412e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.383448 sec - 7,184,403,404 cycles # 3.008 GHz - 13,977,543,468 instructions # 1.95 insn per cycle - 2.395596096 seconds time elapsed +TOTAL : 2.234107 sec + 6,563,629,435 cycles # 2.935 GHz + 13,328,217,942 instructions # 2.03 insn per cycle + 2.240707772 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1170) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = 
MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.933204e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.709618e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.709618e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.157816e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.531671e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.531671e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.541646 sec - 6,535,344,396 cycles # 2.567 GHz - 10,866,787,933 instructions # 1.66 insn per cycle - 2.548121091 seconds time elapsed +TOTAL : 2.398333 sec + 5,903,707,649 cycles # 2.458 GHz + 10,213,997,707 instructions # 1.73 insn per cycle + 2.405561211 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 473) (512y: 0) (512z: 707) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index a6191f8a49..27856cc8ed 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-06-16_22:50:39 +DATE: 2023-07-18_22:39:33 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.486758e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.407739e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.073604e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.571073e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.374061e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.082215e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.687151 sec - 2,695,987,556 cycles # 2.909 GHz - 3,846,627,469 instructions # 1.43 insn per cycle - 0.990030497 seconds time elapsed +TOTAL : 0.700216 sec + 2,772,657,778 cycles # 2.959 GHz + 4,163,185,243 instructions # 1.50 insn per cycle + 1.001266216 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 118 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 7.671454200650844e-09 OK (relative difference <= 5E-3) 
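Read side by side, the mixed-precision eemumu throughputs above show the expected SIMD scaling on itscrd90. A quick back-of-the-envelope comparison against the scalar 'none' build, with the "EvtsPerSec[MatrixElems]" numbers copied from the m_inl0_hrd0 blocks above:

    baseline = 1.395160e+06  # 'none' (scalar, no SIMD)
    simd = {
        "sse4": 3.048932e+06,
        "avx2": 5.939610e+06,
        "512y": 6.420412e+06,
        "512z": 5.531671e+06,
    }
    for name, eps in simd.items():
        # sse4 ~2.19x, avx2 ~4.26x, 512y ~4.60x, 512z ~3.97x
        print(f"{name}: {eps / baseline:.2f}x over scalar")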
========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.184791e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.461890e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.461890e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.173359e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.391323e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.391323e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.709105 sec - 17,648,992,290 cycles # 3.090 GHz - 41,192,633,916 instructions # 2.33 insn per cycle - 5.715345529 seconds time elapsed +TOTAL : 5.767106 sec + 17,412,569,782 cycles # 3.017 GHz + 40,548,741,589 instructions # 2.33 insn per cycle + 5.773604309 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 364) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.065279e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.171891e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.171891e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.115602e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.022710e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.022710e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.445809 sec - 10,635,101,807 cycles # 3.083 GHz - 25,450,128,846 instructions # 2.39 insn per cycle - 3.457774379 seconds time elapsed +TOTAL : 3.378561 sec + 10,228,415,037 cycles # 3.023 GHz + 24,804,793,058 instructions # 2.43 insn per cycle + 3.385098578 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1303) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.055789e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.156027e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.156027e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.283997e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.986448e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.986448e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.453672 sec - 7,396,722,782 cycles # 3.009 GHz - 14,256,099,046 instructions # 1.93 insn per cycle - 2.460024595 seconds time elapsed +TOTAL : 2.313046 sec + 6,817,504,422 cycles # 2.941 GHz + 13,609,894,657 instructions # 2.00 insn per cycle + 2.319689673 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1202) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.105741e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.538001e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.538001e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.428716e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.446641e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.446641e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.423016 sec - 7,227,506,496 cycles # 2.978 GHz - 13,966,530,659 instructions # 1.93 insn per cycle - 2.435636437 seconds time elapsed +TOTAL : 2.229820 sec + 6,574,314,217 cycles # 2.942 GHz + 13,301,660,522 instructions # 2.02 insn per cycle + 2.236816103 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1150) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = 
MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.961514e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.946738e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.946738e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.251675e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.824241e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.824241e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.525122 sec - 6,474,425,117 cycles # 2.561 GHz - 10,746,498,563 instructions # 1.66 insn per cycle - 2.536989466 seconds time elapsed +TOTAL : 2.338011 sec + 5,826,014,320 cycles # 2.489 GHz + 10,093,757,750 instructions # 1.73 insn per cycle + 2.344971034 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 453) (512y: 0) (512z: 688) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 238b115334..23f8555033 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-06-16_22:51:08 +DATE: 2023-07-18_22:40:01 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.924818e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.131524e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.265700e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.041058e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155921e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.270597e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.548095 sec - 2,264,164,972 cycles # 2.867 GHz - 2,877,964,725 instructions # 1.27 insn per cycle - 0.846931183 seconds time elapsed +TOTAL : 0.537791 sec + 2,267,470,311 cycles # 2.936 GHz + 3,162,069,972 instructions # 1.39 insn per cycle + 0.841128427 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.970787e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.034500e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.034500e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.865897e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.913508e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.913508e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.428827 sec - 16,813,146,395 cycles # 3.096 GHz - 45,522,826,465 instructions # 2.71 insn per cycle - 5.435350903 seconds time elapsed +TOTAL : 5.727029 sec + 17,122,088,671 cycles # 2.988 GHz + 45,405,896,149 instructions # 2.65 insn per cycle + 5.734069298 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.561847e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.792373e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.792373e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.429859e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.611560e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.611560e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.052689 sec - 9,449,457,732 cycles # 3.092 GHz - 26,574,621,752 instructions # 2.81 insn per cycle - 3.065025864 seconds time elapsed +TOTAL : 3.167563 sec + 9,543,841,338 cycles # 3.010 GHz + 26,463,320,160 instructions # 2.77 insn per cycle + 3.174476374 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2475) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] 
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.062194e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.740435e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.740435e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.094977e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.655959e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.655959e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.837966 sec - 5,299,704,595 cycles # 2.875 GHz - 11,318,276,197 instructions # 2.14 insn per cycle - 1.850204362 seconds time elapsed +TOTAL : 1.829248 sec + 5,245,339,310 cycles # 2.861 GHz + 11,203,451,715 instructions # 2.14 insn per cycle + 1.836368230 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2317) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.705723e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.548710e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.548710e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.746194e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.426314e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.426314e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.671188 sec - 4,885,067,241 cycles # 2.915 GHz - 10,738,150,017 instructions # 2.20 insn per cycle - 1.677721158 seconds time elapsed +TOTAL : 1.662736 sec + 4,783,397,891 cycles # 2.870 GHz + 10,623,484,397 instructions # 2.22 insn per cycle + 1.669838048 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -167,20 +167,20 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.237343e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 4.564057e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.564057e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.165186e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.415979e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.415979e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.582849 sec - 5,275,527,208 cycles # 2.039 GHz - 7,074,506,056 instructions # 1.34 insn per cycle - 2.594504687 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1084) (512y: 95) (512z: 1629) +TOTAL : 2.627050 sec + 5,157,567,367 cycles # 1.961 GHz + 6,977,826,191 instructions # 1.35 insn per cycle + 2.634198680 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1093) (512y: 96) (512z: 1628) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 639aca4e98..3b1248b3b8 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-06-16_23:19:50 +DATE: 2023-07-18_23:23:24 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.044204e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.996138e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.996138e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.517155e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.850093e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.850093e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.864809 sec - 3,286,181,196 cycles # 2.941 GHz - 4,692,253,429 instructions # 1.43 insn per cycle - 1.175702347 seconds time elapsed +TOTAL : 0.811994 sec + 3,122,813,862 cycles # 2.957 GHz + 4,726,707,499 instructions # 1.51 insn per cycle + 1.114267763 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,19 +72,19 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.945517e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.007133e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.007133e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.891761e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.940341e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.940341e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.579973 sec - 17,197,411,796 cycles # 3.079 GHz - 45,599,315,728 instructions # 2.65 insn per cycle - 5.587453353 seconds time elapsed +TOTAL : 5.725155 sec + 17,443,155,891 cycles # 3.044 GHz + 45,463,067,794 instructions # 2.61 insn per cycle + 5.731889775 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -99,19 +99,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.516201e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.735806e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.735806e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.424146e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.600690e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.600690e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.173720 sec - 9,799,980,850 cycles # 3.084 GHz - 26,760,156,296 instructions # 2.73 insn per cycle - 3.191189055 seconds time elapsed +TOTAL : 3.245399 sec + 9,877,145,077 cycles # 3.039 GHz + 26,644,958,913 instructions # 2.70 insn per cycle + 3.251786063 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2475) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -126,19 +126,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.004187e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.680852e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.680852e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.989584e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.525267e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.525267e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.934049 sec - 5,690,588,652 cycles # 2.933 GHz - 11,606,672,585 instructions # 2.04 insn per cycle - 1.948751225 seconds time elapsed +TOTAL : 1.935210 sec + 5,584,834,974 cycles # 2.877 GHz + 11,490,126,989 instructions # 2.06 insn per cycle + 1.942181580 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2317) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -153,19 +153,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.578597e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.386711e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.386711e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.585484e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.233673e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.233673e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.781882 sec - 5,244,605,674 cycles # 2.934 GHz - 11,027,240,287 instructions # 2.10 insn per cycle - 1.797644688 seconds time elapsed +TOTAL : 1.776662 sec + 5,115,306,828 cycles # 2.870 GHz + 10,908,248,078 instructions # 2.13 insn per cycle + 1.783566470 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.159645e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.469350e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.469350e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.052284e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.287533e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.287533e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.712347 sec - 5,634,887,558 cycles # 2.074 GHz - 7,322,991,974 instructions # 1.30 insn per cycle - 2.729029702 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1084) (512y: 95) (512z: 1629) +TOTAL : 2.773144 sec + 5,511,544,850 cycles # 1.984 GHz + 7,221,836,653 instructions # 1.31 insn per cycle + 2.779849469 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1093) (512y: 96) (512z: 1628) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 654e369bcd..3ee287cbb2 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-06-16_23:32:09 +DATE: 2023-07-18_23:35:23 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.720407e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.158709e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.266813e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.594075e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.159099e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.273479e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.643304 sec - 2,557,243,803 cycles # 2.902 GHz - 3,323,241,435 instructions # 1.30 insn per cycle - 0.940843234 seconds time elapsed +TOTAL : 0.619590 sec + 2,505,610,814 cycles # 2.955 GHz + 3,592,105,745 instructions # 1.43 insn per cycle + 0.906983599 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.976660e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.040382e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.040382e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.888766e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.937039e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.937039e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 5.473136 sec - 17,003,987,037 cycles # 3.104 GHz - 45,540,260,291 instructions # 2.68 insn per cycle - 5.479696030 seconds time elapsed +TOTAL : 5.717439 sec + 17,289,665,830 cycles # 3.023 GHz + 45,422,171,459 instructions # 2.63 insn per cycle + 5.722653885 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 3.258803992249869e-07 OK 
(relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.560915e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.788844e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.788844e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.455670e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.635570e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.635570e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.112416 sec - 9,646,257,782 cycles # 3.095 GHz - 26,577,811,209 instructions # 2.76 insn per cycle - 3.124182275 seconds time elapsed +TOTAL : 3.200432 sec + 9,723,038,655 cycles # 3.036 GHz + 26,462,151,211 instructions # 2.72 insn per cycle + 3.205930504 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2475) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.139210e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.855962e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.855962e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.081846e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.628914e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.628914e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.874805 sec - 5,513,978,865 cycles # 2.934 GHz - 11,304,407,461 instructions # 2.05 insn per cycle - 1.886432659 seconds time elapsed +TOTAL : 1.889454 sec + 5,425,081,251 cycles # 2.866 GHz + 11,186,282,050 instructions # 2.06 insn per cycle + 1.894560758 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2317) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= 
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.726123e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.568760e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.568760e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.753281e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.434153e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.434153e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.727887 sec - 5,070,617,194 cycles # 2.927 GHz - 10,691,249,015 instructions # 2.11 insn per cycle - 1.739511158 seconds time elapsed +TOTAL : 1.719503 sec + 4,944,778,621 cycles # 2.869 GHz + 10,572,680,419 instructions # 2.14 insn per cycle + 1.725111487 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -167,20 +167,20 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.172539e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.488358e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.488358e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.150572e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.398100e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.398100e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.683562 sec - 5,459,517,935 cycles # 2.032 GHz - 7,027,784,941 instructions # 1.29 insn per cycle - 2.699812290 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1084) (512y: 95) (512z: 1629) +TOTAL : 2.693591 sec + 5,346,956,955 cycles # 1.983 GHz + 6,927,109,897 instructions # 1.30 insn per cycle + 2.698900902 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1093) (512y: 96) (512z: 1628) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index a8675ddd60..c07d5465aa 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-06-16_23:29:07 +DATE: 2023-07-18_23:32:26 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.723347e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.157467e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.265955e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.590823e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.156069e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.272880e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.586317 sec - 2,370,587,754 cycles # 2.873 GHz - 3,311,584,988 instructions # 1.40 insn per cycle - 0.883202740 seconds time elapsed +TOTAL : 0.561090 sec + 2,337,153,978 cycles # 2.954 GHz + 3,567,976,480 instructions # 1.53 insn per cycle + 0.848347727 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.965968e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.028993e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.028993e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.901838e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.950756e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.950756e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.442844 sec - 16,814,668,007 cycles # 3.088 GHz - 45,523,651,040 instructions # 2.71 insn per cycle - 5.449380623 seconds time elapsed +TOTAL : 5.619703 sec + 17,098,072,758 cycles # 3.041 GHz + 45,403,233,892 instructions # 2.66 insn per cycle + 5.625298678 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 3.258803992249869e-07 OK 
(relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.557389e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.784830e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.784830e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.465062e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.643542e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.643542e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.056098 sec - 9,441,724,564 cycles # 3.085 GHz - 26,574,286,961 instructions # 2.81 insn per cycle - 3.067987765 seconds time elapsed +TOTAL : 3.133350 sec + 9,530,990,715 cycles # 3.038 GHz + 26,461,775,640 instructions # 2.78 insn per cycle + 3.138486769 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2475) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.028635e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.705882e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.705882e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.113272e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.680038e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.680038e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.847560 sec - 5,305,013,811 cycles # 2.865 GHz - 11,318,072,423 instructions # 2.13 insn per cycle - 1.862970960 seconds time elapsed +TOTAL : 1.822016 sec + 5,239,798,170 cycles # 2.869 GHz + 11,203,273,002 instructions # 2.14 insn per cycle + 1.827279491 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2317) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= 
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.705810e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.546514e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.546514e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.752624e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.436775e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.436775e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.670710 sec - 4,873,672,906 cycles # 2.909 GHz - 10,738,237,712 instructions # 2.20 insn per cycle - 1.682858352 seconds time elapsed +TOTAL : 1.656970 sec + 4,770,141,804 cycles # 2.871 GHz + 10,617,376,861 instructions # 2.23 insn per cycle + 1.662328882 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -167,20 +167,20 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.212444e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.528464e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.528464e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.111293e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.354169e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.354169e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.596550 sec - 5,275,280,525 cycles # 2.028 GHz - 7,074,607,503 instructions # 1.34 insn per cycle - 2.603068795 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1084) (512y: 95) (512z: 1629) +TOTAL : 2.656883 sec + 5,149,232,654 cycles # 1.935 GHz + 6,977,685,886 instructions # 1.36 insn per cycle + 2.662117722 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1093) (512y: 96) (512z: 1628) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index 359002bbc2..f3fc0520ec 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -35,23 +35,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-06-16_23:26:08 +DATE: 2023-07-18_23:29:33 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.030270e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154608e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.262941e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.823745e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.160361e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275150e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.762017 sec - 2,931,874,448 cycles # 2.934 GHz - 4,162,111,608 instructions # 1.42 insn per cycle - 1.058050965 seconds time elapsed +TOTAL : 0.711538 sec + 2,800,525,082 cycles # 2.960 GHz + 4,320,932,668 instructions # 1.54 insn per cycle + 1.004387612 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -65,19 +65,19 @@ Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.969731e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.034215e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.034215e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.893916e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.943371e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.943371e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.431365 sec - 16,808,611,250 cycles # 3.092 GHz - 45,520,552,491 instructions # 2.71 insn per cycle - 5.438033144 seconds time elapsed +TOTAL : 5.643047 sec + 17,107,721,214 cycles # 3.029 GHz + 45,404,077,046 instructions # 2.65 insn per cycle + 5.648523883 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -91,19 +91,19 @@ Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.568471e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.797188e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.797188e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.464238e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.642365e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.642365e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.046374 sec - 9,446,582,684 cycles # 3.097 GHz - 26,574,434,074 instructions # 2.81 insn per cycle - 3.058800051 seconds time elapsed +TOTAL : 3.133051 sec + 9,535,469,676 cycles # 3.040 GHz + 26,461,903,214 instructions # 2.78 insn per cycle + 3.138570872 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2475) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -117,19 +117,19 @@ Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.022985e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.704154e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.704154e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.023884e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.568023e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.568023e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.849517 sec - 5,358,628,341 cycles # 2.890 GHz - 11,318,219,306 instructions # 2.11 insn per cycle - 1.862335681 seconds time elapsed +TOTAL : 1.845680 sec + 5,244,293,032 cycles # 2.834 GHz + 11,203,499,976 instructions # 2.14 insn per cycle + 1.851246836 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2317) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -143,19 +143,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.716217e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.550859e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.550859e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.768737e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.456321e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.456321e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.668825 sec - 4,888,461,793 cycles # 2.922 GHz - 10,738,236,069 instructions # 2.20 insn per cycle - 1.680870974 seconds time elapsed +TOTAL : 1.654802 sec + 4,755,871,808 cycles # 2.866 GHz + 10,620,982,700 instructions # 2.23 insn per cycle + 1.660273899 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -169,20 +169,20 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP 
[gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.203955e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.527248e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.527248e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.161738e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.413225e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.413225e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.602140 sec - 5,278,828,196 cycles # 2.025 GHz - 7,074,793,601 instructions # 1.34 insn per cycle - 2.608554211 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1084) (512y: 95) (512z: 1629) +TOTAL : 2.626109 sec + 5,153,641,042 cycles # 1.960 GHz + 6,978,206,846 instructions # 1.35 insn per cycle + 2.631697516 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1093) (512y: 96) (512z: 1628) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index eac3f7700f..93b17b3385 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-06-16_22:51:35 +DATE: 2023-07-18_22:40:28 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.913985e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.133710e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.264879e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.097464e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.160557e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275726e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.547036 sec - 2,265,211,371 cycles # 2.879 GHz - 2,883,998,397 instructions # 1.27 insn per cycle - 0.846249580 seconds time elapsed +TOTAL : 0.529241 sec + 2,249,151,280 cycles # 2.939 GHz + 3,126,073,041 instructions # 1.39 insn per cycle + 0.833374909 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.025487e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.092839e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.092839e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.947164e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.998957e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.998957e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.293594 sec - 16,435,613,118 cycles # 3.105 GHz - 44,496,848,749 instructions # 2.71 insn per cycle - 5.300078017 seconds time elapsed +TOTAL : 5.491250 sec + 16,727,580,632 cycles # 3.044 GHz + 44,378,708,972 instructions # 2.65 insn per cycle + 5.497794723 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 576) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.743778e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.997210e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.997210e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.657519e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.857574e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.857574e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.909580 sec - 8,996,897,610 cycles # 3.087 GHz - 25,400,434,000 instructions # 2.82 insn per cycle - 2.923168620 seconds time elapsed +TOTAL : 2.975412 sec + 9,062,080,469 cycles # 3.042 GHz + 25,287,862,621 instructions # 2.79 insn per cycle + 2.982186168 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2305) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.683039e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.268975e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.268975e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.559920e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.020470e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.020470e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.953170 sec - 5,745,024,701 cycles # 2.934 GHz - 12,420,701,802 instructions # 2.16 insn per cycle - 1.968877475 seconds time elapsed +TOTAL : 1.994543 sec + 5,687,301,019 cycles # 2.847 GHz + 12,306,543,864 instructions # 2.16 insn per cycle + 2.001637740 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2408) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] 
[hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.895401e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.531291e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.531291e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.854570e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.362779e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.362779e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.886801 sec - 5,519,025,390 cycles # 2.918 GHz - 12,000,810,261 instructions # 2.17 insn per cycle - 1.892947629 seconds time elapsed +TOTAL : 1.898550 sec + 5,405,706,794 cycles # 2.842 GHz + 11,884,289,492 instructions # 2.20 insn per cycle + 1.905488215 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2127) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.060444e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.357914e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.357914e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.007795e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.242707e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.242707e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.690681 sec - 5,453,449,759 cycles # 2.024 GHz - 8,526,907,153 instructions # 1.56 insn per cycle - 2.696635390 seconds time elapsed +TOTAL : 2.726437 sec + 5,336,054,647 cycles # 1.955 GHz + 8,411,434,439 instructions # 1.58 insn per cycle + 2.733346003 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1067) (512y: 204) (512z: 1715) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index e46f7db696..e5969ef6ae 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-06-16_23:10:25 +DATE: 2023-07-18_23:14:20 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.728254e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.165396e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.275486e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.592910e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.162940e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.277756e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.552941 sec - 2,310,610,153 cycles # 2.859 GHz - 2,915,241,692 instructions # 1.26 insn per cycle - 0.867252595 seconds time elapsed +TOTAL : 0.525339 sec + 2,240,703,448 cycles # 2.932 GHz + 3,098,476,503 instructions # 1.38 insn per cycle + 0.823729716 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.605414e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.718612e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.718612e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.523001e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.610147e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.610147e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.132902 sec - 12,781,577,555 cycles # 3.090 GHz - 34,468,010,618 instructions # 2.70 insn per cycle - 4.139327277 seconds time elapsed +TOTAL : 4.262946 sec + 12,917,991,309 cycles # 3.027 GHz + 34,356,800,274 instructions # 2.66 insn per cycle + 4.268384369 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.146546e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.322919e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.322919e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.097210e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.240059e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.240059e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.441664 sec - 10,637,428,563 cycles # 3.088 GHz - 22,946,767,172 instructions # 2.16 insn per cycle - 3.453622149 seconds time elapsed +TOTAL : 3.491661 sec + 10,619,998,715 cycles # 3.037 GHz + 22,832,779,188 instructions # 2.15 insn per cycle + 3.497483266 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2554) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.616486e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.203845e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.203845e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.431271e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.883655e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.883655e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.975506 sec - 5,785,528,635 cycles # 2.924 GHz - 10,765,799,966 instructions # 1.86 insn per cycle - 1.987834132 seconds time elapsed +TOTAL : 2.039012 sec + 5,716,156,076 cycles # 2.802 GHz + 10,652,722,821 instructions # 1.86 insn per cycle + 2.044492539 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] 
[hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.684700e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.274843e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.274843e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.705619e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.188017e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.188017e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.951945 sec - 5,696,598,180 cycles # 2.912 GHz - 9,993,327,583 instructions # 1.75 insn per cycle - 1.964406649 seconds time elapsed +TOTAL : 1.944139 sec + 5,573,008,606 cycles # 2.860 GHz + 9,877,800,784 instructions # 1.77 insn per cycle + 1.950007637 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2323) (512y: 159) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.413791e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.764783e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.764783e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.361963e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.636685e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.636685e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.484051 sec - 5,055,557,870 cycles # 2.031 GHz - 7,607,210,183 instructions # 1.50 insn per cycle - 2.496248175 seconds time elapsed +TOTAL : 2.510468 sec + 4,946,319,713 cycles # 1.966 GHz + 7,490,938,080 instructions # 1.51 insn per cycle + 2.516345616 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1617) (512y: 257) (512z: 1663) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 336c6b2c5b..0b9a5fa56c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-06-16_23:10:52 +DATE: 2023-07-18_23:14:46 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.704482e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154812e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.264069e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.590276e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.161665e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276091e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.548991 sec - 2,291,931,243 cycles # 2.897 GHz - 2,895,980,863 instructions # 1.26 insn per cycle - 0.849095215 seconds time elapsed +TOTAL : 0.523953 sec + 2,257,235,761 cycles # 2.939 GHz + 3,115,853,120 instructions # 1.38 insn per cycle + 0.825012907 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,20 +63,20 @@ Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.696861e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.817621e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.817621e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.638662e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.736999e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.736999e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.995865 sec - 12,393,440,631 cycles # 3.099 GHz - 35,121,655,729 instructions # 2.83 insn per cycle - 4.002434492 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 458) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.081958 sec + 12,320,219,319 cycles # 3.014 GHz + 35,009,881,921 instructions # 2.84 insn per cycle + 4.088093070 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 457) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -89,19 +89,19 @@ Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.203775e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.393294e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.393294e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.150755e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.298430e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.298430e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.383474 sec - 10,433,445,794 cycles # 3.079 GHz - 22,106,457,751 instructions # 2.12 insn per cycle - 3.399221962 seconds time elapsed +TOTAL : 3.435005 sec + 10,404,447,581 cycles # 3.026 GHz + 21,993,103,626 instructions # 2.11 insn per cycle + 3.440569541 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2351) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.959707e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.624477e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.624477e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.736703e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.246663e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.246663e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.874404 sec - 5,449,857,591 cycles # 2.908 GHz - 10,370,988,020 instructions # 1.90 insn per cycle - 1.886802544 seconds time elapsed +TOTAL : 1.937670 sec + 5,358,288,000 cycles # 2.758 GHz + 10,256,578,853 instructions # 1.91 insn per cycle + 1.943859017 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2170) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.940537e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.590919e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.590919e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.908842e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.427368e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.427368e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.873958 sec - 5,488,219,503 cycles # 2.922 GHz - 9,577,123,391 instructions # 1.75 insn per cycle - 1.890504897 seconds time elapsed +TOTAL : 1.882902 sec + 5,396,388,166 cycles # 2.859 GHz + 9,460,392,900 instructions # 1.75 insn per cycle + 1.888502703 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1857) (512y: 115) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.678748e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.078914e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.078914e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.549884e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.851953e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.851953e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.351438 sec - 4,817,365,178 cycles # 2.046 GHz - 7,401,125,646 instructions # 1.54 insn per cycle - 2.365073669 seconds time elapsed +TOTAL : 2.411770 sec + 4,707,067,837 cycles # 1.950 GHz + 7,285,308,864 instructions # 1.55 insn per cycle + 2.417369345 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1298) (512y: 193) (512z: 1369) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 51b62d4486..2c306e7eab 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-06-16_22:52:02 +DATE: 2023-07-18_22:40:55 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.086859e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.694248e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.954158e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.217276e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.572809e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.948314e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.501803 sec - 2,114,448,212 cycles # 2.892 GHz - 2,650,449,805 instructions # 1.25 insn per cycle - 0.791582379 seconds time elapsed +TOTAL : 0.485442 sec + 2,080,435,790 cycles # 2.923 GHz + 2,866,762,926 instructions # 1.38 insn per cycle + 0.791800006 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,25 +58,25 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499668240547 -Relative difference = 1.920672948568199e-05 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.045424e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.103487e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.103487e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.956154e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.009707e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.009707e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.214918 sec - 16,086,691,839 cycles # 3.084 GHz - 45,264,306,297 instructions # 2.81 insn per cycle - 5.221298921 seconds time 
elapsed -=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.446517 sec + 16,514,086,283 cycles # 3.029 GHz + 45,242,334,961 instructions # 2.74 insn per cycle + 5.453327424 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 628) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -89,19 +89,19 @@ Relative difference = 6.443528218283898e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.087937e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.510673e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.510673e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.892673e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.273909e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.273909e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 2.147379 sec - 6,633,404,493 cycles # 3.083 GHz - 16,691,710,310 instructions # 2.52 insn per cycle - 2.153586806 seconds time elapsed +TOTAL : 2.230204 sec + 6,747,558,443 cycles # 3.021 GHz + 16,682,885,775 instructions # 2.47 insn per cycle + 2.237199250 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 8.24528544926829e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.082531e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.272863e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.272863e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.039386e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.217213e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.217213e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.053599 sec - 3,105,351,321 cycles # 2.935 GHz - 7,028,445,226 instructions # 2.26 insn per cycle - 1.065166686 seconds time elapsed +TOTAL : 1.098393 sec + 3,098,459,353 cycles # 2.813 GHz + 7,014,812,146 instructions # 2.26 insn per cycle + 1.105173507 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 
2735) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.144769e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.363322e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.363322e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.133289e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.341271e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.341271e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.001609 sec - 2,959,023,588 cycles # 2.940 GHz - 6,742,632,212 instructions # 2.28 insn per cycle - 1.013857674 seconds time elapsed +TOTAL : 1.012727 sec + 2,932,731,503 cycles # 2.887 GHz + 6,731,148,444 instructions # 2.30 insn per cycle + 1.019615763 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2593) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -167,20 +167,20 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.162662e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.186029e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.186029e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.994135e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.967821e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.967821e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.370260 sec - 2,871,098,848 cycles # 2.089 GHz - 4,848,815,662 instructions # 1.69 insn per cycle - 1.382844305 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1722) (512y: 22) (512z: 1849) +TOTAL : 1.398110 sec + 2,867,836,390 cycles # 2.047 GHz + 4,844,016,196 instructions # 1.69 insn per cycle + 1.404798548 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1731) (512y: 22) (512z: 1849) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 500d268665..240f0eb579 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-06-16_23:20:18 +DATE: 2023-07-18_23:23:52 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.110307e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.876714e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.876714e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086808e+00 +- 3.414090e-03 ) GeV^0 -TOTAL : 0.696990 sec - 2,704,320,756 cycles # 2.897 GHz - 3,812,649,511 instructions # 1.41 insn per cycle - 0.992471982 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.077735e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.457332e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.457332e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 +TOTAL : 0.668812 sec + 2,707,938,072 cycles # 2.970 GHz + 4,089,951,980 instructions # 1.51 insn per cycle + 0.968803408 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -66,26 +66,26 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499668240547 -Relative difference = 1.920672948568199e-05 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.034027e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.092499e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.092499e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.927286e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.979745e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.979745e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.291084 sec - 16,288,379,429 cycles # 3.078 GHz - 45,312,040,286 instructions # 2.78 insn per cycle - 5.298079548 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.569040 sec + 16,686,762,464 cycles # 2.995 GHz + 45,285,461,035 instructions # 2.71 insn per cycle + 5.575706950 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 628) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -99,19 +99,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.061059e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.467740e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.467740e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.892165e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.271278e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.271278e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 2.203734 sec - 6,829,055,399 cycles # 3.092 GHz - 16,972,646,719 instructions # 2.49 insn per cycle - 2.216688904 seconds time elapsed +TOTAL : 2.272550 sec + 6,943,591,915 cycles # 3.049 GHz + 16,959,252,147 instructions # 2.44 insn per cycle + 2.278715637 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -126,19 +126,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.061515e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.242839e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.242839e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.040490e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.216209e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.216209e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.119383 sec - 3,302,419,161 cycles # 2.935 GHz - 7,266,282,160 instructions # 2.20 insn per cycle - 1.131841634 seconds time elapsed +TOTAL : 1.138766 sec + 3,304,662,247 cycles # 2.887 GHz + 7,255,626,586 instructions # 2.20 insn per cycle + 1.145475247 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2735) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -153,19 +153,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.096474e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.293763e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.293763e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.115706e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.317047e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.317047e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.088730 sec - 3,150,094,724 cycles # 2.877 GHz - 6,980,573,007 instructions # 2.22 insn per cycle - 1.104462726 seconds time elapsed +TOTAL : 1.070181 sec + 3,123,669,032 cycles # 2.906 GHz + 6,969,124,690 instructions # 2.23 insn per cycle + 1.076178013 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2593) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.080793e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.067489e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.067489e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.898530e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.849790e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.849790e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.430225 sec - 3,084,429,817 cycles # 2.147 GHz - 5,103,946,062 instructions # 1.65 insn per cycle - 1.443126402 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1722) (512y: 22) (512z: 1849) +TOTAL : 1.459194 sec + 3,072,125,058 cycles # 2.097 GHz + 5,101,749,018 instructions # 1.66 insn per cycle + 1.465704045 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1731) (512y: 22) (512z: 1849) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index a6492e3922..95a4fe5b8e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-06-16_23:32:37 +DATE: 2023-07-18_23:35:50 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.808067e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.652444e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.960520e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.466782e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.650096e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.971128e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.588246 sec - 2,402,908,432 cycles # 2.901 GHz - 3,110,614,158 instructions # 1.29 insn per cycle - 0.886196949 seconds time elapsed +TOTAL : 0.564456 sec + 2,318,311,977 cycles # 2.948 GHz + 3,311,952,807 instructions # 1.43 insn per cycle + 0.843584481 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,25 +58,25 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499668240547 -Relative difference = 1.920672948568199e-05 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.033763e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.091227e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.091227e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.962432e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.016728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.016728e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 5.296990 sec - 16,269,407,762 cycles # 3.070 GHz - 45,296,730,260 instructions # 2.78 insn per cycle - 
5.303386291 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.483598 sec + 16,671,675,276 cycles # 3.039 GHz + 45,271,419,303 instructions # 2.72 insn per cycle + 5.488608674 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 628) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -89,19 +89,19 @@ Relative difference = 6.443528218283898e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.111672e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.523499e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.523499e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.869022e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.249889e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.249889e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079574e+00 +- 3.404724e-03 ) GeV^0 -TOTAL : 2.191606 sec - 6,792,051,649 cycles # 3.094 GHz - 16,705,051,298 instructions # 2.46 insn per cycle - 2.206896271 seconds time elapsed +TOTAL : 2.292634 sec + 6,902,222,192 cycles # 3.006 GHz + 16,695,390,275 instructions # 2.42 insn per cycle + 2.297548144 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 8.24528544926829e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.084154e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.276390e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.276390e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.053420e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.233732e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.233732e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079552e+00 +- 3.404217e-03 ) GeV^0 -TOTAL : 1.106630 sec - 3,283,893,034 cycles # 2.956 GHz - 7,013,364,350 instructions # 2.14 insn per cycle - 1.121623240 seconds time elapsed +TOTAL : 1.134406 sec + 3,272,747,000 cycles # 2.874 GHz + 7,002,338,013 instructions # 2.14 insn per cycle + 1.139504150 seconds time elapsed 
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2735) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.140251e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.355699e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.355699e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.105641e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.306620e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.306620e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079552e+00 +- 3.404217e-03 ) GeV^0 -TOTAL : 1.061058 sec - 3,126,583,899 cycles # 2.934 GHz - 6,694,705,597 instructions # 2.14 insn per cycle - 1.076779612 seconds time elapsed +TOTAL : 1.090369 sec + 3,098,852,591 cycles # 2.831 GHz + 6,681,759,155 instructions # 2.16 insn per cycle + 1.095947246 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2593) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -167,20 +167,20 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.180445e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.199369e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.199369e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.633216e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.532699e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.532699e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079552e+00 +- 3.404217e-03 ) GeV^0 -TOTAL : 1.423127 sec - 3,043,448,384 cycles # 2.132 GHz - 4,800,221,178 instructions # 1.58 insn per cycle - 1.434967626 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1722) (512y: 22) (512z: 1849) +TOTAL : 1.516441 sec + 3,031,711,893 cycles # 1.994 GHz + 4,798,217,271 instructions # 1.58 insn per cycle + 1.521855756 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1731) (512y: 22) (512z: 1849) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 5b694fb236..24f2d03ca1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-06-16_23:29:34 +DATE: 2023-07-18_23:32:52 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.836019e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.651430e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.966312e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.484059e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.651757e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.971705e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.535629 sec - 2,203,011,013 cycles # 2.886 GHz - 3,060,363,719 instructions # 1.39 insn per cycle - 0.822111510 seconds time elapsed +TOTAL : 0.510522 sec + 2,157,248,955 cycles # 2.948 GHz + 3,306,751,827 instructions # 1.53 insn per cycle + 0.789217813 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,25 +58,25 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499668240547 -Relative difference = 1.920672948568199e-05 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.025315e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.083260e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.083260e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.941861e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.997041e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.997041e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.275218 sec - 16,091,710,481 cycles # 3.052 GHz - 45,265,017,659 instructions # 2.81 insn per cycle - 5.281528258 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.487408 sec + 16,511,648,399 cycles # 3.007 GHz + 45,244,112,398 instructions # 2.74 insn per cycle + 5.492462207 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 628) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -89,19 +89,19 @@ Relative difference = 6.443528218283898e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.134361e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.551581e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.551581e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.895899e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.279157e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.279157e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 2.128639 sec - 6,626,366,458 cycles # 3.106 GHz - 16,691,617,090 instructions # 2.52 insn per cycle - 2.144077506 seconds time elapsed +TOTAL : 2.227339 sec + 6,742,004,236 cycles # 3.021 GHz + 16,683,347,281 instructions # 2.47 insn per cycle + 2.232732049 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 8.24528544926829e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.089985e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.279320e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.279320e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.054815e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.234111e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.234111e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.046794 sec - 3,097,881,265 cycles # 2.946 GHz - 7,028,413,867 instructions # 2.27 insn per cycle - 1.062080365 seconds time elapsed +TOTAL : 1.078882 sec + 3,105,837,405 cycles # 2.867 GHz + 7,018,311,290 instructions # 2.26 insn per cycle + 1.084486905 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2735) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.151751e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.366544e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.366544e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.063461e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.252592e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.252592e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 0.995620 sec - 2,939,121,938 cycles # 2.938 GHz - 6,742,732,632 instructions # 2.29 insn per cycle - 1.001882555 seconds time elapsed +TOTAL : 1.074162 sec + 2,944,652,129 cycles # 2.730 GHz + 6,731,582,934 instructions # 2.29 insn per cycle + 1.079931534 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2593) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -167,20 +167,20 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.156539e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.166833e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.166833e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.961017e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.924748e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.924748e+05 ) sec^-1 MeanMatrixElemValue = ( 
2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.371454 sec - 2,896,780,603 cycles # 2.105 GHz - 4,848,892,795 instructions # 1.67 insn per cycle - 1.377614632 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1722) (512y: 22) (512z: 1849) +TOTAL : 1.402742 sec + 2,862,663,665 cycles # 2.034 GHz + 4,847,235,521 instructions # 1.69 insn per cycle + 1.408043991 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1731) (512y: 22) (512z: 1849) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index f142e268bd..04fe0fb90b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -35,23 +35,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-06-16_23:26:35 +DATE: 2023-07-18_23:29:59 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.844906e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.667637e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.968555e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086808e+00 +- 3.414090e-03 ) GeV^0 -TOTAL : 0.637578 sec - 2,532,069,078 cycles # 2.919 GHz - 3,546,459,768 instructions # 1.40 insn per cycle - 0.925210240 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.694538e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.624973e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.937992e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 +TOTAL : 0.621749 sec + 2,436,447,955 cycles # 2.874 GHz + 3,730,153,997 instructions # 1.53 insn per cycle + 0.906983386 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -60,25 +60,25 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499668240547 -Relative difference = 1.920672948568199e-05 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.040228e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.098157e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.098157e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.929722e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.983904e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.983904e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.228615 sec - 16,104,862,870 cycles # 3.079 GHz - 45,267,189,999 instructions # 2.81 insn per cycle - 5.234886365 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.522797 sec + 16,514,064,227 cycles # 2.989 GHz + 45,244,888,942 instructions # 2.74 insn per cycle + 5.527861458 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 628) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -91,19 +91,19 @@ Relative difference = 6.443528218283898e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.110524e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.524586e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.524586e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.882531e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.256446e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.256446e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 2.139950 sec - 6,622,300,033 cycles # 3.088 GHz - 16,691,797,560 instructions # 2.52 insn per cycle - 2.152617384 seconds time elapsed +TOTAL : 2.233627 sec + 6,749,698,693 cycles # 3.016 GHz + 16,683,905,940 instructions # 2.47 insn per cycle + 2.239003688 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -117,19 +117,19 @@ Relative difference = 8.24528544926829e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.073753e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.265534e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.265534e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.066310e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.248063e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.248063e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.063032 sec - 3,116,892,646 cycles # 2.921 GHz - 7,028,647,548 instructions # 2.26 insn per cycle - 1.074679609 seconds time elapsed +TOTAL : 1.068064 sec + 3,096,676,145 cycles # 2.887 GHz + 7,018,044,016 instructions # 2.27 insn per cycle + 1.073453269 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2735) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -143,19 +143,19 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.147777e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.361400e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.361400e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.131277e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.337030e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.337030e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 0.998684 sec - 2,942,336,224 cycles # 2.933 GHz - 6,742,794,858 instructions # 2.29 insn per cycle - 1.014229588 seconds time elapsed +TOTAL : 1.011866 sec + 2,935,754,900 cycles # 2.890 GHz + 6,731,265,559 instructions # 2.29 insn per cycle + 1.017019453 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2593) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -169,20 +169,20 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.156146e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.170100e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.170100e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.038564e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.027544e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.027544e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.371673 sec - 2,870,181,896 cycles # 2.085 GHz - 4,848,797,007 instructions # 1.69 insn per cycle - 1.377996079 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1722) (512y: 22) (512z: 1849) +TOTAL : 1.390802 sec + 2,864,121,739 cycles # 2.054 GHz + 4,847,614,894 instructions # 1.69 insn per cycle + 1.395918141 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1731) (512y: 22) (512z: 1849) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
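A note on the "OK (relative difference <= 5E-3)" verdicts that recur throughout these logs: each run cross-checks the average matrix element (Avg ME) computed by the CUDA/C++ build against the Fortran (F77) reference, and the run is accepted when the two agree to within a 5E-3 relative tolerance. A minimal Python sketch of that acceptance test follows; the function name and the exact choice of denominator are assumptions for illustration, not taken from the repository code.

    def check_avg_me(avg_me_cpp, avg_me_f77, tolerance=5e-3):
        # Relative difference between the two averages. The denominator choice
        # is an assumption: at the precision printed in these logs, dividing by
        # either average reproduces the logged value.
        rel_diff = abs(avg_me_f77 - avg_me_cpp) / abs(avg_me_cpp)
        return rel_diff, rel_diff <= tolerance

    # Using the values logged above:
    #   check_avg_me(2.028811e+00, 2.0288499749731272)
    # returns (~1.921e-05, True), matching the logged
    # "Relative difference = 1.9210746159747678e-05" and its
    # "OK (relative difference <= 5E-3)" verdict.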
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index 908359aae9..c12727faea 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-06-16_22:52:25 +DATE: 2023-07-18_22:41:17 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.091832e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.744916e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.014144e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.274020e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.620265e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.010701e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.502212 sec - 2,095,360,596 cycles # 2.861 GHz - 2,633,922,104 instructions # 1.26 insn per cycle - 0.791606560 seconds time elapsed +TOTAL : 0.485160 sec + 2,099,851,451 cycles # 2.951 GHz + 2,930,083,354 instructions # 1.40 insn per cycle + 0.775769836 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,24 +58,24 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499668240547 -Relative difference = 1.920672948568199e-05 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.080122e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.140133e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.140133e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.999828e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.056311e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.056311e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.126985 sec - 15,885,461,434 cycles # 3.097 GHz - 44,491,325,292 instructions # 2.80 insn per cycle - 5.132808718 seconds time elapsed +TOTAL : 5.329602 sec + 16,261,566,148 cycles # 3.049 GHz + 44,485,747,751 instructions # 2.74 insn per cycle + 5.336424594 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 580) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 6.443528218283898e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.233465e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.861527e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.861527e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.927613e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.489171e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.489171e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 1.768519 sec - 5,514,620,841 cycles # 3.109 GHz - 15,833,995,859 instructions # 2.87 insn per cycle - 1.780397431 seconds time elapsed +TOTAL : 1.855943 sec + 5,655,311,427 cycles # 3.042 GHz + 15,825,122,163 instructions # 2.80 insn per cycle + 1.862537652 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2852) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 8.24528544926829e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.563962e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.456105e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.456105e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.442084e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.286668e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.286668e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 
1.472161 sec - 4,310,727,148 cycles # 2.921 GHz - 8,709,473,097 instructions # 2.02 insn per cycle - 1.483481957 seconds time elapsed +TOTAL : 1.494877 sec + 4,308,010,076 cycles # 2.875 GHz + 8,699,047,100 instructions # 2.02 insn per cycle + 1.501395488 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3300) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.777637e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.715040e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.715040e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.653978e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.555644e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.555644e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.433573 sec - 4,206,371,569 cycles # 2.925 GHz - 8,430,013,752 instructions # 2.00 insn per cycle - 1.439953997 seconds time elapsed +TOTAL : 1.456247 sec + 4,198,587,715 cycles # 2.876 GHz + 8,418,940,783 instructions # 2.01 insn per cycle + 1.462871031 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3203) (512y: 5) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.942233e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.458072e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.458072e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.777520e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.265352e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.265352e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.858685 sec - 3,800,386,936 cycles # 2.045 GHz - 6,743,751,136 instructions # 1.77 insn per cycle - 1.870417357 seconds time elapsed +TOTAL : 1.901803 sec + 3,793,058,233 cycles # 1.992 GHz + 6,732,056,557 instructions # 1.77 insn per cycle + 1.908711601 seconds time elapsed 
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2337) (512y: 12) (512z: 2190) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index 03f579fc70..a0fa7cdf34 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-06-16_23:11:17 +DATE: 2023-07-18_23:15:11 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.788614e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.673918e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.981454e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.422354e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.645287e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.966949e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.503723 sec - 2,153,847,666 cycles # 2.865 GHz - 2,679,220,576 instructions # 1.24 insn per cycle - 0.809587763 seconds time elapsed +TOTAL : 0.481868 sec + 2,073,830,742 cycles # 2.935 GHz + 2,863,733,688 instructions # 1.38 insn per cycle + 0.764204199 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,24 +58,24 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499668240547 -Relative difference = 1.920672948568199e-05 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.654314e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.753826e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.753826e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.572125e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.666571e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.666571e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.037765 sec - 12,382,941,525 cycles # 3.063 GHz - 34,720,031,809 instructions # 2.80 insn per cycle - 4.044257307 seconds time elapsed +TOTAL : 4.163180 sec + 12,652,638,378 cycles # 3.038 GHz + 34,713,568,176 instructions # 2.74 insn per cycle + 4.168589758 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 710) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 4.463890496342449e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.839094e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.391075e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.391075e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.684803e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.208747e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.208747e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 1.882962 sec - 5,789,032,021 cycles # 3.067 GHz - 13,741,089,000 instructions # 2.37 insn per cycle - 1.901362450 seconds time elapsed +TOTAL : 1.930654 sec + 5,894,451,853 cycles # 3.047 GHz + 13,730,800,746 instructions # 2.33 insn per cycle + 1.936122727 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3019) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.8327016574625664e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.215187e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.053508e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.053508e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.993820e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.025845e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.025845e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.223202 sec - 3,599,020,391 cycles # 2.929 GHz - 7,571,835,022 instructions # 2.10 insn per cycle - 1.236020404 seconds time elapsed +TOTAL : 1.250517 sec + 3,619,439,498 cycles # 2.883 GHz + 7,561,578,701 instructions # 2.09 insn per cycle + 1.255914931 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3640) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.1252420410236244e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.176325e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.050726e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.050726e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.078382e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.036724e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.036724e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.228546 sec - 3,600,171,901 cycles # 2.920 GHz - 7,138,528,819 instructions # 1.98 insn per cycle - 1.240059403 seconds time elapsed +TOTAL : 1.240246 sec + 3,576,341,649 cycles # 2.872 GHz + 7,127,496,668 instructions # 1.99 insn per cycle + 1.245884945 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3407) (512y: 1) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 1.1252420410236244e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.044103e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.781585e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.781585e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.921947e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.641857e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.641857e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 
3.414242e-03 ) GeV^0 -TOTAL : 1.574940 sec - 3,264,671,911 cycles # 2.066 GHz - 6,099,502,522 instructions # 1.87 insn per cycle - 1.581189951 seconds time elapsed +TOTAL : 1.600928 sec + 3,260,409,094 cycles # 2.031 GHz + 6,088,341,124 instructions # 1.87 insn per cycle + 1.606537939 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3531) (512y: 0) (512z: 2032) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index fc059049f2..dab893021c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-06-16_23:11:40 +DATE: 2023-07-18_23:15:33 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.829844e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.682794e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.002681e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.454682e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.671605e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.998737e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.504050 sec - 2,156,269,977 cycles # 2.889 GHz - 2,661,239,316 instructions # 1.23 insn per cycle - 0.803885518 seconds time elapsed +TOTAL : 0.480198 sec + 2,077,693,016 cycles # 2.948 GHz + 2,863,537,381 instructions # 1.38 insn per cycle + 0.762361625 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,25 +58,25 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499668240547 -Relative difference = 1.920672948568199e-05 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.707277e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.810959e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.810959e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.773245e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.883059e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.883059e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 3.961686 sec - 11,627,305,519 cycles # 2.933 GHz - 34,903,521,773 instructions # 3.00 insn per cycle - 3.967678741 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 465) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.868309 sec + 11,682,709,052 cycles # 3.018 GHz + 34,862,885,807 instructions # 2.98 insn per cycle + 3.873785737 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 464) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -89,19 +89,19 @@ Relative difference = 4.463890496342449e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.940642e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.518726e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.518726e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.632454e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.155978e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.155978e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 1.852150 sec - 5,740,254,137 cycles # 3.092 GHz - 13,368,786,182 instructions # 2.33 insn per cycle - 1.858247428 seconds time elapsed +TOTAL : 1.947933 sec + 5,813,664,106 cycles # 2.978 GHz + 13,358,473,505 instructions # 2.30 insn per cycle + 1.953458740 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2503) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 5.749220495516028e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] 
[hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.380718e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.076685e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.076685e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.243536e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.059312e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.059312e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.203884 sec - 3,531,102,752 cycles # 2.922 GHz - 7,331,328,779 instructions # 2.08 insn per cycle - 1.215736492 seconds time elapsed +TOTAL : 1.219381 sec + 3,526,273,182 cycles # 2.882 GHz + 7,320,880,885 instructions # 2.08 insn per cycle + 1.224848096 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2935) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.0167922688887485e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.449213e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.085295e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.085295e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.262178e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.066026e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.066026e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.195397 sec - 3,507,221,220 cycles # 2.922 GHz - 6,966,297,573 instructions # 1.99 insn per cycle - 1.207630553 seconds time elapsed +TOTAL : 1.219224 sec + 3,497,432,036 cycles # 2.859 GHz + 6,955,863,876 instructions # 1.99 insn per cycle + 1.224609988 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2744) (512y: 1) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 1.0167922688887485e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] 
OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.302201e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.102178e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.102178e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.931601e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.665347e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.665347e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.522441 sec - 3,161,449,900 cycles # 2.070 GHz - 5,927,790,892 instructions # 1.88 insn per cycle - 1.534090530 seconds time elapsed +TOTAL : 1.598191 sec + 3,150,397,894 cycles # 1.966 GHz + 5,916,588,298 instructions # 1.88 insn per cycle + 1.603813998 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2811) (512y: 0) (512z: 1595) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 2eb3f90d4a..fe0561b233 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-06-16_22:52:49 +DATE: 2023-07-18_22:41:41 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.883163e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.153438e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.268351e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.392536e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158109e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275734e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.546765 sec - 2,277,271,038 cycles # 2.894 GHz - 2,877,502,583 instructions # 1.26 insn per cycle - 0.845241312 seconds time elapsed +TOTAL : 0.531194 sec + 2,261,044,639 cycles # 2.950 GHz + 3,130,191,732 instructions # 1.38 insn per cycle + 0.834655474 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 3.241686432649386e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = 
SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.947985e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.010292e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.010292e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.870049e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.918386e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.918386e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.490788 sec - 16,975,144,546 cycles # 3.089 GHz - 45,688,141,777 instructions # 2.69 insn per cycle - 5.496744556 seconds time elapsed +TOTAL : 5.713565 sec + 17,359,831,528 cycles # 3.036 GHz + 45,571,194,182 instructions # 2.63 insn per cycle + 5.720648230 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.550845e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.778152e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.778152e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.398273e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.571667e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.571667e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.061992 sec - 9,396,473,786 cycles # 3.065 GHz - 26,383,157,357 instructions # 2.81 insn per cycle - 3.078372883 seconds time elapsed +TOTAL : 3.193762 sec + 9,453,328,575 cycles # 2.956 GHz + 26,269,627,906 instructions # 2.78 insn per cycle + 3.200386787 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2530) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': 
AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.169685e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.889662e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.889662e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.201835e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.775338e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.775338e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.808628 sec - 5,256,027,628 cycles # 2.899 GHz - 11,191,741,528 instructions # 2.13 insn per cycle - 1.814736091 seconds time elapsed +TOTAL : 1.798271 sec + 5,177,674,203 cycles # 2.875 GHz + 11,076,621,451 instructions # 2.14 insn per cycle + 1.805172074 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.822007e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.696291e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.696291e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.833335e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.536038e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.536038e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.643994 sec - 4,833,892,867 cycles # 2.931 GHz - 10,629,174,604 instructions # 2.20 insn per cycle - 1.660177459 seconds time elapsed +TOTAL : 1.642692 sec + 4,730,582,091 cycles # 2.874 GHz + 10,512,103,131 instructions # 2.22 insn per cycle + 1.649610489 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2214) (512y: 86) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe @@ -167,20 +167,20 @@ Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.215185e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.538766e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.538766e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.155965e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 4.407271e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.407271e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.596408 sec - 5,235,856,295 cycles # 2.013 GHz - 6,967,015,570 instructions # 1.33 insn per cycle - 2.609031673 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1322) (512y: 98) (512z: 1681) +TOTAL : 2.632441 sec + 5,100,977,173 cycles # 1.935 GHz + 6,869,699,327 instructions # 1.35 insn per cycle + 2.639569883 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1331) (512y: 99) (512z: 1680) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index 2922f1f91d..1d55c3f17d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-06-16_22:53:16 +DATE: 2023-07-18_22:42:07 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.913817e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.163142e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.279616e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.591732e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.163988e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.278377e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.546816 sec - 2,259,525,710 cycles # 2.872 GHz - 2,874,198,806 instructions # 1.27 insn per cycle - 0.845865418 seconds time elapsed +TOTAL : 0.525972 sec + 2,258,252,017 cycles # 2.941 GHz + 3,161,692,710 instructions # 1.40 insn per cycle + 0.831314902 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 3.241686432649386e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow 
summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.980085e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.044880e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.044880e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.918606e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.968345e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.968345e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.404304 sec - 16,542,848,858 cycles # 3.058 GHz - 44,663,688,353 instructions # 2.70 insn per cycle - 5.410750967 seconds time elapsed +TOTAL : 5.572154 sec + 16,905,818,500 cycles # 3.032 GHz + 44,546,391,043 instructions # 2.63 insn per cycle + 5.579066388 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 574) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe @@ -89,20 +89,20 @@ Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.631536e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.869634e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.869634e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.521745e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.706976e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.706976e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.995465 sec - 8,977,674,629 cycles # 2.992 GHz - 25,016,671,404 instructions # 2.79 insn per cycle - 3.009020680 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2371) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.085524 sec + 9,099,594,779 cycles # 2.946 GHz + 24,902,805,370 instructions # 2.74 insn per cycle + 3.092491354 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2369) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -115,19 +115,19 @@ Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.387090e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.916862e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.916862e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.391961e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.824695e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.824695e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.054054 sec - 6,000,307,213 cycles # 2.915 GHz - 12,337,042,112 instructions # 2.06 insn per cycle - 2.066748623 seconds time elapsed +TOTAL : 2.052840 sec + 5,892,762,503 cycles # 2.865 GHz + 12,222,365,250 instructions # 2.07 insn per cycle + 2.059770395 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2523) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.658921e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.249521e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.249521e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.645945e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.116372e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.116372e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.961008 sec - 5,732,861,626 cycles # 2.917 GHz - 11,869,224,559 instructions # 2.07 insn per cycle - 1.973908984 seconds time elapsed +TOTAL : 1.964121 sec + 5,617,009,330 cycles # 2.855 GHz + 11,751,786,889 instructions # 2.09 insn per cycle + 1.971325073 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2246) (512y: 242) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.318569e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.652438e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.652438e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.250862e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.513092e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.513092e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.536785 sec - 5,170,015,216 cycles # 2.035 GHz - 7,935,725,128 instructions # 1.53 insn per cycle - 2.548762025 seconds time elapsed +TOTAL : 2.574723 sec + 5,071,782,594 cycles # 1.967 GHz + 7,819,966,245 instructions # 1.54 insn per cycle + 2.581825456 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1280) (512y: 203) (512z: 1763) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 2e037d0b10..5537ac3142 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-06-16_22:53:43 +DATE: 2023-07-18_22:42:34 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.026215e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.052133e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.065201e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.503919e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.049667e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.066496e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.487786 sec - 2,022,328,369 cycles # 2.846 GHz - 2,538,461,046 instructions # 1.26 insn per cycle - 0.768629495 seconds time elapsed +TOTAL : 0.467316 sec + 2,001,974,079 cycles # 2.930 GHz + 2,757,134,869 instructions # 1.38 insn per cycle + 0.752234585 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.088515e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.317362e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.330192e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.082976e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.320902e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.335180e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.622651 sec - 2,554,475,723 cycles # 2.940 GHz - 3,479,055,818 instructions # 1.36 insn per cycle - 0.928929641 seconds time elapsed +TOTAL : 0.608019 sec + 2,462,986,844 cycles # 2.901 GHz + 3,667,408,117 instructions # 1.49 insn per cycle + 0.906833656 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -76,19 +76,19 @@ Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.659944e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.676456e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.676456e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.514851e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.527304e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.527304e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.184495 sec - 19,272,486,733 cycles # 3.115 GHz - 59,041,427,699 instructions # 3.06 insn per cycle - 6.189768442 seconds time elapsed +TOTAL : 6.540428 sec + 19,799,689,712 cycles # 3.027 GHz + 59,004,193,140 instructions # 2.98 insn per cycle + 6.545917684 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1187) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe @@ -102,19 +102,19 @@ Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] 
[hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.959664e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.017593e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.017593e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.780759e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.825135e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.825135e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.326440 sec - 10,241,888,109 cycles # 3.075 GHz - 30,662,975,210 instructions # 2.99 insn per cycle - 3.331736567 seconds time elapsed +TOTAL : 3.452934 sec + 10,383,918,995 cycles # 3.006 GHz + 30,619,374,623 instructions # 2.95 insn per cycle + 3.457954159 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5158) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.791918e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.003008e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.003008e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.012797e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.032407e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.032407e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.697221 sec - 4,672,524,845 cycles # 2.747 GHz - 10,912,223,676 instructions # 2.34 insn per cycle - 1.702314818 seconds time elapsed +TOTAL : 1.642738 sec + 4,678,343,363 cycles # 2.846 GHz + 10,873,667,959 instructions # 2.32 insn per cycle + 1.647799642 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4166) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) 
= ( 1.140088e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.171799e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.171799e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.153352e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.178336e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.178336e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.461092 sec - 4,145,138,859 cycles # 2.829 GHz - 10,109,831,075 instructions # 2.44 insn per cycle - 1.466721195 seconds time elapsed +TOTAL : 1.446538 sec + 4,118,589,455 cycles # 2.844 GHz + 10,067,662,580 instructions # 2.44 insn per cycle + 1.452318936 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3967) (512y: 32) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.316911e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.479148e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.479148e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.072222e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.195209e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.195209e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.996366 sec - 3,886,762,322 cycles # 1.943 GHz - 5,545,319,779 instructions # 1.43 insn per cycle - 2.006628065 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1125) (512y: 59) (512z: 3431) +TOTAL : 2.055995 sec + 3,852,949,281 cycles # 1.873 GHz + 5,509,316,607 instructions # 1.43 insn per cycle + 2.061113838 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 60) (512z: 3429) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index a475d7222b..3462aa965e 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-06-16_23:20:42 +DATE: 2023-07-18_23:24:15 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.496628e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.248177e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.248177e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.810503e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.141219e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.141219e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.521129 sec - 2,135,210,830 cycles # 2.889 GHz - 2,829,931,118 instructions # 1.33 insn per cycle - 0.798733192 seconds time elapsed +TOTAL : 0.486993 sec + 2,066,943,312 cycles # 2.937 GHz + 3,000,534,395 instructions # 1.45 insn per cycle + 0.761751041 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -68,17 +68,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.419279e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.564570e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.564570e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.682258e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.444835e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.444835e+06 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.870649 sec - 3,346,648,541 cycles # 2.958 GHz - 4,943,935,079 instructions # 1.48 insn per cycle - 1.190299197 seconds time elapsed +TOTAL : 0.823911 sec + 3,195,140,467 cycles # 2.964 GHz + 4,939,449,955 instructions # 1.55 insn per cycle + 1.137578322 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -89,19 +89,19 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.646125e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.663019e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.663019e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.512317e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.524834e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.524834e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.222512 sec - 19,287,909,225 cycles # 3.099 GHz - 59,047,382,237 instructions # 3.06 insn per cycle - 6.228060526 seconds time elapsed +TOTAL : 6.551068 sec + 19,859,136,318 cycles # 3.030 GHz + 59,010,773,498 instructions # 2.97 insn per cycle + 6.555009301 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1187) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe @@ -116,19 +116,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.998676e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.058773e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.058773e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.849323e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.895894e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.895894e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.306706 sec - 10,251,708,193 cycles # 3.096 GHz - 30,706,584,426 instructions # 3.00 insn per cycle - 3.312153580 seconds time elapsed +TOTAL : 3.408439 sec + 10,416,721,509 cycles # 3.053 GHz + 30,668,561,647 instructions # 2.94 insn per cycle + 3.412905583 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5158) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe @@ -143,19 +143,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.028551e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.054465e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.054465e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.011440e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.030823e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.030823e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.624375 sec - 4,708,780,636 cycles # 2.891 GHz - 10,966,076,818 instructions # 2.33 insn per cycle - 1.629709328 seconds time elapsed +TOTAL : 1.649546 sec + 4,709,324,732 cycles # 2.849 GHz + 10,921,961,578 instructions # 2.32 insn per cycle + 1.653928822 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4166) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe @@ -170,19 +170,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.164974e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.197651e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.197651e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.150271e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.177125e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.177125e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.437942 sec - 4,180,610,973 cycles # 2.898 GHz - 10,160,732,858 instructions # 2.43 insn per cycle - 1.452787766 seconds time elapsed +TOTAL : 1.453735 sec + 4,150,109,601 cycles # 2.849 GHz + 10,114,883,487 instructions # 2.44 insn per cycle + 1.457709150 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3967) (512y: 32) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe @@ -197,20 +197,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.237498e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.399499e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.399499e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.115929e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.241926e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.241926e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.022999 sec - 3,915,207,010 cycles # 1.933 GHz - 5,583,645,604 instructions # 1.43 insn per cycle - 2.033737208 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1125) (512y: 59) (512z: 3431) +TOTAL : 2.048746 sec + 3,884,858,685 cycles # 1.895 GHz + 5,544,738,228 instructions # 1.43 insn per cycle + 2.052682001 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 60) (512z: 3429) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index c7fa7a5874..24c2bf3235 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-06-16_22:54:11 +DATE: 2023-07-18_22:43:02 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.406208e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.034522e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.051529e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.719998e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.041825e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055822e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.495761 sec - 2,045,430,788 cycles # 2.844 GHz - 2,567,034,130 instructions # 1.26 insn per cycle - 0.778564031 seconds time elapsed +TOTAL : 0.467133 sec + 1,982,839,200 cycles # 2.903 GHz + 2,732,083,404 instructions # 1.38 insn per cycle + 0.750043715 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.078800e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.305420e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.318205e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.075933e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.309859e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.323648e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.628664 sec - 2,557,626,902 cycles # 2.898 GHz - 3,428,444,966 instructions # 1.34 insn per cycle - 0.942725213 seconds time elapsed +TOTAL : 0.602656 sec + 2,465,959,222 cycles # 2.929 GHz + 3,643,124,057 instructions # 1.48 insn per cycle + 0.900913388 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -76,20 +76,20 @@ Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.633830e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.650576e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.650576e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.480213e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.492782e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.492782e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.246365 sec - 19,302,390,466 cycles # 3.090 GHz - 59,308,261,536 instructions # 3.07 insn per cycle - 6.251260293 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1309) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.631622 sec + 19,752,961,343 cycles # 2.979 GHz + 59,276,588,700 instructions # 3.00 insn per cycle + 6.637134779 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1312) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.048737e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.110388e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.110388e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.862316e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.908955e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.908955e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.268980 sec - 10,121,456,601 cycles # 3.094 GHz - 30,320,135,500 instructions # 3.00 insn per cycle - 3.274001858 seconds time elapsed +TOTAL : 3.395194 sec + 10,280,079,176 cycles # 3.026 GHz + 30,281,005,461 instructions # 2.95 insn per cycle + 3.400628119 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5009) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.955459e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.019425e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.019425e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.762243e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.944781e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.944781e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.669916 sec - 4,868,753,430 cycles # 2.909 GHz - 11,322,372,755 instructions # 2.33 insn per cycle - 1.675375031 seconds time elapsed +TOTAL : 1.702839 sec + 4,841,922,221 cycles # 2.840 GHz + 11,278,605,221 instructions # 2.33 insn per cycle + 1.708262409 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4330) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.093964e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.122647e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.122647e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.068844e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.090489e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.090489e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.521327 sec - 4,446,518,647 cycles # 2.915 GHz - 10,548,968,003 instructions # 2.37 insn per cycle - 1.526307477 seconds time elapsed +TOTAL : 1.556503 sec + 4,414,507,903 cycles # 2.832 GHz + 10,506,180,954 instructions # 2.38 insn per cycle + 1.561434477 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4044) (512y: 186) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.202771e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.370511e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.370511e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.947687e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.071742e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.071742e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.023574 sec - 3,918,082,536 cycles # 1.933 GHz - 5,775,352,534 instructions # 1.47 insn per cycle - 2.028589705 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1089) (512y: 110) (512z: 3505) +TOTAL : 2.087384 sec + 3,884,235,117 cycles # 1.859 GHz + 5,736,642,934 instructions # 1.48 insn per cycle + 2.092378399 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1094) (512y: 110) (512z: 3505) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 421224af27..e528e70741 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -35,60 +35,60 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-06-16_22:54:40 +DATE: 2023-07-18_22:43:30 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.327176e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.257598e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.374537e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.443897e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.281940e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.377474e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.473310 sec - 2,027,915,544 cycles # 2.867 GHz - 2,463,775,432 instructions # 1.21 insn per cycle - 0.764839697 seconds time elapsed +TOTAL : 0.450086 sec + 1,932,638,031 cycles # 2.910 GHz + 2,644,542,305 instructions # 1.37 insn per cycle + 0.736735331 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 249 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.391345e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.450691e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.517488e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.224449e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.399050e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.481223e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.522081 sec - 2,217,811,440 cycles # 2.881 GHz - 2,788,557,835 instructions # 1.26 insn per cycle - 0.827734782 seconds time elapsed +TOTAL : 0.496307 sec + 2,113,797,836 cycles # 2.927 GHz + 2,979,656,527 instructions # 1.41 insn per cycle + 0.780892096 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.412608e+00 -Avg ME (F77/CUDA) = 1.4132214343518683 -Relative difference = 0.0004342566032956241 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.701153e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.715579e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.715579e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.558206e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.571081e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.571081e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.095019 sec - 18,873,238,787 cycles # 3.098 GHz - 59,483,025,635 instructions # 3.15 insn per cycle - 6.100057991 seconds time elapsed +TOTAL : 6.427410 sec + 19,447,033,884 cycles # 3.025 GHz + 59,469,776,554 instructions # 3.06 insn per cycle + 6.432547562 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 970) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe @@ -102,20 +102,20 @@ Relative difference = 2.1728426918172542e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 
OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.884416e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.046889e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.046889e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.472251e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.617510e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.617510e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.866765 sec - 5,710,913,630 cycles # 3.054 GHz - 16,521,962,319 instructions # 2.89 insn per cycle - 1.876809148 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5863) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.956307 sec + 5,875,533,113 cycles # 3.002 GHz + 16,505,988,555 instructions # 2.81 insn per cycle + 1.961302040 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5864) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -128,19 +128,19 @@ Relative difference = 1.2948889545181803e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.049407e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.136391e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.136391e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.982725e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.060793e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.060793e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008858e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.820791 sec - 2,394,884,327 cycles # 2.906 GHz - 5,781,261,143 instructions # 2.41 insn per cycle - 0.825770188 seconds time elapsed +TOTAL : 0.849896 sec + 2,399,557,922 cycles # 2.819 GHz + 5,764,751,975 instructions # 2.40 insn per cycle + 0.854825256 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 2.7390098302447566e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.285711e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.389224e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.389224e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.206732e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.302653e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.302653e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008858e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.738019 sec - 2,162,282,356 cycles # 2.914 GHz - 5,351,223,903 instructions # 2.47 insn per cycle - 0.743188108 seconds time elapsed +TOTAL : 0.765407 sec + 2,156,413,717 cycles # 2.812 GHz + 5,334,290,181 instructions # 2.47 insn per cycle + 0.772175970 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4167) (512y: 25) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 2.7390098302447566e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.684448e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.741867e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.741867e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.638617e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.692125e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.692125e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.995951 sec - 1,960,707,531 cycles # 1.961 GHz - 3,020,418,486 instructions # 1.54 insn per cycle - 1.001177596 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1416) (512y: 33) (512z: 3549) +TOTAL : 1.024345 sec + 1,941,949,756 cycles # 1.893 GHz + 3,005,992,347 instructions # 1.55 insn per cycle + 1.029400462 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1425) (512y: 33) (512z: 3547) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index cc4c5da246..9a33b358ad 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-06-16_23:21:10 +DATE: 2023-07-18_23:24:43 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.989407e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.193414e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.193414e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.944597e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.100237e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.100237e+07 ) sec^-1 MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.482951 sec - 2,013,685,658 cycles # 2.866 GHz - 2,592,200,009 instructions # 1.29 insn per cycle - 0.760014140 seconds time elapsed +TOTAL : 0.457752 sec + 1,966,081,921 cycles # 2.939 GHz + 2,842,862,075 instructions # 1.45 insn per cycle + 0.725727150 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -68,40 +68,40 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.724425e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.653929e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.653929e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.737489e+02 +- 4.776370e+02 ) GeV^-2 -TOTAL : 0.663195 sec - 2,648,120,389 cycles # 2.929 GHz - 3,654,507,605 instructions # 1.38 insn per cycle - 0.962099488 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.725699e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.575204e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.575204e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 +TOTAL : 0.637779 sec + 2,573,028,941 cycles # 2.957 GHz + 3,873,609,391 instructions # 1.51 insn per cycle + 0.927196086 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.412608e+00 -Avg ME (F77/CUDA) = 1.4132214343518683 -Relative difference = 0.0004342566032956241 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.700458e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.715017e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.715017e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.572228e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.585304e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.585304e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.092117 sec - 18,886,470,035 cycles # 3.099 GHz - 59,487,095,856 instructions # 3.15 insn per cycle - 6.096869282 seconds time elapsed +TOTAL : 6.394035 sec + 19,472,549,387 cycles # 3.044 GHz + 59,473,909,839 instructions # 3.05 insn per cycle + 6.398143881 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 970) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe @@ -116,20 +116,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.823507e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.985125e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.985125e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.584034e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.734732e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.734732e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.882730 sec - 5,728,750,294 cycles # 3.037 GHz - 16,570,856,852 instructions # 2.89 insn per cycle - 1.893324475 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5863) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.933468 sec + 5,894,952,061 cycles # 3.045 GHz + 16,554,069,417 instructions # 2.81 insn per cycle + 1.937631603 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5864) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -143,19 +143,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.034411e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.118355e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.118355e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.001540e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.081012e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.081012e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008858e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.830571 sec - 2,411,689,968 cycles # 2.891 GHz - 5,818,447,773 instructions # 2.41 insn per cycle - 0.835666053 seconds time elapsed +TOTAL : 0.843109 sec + 2,417,753,828 cycles # 2.857 GHz + 5,800,856,711 instructions # 2.40 insn per cycle + 0.846901747 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe @@ -170,19 +170,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.276703e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.381223e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.381223e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.196336e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.291341e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.291341e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008858e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.750194 sec - 2,182,144,900 cycles # 2.906 GHz - 5,388,939,388 instructions # 2.47 insn per cycle - 0.755052239 seconds time elapsed +TOTAL : 0.771255 sec + 2,177,354,153 cycles # 2.811 GHz + 5,370,873,032 instructions # 2.47 insn per cycle + 0.775290977 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4167) (512y: 25) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe @@ -197,20 +197,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.697532e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.759499e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.759499e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.589791e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.641495e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.641495e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.992848 sec - 1,971,737,093 cycles # 1.979 GHz - 3,062,300,048 instructions # 1.55 insn per cycle - 0.997634050 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1416) (512y: 33) (512z: 3549) +TOTAL : 1.058213 sec + 1,967,102,783 cycles # 1.854 GHz + 3,047,015,939 instructions # 1.55 insn per cycle + 1.062417559 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1425) (512y: 33) (512z: 3547) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index adf6bfc552..b9428218ff 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -35,60 +35,60 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-06-16_22:55:04 +DATE: 2023-07-18_22:43:53 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.339749e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.271976e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.390547e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.428863e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.263490e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.366244e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.474512 sec - 1,999,862,138 cycles # 2.850 GHz - 2,440,400,706 instructions # 1.22 insn per cycle - 0.759138369 seconds time elapsed +TOTAL : 0.447829 sec + 1,946,982,573 cycles # 2.935 GHz + 2,680,932,348 instructions # 1.38 insn per cycle + 0.728844364 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.328864e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.325114e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.388368e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.216441e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.386159e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.466691e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.521920 sec - 2,232,068,560 cycles # 2.897 GHz - 2,825,671,567 instructions # 1.27 insn per cycle - 0.828183210 seconds time elapsed +TOTAL : 0.495369 sec + 2,121,287,530 cycles # 2.937 GHz + 2,989,151,728 instructions # 1.41 insn per cycle + 0.780758288 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.412608e+00 -Avg ME (F77/CUDA) = 1.4132214343518683 -Relative difference = 0.0004342566032956241 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.699454e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.714152e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.714152e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.561997e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.574840e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.574840e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.092288 sec - 18,813,707,143 cycles # 3.087 GHz - 59,245,916,196 instructions # 3.15 insn per cycle - 6.097217914 seconds time elapsed +TOTAL : 6.417399 sec + 19,389,104,365 cycles # 3.020 GHz + 59,233,313,389 instructions # 3.05 insn per cycle + 6.422046826 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1031) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe @@ -102,20 +102,20 @@ Relative difference = 2.1728426918172542e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 
OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.395747e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.576744e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.576744e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.882489e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.043796e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.043796e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.764298 sec - 5,437,573,561 cycles # 3.075 GHz - 16,318,666,941 instructions # 3.00 insn per cycle - 1.774906001 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5638) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.866886 sec + 5,621,059,731 cycles # 3.009 GHz + 16,304,017,213 instructions # 2.90 insn per cycle + 1.872024197 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5639) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -128,19 +128,19 @@ Relative difference = 1.2948889545181803e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.771528e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.833956e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.833956e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.718948e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.776130e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.776130e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008858e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.946284 sec - 2,773,608,102 cycles # 2.919 GHz - 6,345,516,906 instructions # 2.29 insn per cycle - 0.951331609 seconds time elapsed +TOTAL : 0.975130 sec + 2,776,282,140 cycles # 2.840 GHz + 6,328,900,147 instructions # 2.28 insn per cycle + 0.980407362 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5044) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 2.7390098302447566e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.898207e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.970096e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.970096e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.858619e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.926048e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.926048e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008858e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.884490 sec - 2,572,468,156 cycles # 2.896 GHz - 5,899,227,322 instructions # 2.29 insn per cycle - 0.889316146 seconds time elapsed +TOTAL : 0.902998 sec + 2,569,021,706 cycles # 2.837 GHz + 5,882,455,801 instructions # 2.29 insn per cycle + 0.908262650 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4834) (512y: 18) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 2.7390098302447566e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.504005e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.551206e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.551206e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.446292e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.489559e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.489559e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.114511 sec - 2,105,248,250 cycles # 1.885 GHz - 3,318,671,370 instructions # 1.58 insn per cycle - 1.119425864 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1702) (512y: 31) (512z: 3743) +TOTAL : 1.157234 sec + 2,094,475,571 cycles # 1.806 GHz + 3,307,191,778 instructions # 1.58 insn per cycle + 1.161991638 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1706) (512y: 31) (512z: 3743) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 509a151f9e..c7a57bbaf7 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-06-16_22:55:28 +DATE: 2023-07-18_22:44:17 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.484913e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.046842e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.063089e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.751950e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.049189e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.063151e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.491268 sec - 2,111,124,427 cycles # 2.897 GHz - 2,572,779,858 instructions # 1.22 insn per cycle - 0.786655156 seconds time elapsed +TOTAL : 0.465247 sec + 1,992,296,691 cycles # 2.929 GHz + 2,742,134,661 instructions # 1.38 insn per cycle + 0.745964144 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.087869e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.318151e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.331093e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.080664e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.316749e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.330839e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.625996 sec - 2,572,645,106 cycles # 2.943 GHz - 3,495,847,613 instructions # 1.36 insn per cycle - 0.932270031 seconds time elapsed +TOTAL : 0.609902 sec + 2,480,338,961 cycles # 2.929 GHz + 3,630,300,542 instructions # 1.46 insn per cycle + 0.907777782 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -76,19 +76,19 @@ Relative difference = 4.418889885423659e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.611473e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.627816e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.627816e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.473267e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.485439e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.485439e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.298063 sec - 19,584,640,453 cycles # 3.108 GHz - 60,128,609,602 instructions # 3.07 insn per cycle - 6.303425917 seconds time elapsed +TOTAL : 6.649315 sec + 20,090,384,636 cycles # 3.020 GHz + 60,091,853,129 instructions # 2.99 insn per cycle + 6.654276724 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1222) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe @@ -102,20 +102,20 @@ Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] 
[hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.848908e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.905341e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.905341e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.877837e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.923415e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.923415e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.403128 sec - 10,110,840,700 cycles # 2.969 GHz - 30,404,987,403 instructions # 3.01 insn per cycle - 3.408194423 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5293) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.383033 sec + 10,288,712,274 cycles # 3.041 GHz + 30,361,296,631 instructions # 2.95 insn per cycle + 3.388432916 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5291) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -128,20 +128,20 @@ Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.045472e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.071998e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.071998e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.026505e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.047006e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.047006e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.590737 sec - 4,630,470,943 cycles # 2.904 GHz - 10,870,600,906 instructions # 2.35 insn per cycle - 1.595619026 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4303) (512y: 0) (512z: 0) +TOTAL : 1.621347 sec + 4,623,658,846 cycles # 2.848 GHz + 10,831,015,634 instructions # 2.34 insn per cycle + 1.626661019 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4302) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -154,20 +154,20 @@ Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.186438e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.219581e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.219581e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.168519e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.194583e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.194583e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.404412 sec - 4,081,268,261 cycles # 2.899 GHz - 10,057,889,430 instructions # 2.46 insn per cycle - 1.409450424 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4105) (512y: 24) (512z: 0) +TOTAL : 1.426741 sec + 4,058,981,984 cycles # 2.842 GHz + 10,018,453,229 instructions # 2.47 insn per cycle + 1.432132556 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4103) (512y: 24) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.045191e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.199654e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.199654e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.857979e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.975503e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.975503e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.069671 sec - 3,996,882,474 cycles # 1.933 GHz - 5,753,481,101 instructions # 1.44 insn per cycle - 2.074584748 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1687) (512y: 81) (512z: 3506) +TOTAL : 2.112045 sec + 3,967,011,902 cycles # 1.877 GHz + 5,714,986,051 instructions # 1.44 insn per cycle + 2.117098871 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1695) (512y: 82) (512z: 3505) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index ccf71ae338..c4d3ede309 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-06-16_22:55:56 +DATE: 2023-07-18_22:44:45 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.431385e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.039088e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.054905e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.739531e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.045718e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.060032e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.491602 sec - 2,104,753,968 cycles # 2.908 GHz - 2,609,636,072 instructions # 1.24 insn per cycle - 0.781468608 seconds time elapsed +TOTAL : 0.463123 sec + 2,035,412,609 cycles # 2.951 GHz + 2,826,905,066 instructions # 1.39 insn per cycle + 0.757115522 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.077122e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.304284e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.317280e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.072296e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.305322e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.318939e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.629159 sec - 2,584,689,149 cycles # 2.926 GHz - 3,485,232,370 instructions # 1.35 insn per cycle - 0.945332117 seconds time elapsed +TOTAL : 0.601037 sec + 2,472,372,598 cycles # 2.942 GHz + 3,637,776,581 instructions # 1.47 insn per cycle + 0.899651920 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -76,19 +76,19 @@ Relative difference = 4.418889885423659e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.592801e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.608815e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.608815e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.498871e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.510973e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.510973e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.344781 sec - 19,563,789,540 cycles # 3.082 GHz - 60,327,298,777 instructions # 3.08 insn per cycle - 6.350004821 seconds time elapsed +TOTAL : 6.581447 sec + 20,078,816,266 cycles # 3.050 GHz + 60,290,196,754 instructions # 3.00 insn per cycle + 6.586924347 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1269) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe @@ -102,19 +102,19 @@ Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] 
[hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.907260e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.965434e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.965434e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.899627e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.946616e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.946616e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.362361 sec - 9,992,104,885 cycles # 2.970 GHz - 30,065,057,144 instructions # 3.01 insn per cycle - 3.367365912 seconds time elapsed +TOTAL : 3.368086 sec + 10,135,128,900 cycles # 3.007 GHz + 30,027,179,365 instructions # 2.96 insn per cycle + 3.373489630 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5113) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe @@ -128,20 +128,20 @@ Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.851728e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.008826e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.008826e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.789146e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.969687e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.969687e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.687332 sec - 4,858,147,798 cycles # 2.873 GHz - 11,292,265,821 instructions # 2.32 insn per cycle - 1.692863069 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4448) (512y: 0) (512z: 0) +TOTAL : 1.697227 sec + 4,841,194,253 cycles # 2.849 GHz + 11,245,209,770 instructions # 2.32 insn per cycle + 1.702258703 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4443) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -154,20 +154,20 @@ Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.079486e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.107998e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.107998e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.070485e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.092848e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.092848e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.542428 sec - 4,416,137,343 cycles # 2.857 GHz - 10,507,517,652 instructions # 2.38 insn per cycle - 1.556019289 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4154) (512y: 177) (512z: 0) +TOTAL : 1.555422 sec + 4,392,434,745 cycles # 2.821 GHz + 10,466,303,892 instructions # 2.38 insn per cycle + 1.560659362 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4149) (512y: 177) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.082610e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.236339e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.236339e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.842072e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.959105e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.959105e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.052702 sec - 4,019,238,803 cycles # 1.954 GHz - 5,946,674,764 instructions # 1.48 insn per cycle - 2.063384894 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1626) (512y: 130) (512z: 3560) +TOTAL : 2.115498 sec + 3,984,090,326 cycles # 1.881 GHz + 5,904,512,729 instructions # 1.48 insn per cycle + 2.120336902 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1629) (512y: 130) (512z: 3558) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 674d696ef5..29f62f0d17 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-06-16_22:56:24 +DATE: 2023-07-18_22:45:13 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.480605e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.516689e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.519182e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.486362e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.515101e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.517375e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.551619 sec - 2,292,239,292 cycles # 2.899 GHz - 3,148,001,142 instructions # 1.37 insn per cycle - 0.847995580 seconds time elapsed +TOTAL : 0.528132 sec + 2,245,590,903 cycles # 2.913 GHz + 3,327,857,187 instructions # 1.48 insn per cycle + 0.837844369 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.150203e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.184079e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.185408e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.148048e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.182455e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.183874e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.060087 sec - 10,127,877,925 cycles # 3.044 GHz - 22,666,867,696 instructions # 2.24 insn per cycle - 3.384890862 seconds time elapsed +TOTAL : 3.046993 sec + 9,895,463,238 cycles # 2.997 GHz + 20,573,508,118 instructions # 2.08 insn per cycle + 3.361731000 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -76,20 +76,20 @@ Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.003968e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.005255e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.005255e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.927904e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.928903e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.928903e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.196590 sec - 25,354,902,918 cycles # 3.092 GHz - 78,729,043,359 instructions # 3.11 insn per cycle - 8.201758404 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4807) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.518600 sec + 25,868,819,765 cycles # 3.036 GHz + 78,716,411,819 instructions # 3.04 insn per cycle + 8.523451145 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4798) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.668144e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.672261e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.672261e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.642763e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.646140e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.646140e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.481440 sec - 12,963,646,112 cycles # 2.890 GHz - 39,242,921,927 instructions # 3.03 insn per cycle - 4.486873061 seconds time elapsed +TOTAL : 4.514266 sec + 13,094,581,918 cycles # 2.900 GHz + 39,233,207,168 instructions # 3.00 insn per cycle + 4.520265687 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13100) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.660201e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.683942e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.683942e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.470283e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.487708e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.487708e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.904647 sec - 5,529,577,984 cycles # 2.897 GHz - 13,826,051,095 instructions # 2.50 insn per cycle - 1.909890252 seconds time elapsed +TOTAL : 1.948046 sec + 5,551,646,863 cycles # 2.851 GHz + 13,814,806,208 instructions # 2.49 insn per cycle + 1.953490189 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10973) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.795604e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.827220e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.827220e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.621230e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.644526e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.644526e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.693130 sec - 4,892,778,969 cycles # 2.891 GHz - 12,473,829,555 instructions # 2.55 insn per cycle - 1.698091935 seconds time elapsed +TOTAL : 1.716666 sec + 4,883,495,439 cycles # 2.843 GHz + 12,459,263,271 instructions # 2.55 insn per cycle + 1.721976811 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10670) (512y: 29) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.725507e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.744232e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.744232e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.530563e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.545549e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.545549e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.133997 sec - 4,079,803,226 cycles # 1.909 GHz - 6,360,421,828 instructions # 1.56 insn per cycle - 2.139161289 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1390) (512y: 66) (512z: 9967) +TOTAL : 2.191833 sec + 4,072,863,886 cycles # 1.858 GHz + 6,352,171,216 instructions # 1.56 insn per cycle + 2.197154031 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1400) (512y: 67) (512z: 9966) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index f6b028fd57..3c6a1dc7e6 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-06-16_23:22:07 +DATE: 2023-07-18_23:25:38 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.072253e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.474976e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.474976e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.120653e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.456740e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.456740e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.540694 sec - 2,277,802,180 cycles # 2.911 GHz - 3,167,166,199 instructions # 1.39 insn per cycle - 0.842923456 seconds time elapsed +TOTAL : 0.513909 sec + 2,200,002,847 cycles # 2.961 GHz + 3,401,705,219 instructions # 1.55 insn per cycle + 0.803499105 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -68,17 +68,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.563283e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.119139e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.119139e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.637838e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.119387e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.119387e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.373561 sec - 11,106,851,106 cycles # 3.040 GHz - 22,871,021,639 instructions # 2.06 insn per cycle - 3.711939491 seconds time elapsed +TOTAL : 3.312176 sec + 10,816,030,086 cycles # 3.007 GHz + 25,033,915,585 instructions # 2.31 insn per cycle + 3.656175148 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -89,20 +89,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.000691e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.001917e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.001917e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.930072e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.931007e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.931007e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.213921 sec - 25,358,245,526 cycles # 3.087 GHz - 78,737,811,439 instructions # 3.11 insn per cycle - 8.219246769 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4807) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.510916 sec + 25,852,391,140 cycles # 3.037 GHz + 78,719,532,972 instructions # 3.04 insn per cycle + 8.514893856 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4798) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -116,19 +116,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.672231e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.676605e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.676605e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.640619e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.644045e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.644045e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.487084 sec - 13,003,690,708 cycles # 2.899 GHz - 39,263,649,353 instructions # 3.02 insn per cycle - 4.492556901 seconds time elapsed +TOTAL : 4.518890 sec + 13,104,499,016 cycles # 2.899 GHz + 39,245,795,332 instructions # 2.99 insn per cycle + 4.522830732 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13100) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -143,19 +143,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.646145e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.670478e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.670478e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.082245e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.099128e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.099128e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.912940 sec - 5,540,461,476 cycles # 2.890 GHz - 13,840,907,410 instructions # 2.50 insn per cycle - 1.918355435 seconds time elapsed +TOTAL : 2.042668 sec + 5,583,209,757 cycles # 2.729 GHz + 13,824,560,946 instructions # 2.48 insn per cycle + 2.047130706 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10973) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -170,19 +170,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.799344e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.831037e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.831037e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.516477e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.539576e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.539576e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.689800 sec - 4,910,066,879 cycles # 2.898 GHz - 12,486,034,623 instructions # 2.54 insn per cycle - 1.695175100 seconds time elapsed +TOTAL : 1.737653 sec + 4,899,889,298 cycles # 2.815 GHz + 12,471,583,278 instructions # 2.55 insn per cycle + 1.741981812 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10670) (512y: 29) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -197,20 +197,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.596903e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.616542e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.616542e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.491439e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.505589e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.505589e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.177343 sec - 4,099,033,272 cycles # 1.880 GHz - 6,376,304,361 instructions # 1.56 insn per cycle - 2.182949238 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1390) (512y: 66) (512z: 9967) +TOTAL : 2.203756 sec + 4,082,065,510 cycles # 1.852 GHz + 6,360,646,741 instructions # 1.56 insn per cycle + 2.207995337 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1400) (512y: 67) (512z: 9966) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
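(Editorial note, not part of the patch series: the hunks in these throughput logs change only measured numbers and environment details — gcc 11.2.0 → 11.3.1, itscrd80 → itscrd90, run dates 2023-06-16 → 2023-07-18 — so the practical question is the size of each before/after delta. Below is a minimal, hypothetical Python sketch for pulling the "EvtsPerSec[MatrixElems] (3)" figures out of a tput log; the regular expression simply mirrors the line format visible in the diffs above, and the helper name matrix_elem_throughputs is invented for illustration.)

    # Hypothetical helper (not part of this patch): extract the
    # "EvtsPerSec[MatrixElems] (3)" throughput values from a tput log,
    # matching the line format shown in the hunks above.
    import re

    THROUGHPUT_RE = re.compile(
        r"EvtsPerSec\[MatrixElems\] \(3\)\s*=\s*\(\s*([0-9.eE+-]+)\s*\)\s*sec\^-1"
    )

    def matrix_elem_throughputs(log_text):
        """Return all MatrixElems throughputs (events/sec) found in log_text."""
        return [float(m.group(1)) for m in THROUGHPUT_RE.finditer(log_text)]

    # Example with the old (-) and new (+) values of one hunk above:
    old = matrix_elem_throughputs("EvtsPerSec[MatrixElems] (3) = ( 4.325114e+07 ) sec^-1")
    new = matrix_elem_throughputs("EvtsPerSec[MatrixElems] (3) = ( 4.386159e+07 ) sec^-1")
    print(f"relative change: {(new[0] - old[0]) / old[0]:+.2%}")  # -> +1.41%

(Run over a pair of full logs rather than single lines, the same helper yields one value per build variant — none/sse4/avx2/512y/512z and the CUDA runs — in file order, so old and new lists can be compared element by element.)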
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt
index 92695858f2..7086cd2ade 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt
@@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-06-16_23:33:00
+DATE: 2023-07-18_23:36:13
-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.499222e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.525335e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.527608e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.490681e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.519793e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.521970e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 0.532774 sec
- 2,217,572,873 cycles # 2.863 GHz
- 3,094,369,354 instructions # 1.40 insn per cycle
- 0.834887472 seconds time elapsed
+TOTAL : 0.510029 sec
+ 2,174,253,409 cycles # 2.909 GHz
+ 3,341,213,467 instructions # 1.54 insn per cycle
+ 0.806876610 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.135027e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.167071e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.168390e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.140176e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.174234e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.175704e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4
-TOTAL : 3.158999 sec
- 10,380,525,451 cycles # 3.035 GHz
- 21,782,548,801 instructions # 2.10 insn per cycle
- 3.477627040 seconds time elapsed
+TOTAL : 3.137252 sec
+ 10,160,608,325 cycles # 2.996 GHz
+ 23,752,172,038 instructions # 2.34 insn per cycle
+ 3.448867860 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -76,20 +76,20 @@ Relative difference = 2.837296512218831e-07
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.007018e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.008318e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.008318e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.925141e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.926080e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.926080e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 8.184199 sec
- 25,337,258,294 cycles # 3.096 GHz
- 78,731,832,013 instructions # 3.11 insn per cycle
- 8.189083155 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 4807) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 8.531262 sec
+ 25,844,397,955 cycles # 3.029 GHz
+ 78,714,919,982 instructions # 3.05 insn per cycle
+ 8.535249295 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 4798) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -102,19 +102,19 @@ Relative difference = 2.8372990776517314e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.689365e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.693788e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.693788e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.624915e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.628369e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.628369e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 4.457849 sec
- 12,962,355,153 cycles # 2.906 GHz
- 39,244,481,809 instructions # 3.03 insn per cycle
- 4.462633637 seconds time elapsed
+TOTAL : 4.535491 sec
+ 13,090,983,779 cycles # 2.885 GHz
+ 39,233,707,851 instructions # 3.00 insn per cycle
+ 4.539286898 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13100) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
@@ -128,19 +128,19 @@ Relative difference = 2.837299079287849e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.621598e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.647376e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.647376e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.408812e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.426603e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.426603e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 1.914532 sec
- 5,534,212,685 cycles # 2.886 GHz
- 13,825,715,526 instructions # 2.50 insn per cycle
- 1.920377680 seconds time elapsed
+TOTAL : 1.962726 sec
+ 5,556,000,216 cycles # 2.827 GHz
+ 13,815,069,994 instructions # 2.49 insn per cycle
+ 1.966529301 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10973) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 2.837296634927675e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.758885e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.789531e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.789531e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.569095e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.592740e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.592740e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 1.692773 sec
- 4,894,078,000 cycles # 2.886 GHz
- 12,469,587,925 instructions # 2.55 insn per cycle
- 1.697826655 seconds time elapsed
+TOTAL : 1.725721 sec
+ 4,883,644,112 cycles # 2.825 GHz
+ 12,458,591,116 instructions # 2.55 insn per cycle
+ 1.729494553 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10670) (512y: 29) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
@@ -180,20 +180,20 @@ Relative difference = 2.837296634927675e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.808849e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.828810e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.828810e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.452812e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.467478e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.467478e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 2.112703 sec
- 4,076,094,887 cycles # 1.926 GHz
- 6,358,678,796 instructions # 1.56 insn per cycle
- 2.117704881 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1390) (512y: 66) (512z: 9967)
+TOTAL : 2.212487 sec
+ 4,077,406,536 cycles # 1.841 GHz
+ 6,350,765,529 instructions # 1.56 insn per cycle
+ 2.216371467 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1400) (512y: 67) (512z: 9966)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt
index 0fd3a41abe..87192ccb2c 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt
@@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-06-16_23:29:57
+DATE: 2023-07-18_23:33:15
-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.491741e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.517860e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.520006e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.502754e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.531694e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.534771e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.531429 sec
- 2,232,387,650 cycles # 2.917 GHz
- 3,085,215,012 instructions # 1.38 insn per cycle
- 0.827733891 seconds time elapsed
+TOTAL : 0.505727 sec
+ 2,155,415,909 cycles # 2.917 GHz
+ 3,312,395,454 instructions # 1.54 insn per cycle
+ 0.798203731 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.139492e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.171728e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.173076e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.124123e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.157827e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.159261e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.104917 sec
- 10,133,121,889 cycles # 3.000 GHz
- 23,296,776,590 instructions # 2.30 insn per cycle
- 3.434357003 seconds time elapsed
+TOTAL : 3.086041 sec
+ 10,033,097,378 cycles # 3.006 GHz
+ 20,725,109,527 instructions # 2.07 insn per cycle
+ 3.397473355 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -76,20 +76,20 @@ Relative difference = 2.837296512218831e-07
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.009220e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.010486e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.010486e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.922814e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.923784e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.923784e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.175111 sec
- 25,293,924,967 cycles # 3.094 GHz
- 78,729,597,256 instructions # 3.11 insn per cycle
- 8.179990802 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 4807) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 8.540121 sec
+ 25,892,455,253 cycles # 3.032 GHz
+ 78,716,335,922 instructions # 3.04 insn per cycle
+ 8.543882564 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 4798) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -102,19 +102,19 @@ Relative difference = 2.8372990776517314e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.697722e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.701921e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.701921e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.627481e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.630749e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.630749e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.446354 sec
- 12,955,336,489 cycles # 2.913 GHz
- 39,244,878,043 instructions # 3.03 insn per cycle
- 4.451190139 seconds time elapsed
+TOTAL : 4.530972 sec
+ 13,095,277,525 cycles # 2.888 GHz
+ 39,233,630,028 instructions # 3.00 insn per cycle
+ 4.534902471 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13100) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
@@ -128,19 +128,19 @@ Relative difference = 2.837299079287849e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.711957e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.735812e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.735812e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.436913e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.453832e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.453832e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.893317 sec
- 5,534,953,514 cycles # 2.918 GHz
- 13,828,055,257 instructions # 2.50 insn per cycle
- 1.898185319 seconds time elapsed
+TOTAL : 1.953456 sec
+ 5,549,840,287 cycles # 2.837 GHz
+ 13,813,947,073 instructions # 2.49 insn per cycle
+ 1.957552217 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10973) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 2.837296634927675e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.783880e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.814289e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.814289e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.557004e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.579620e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.579620e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.686497 sec
- 4,890,139,976 cycles # 2.893 GHz
- 12,471,298,231 instructions # 2.55 insn per cycle
- 1.691636381 seconds time elapsed
+TOTAL : 1.726397 sec
+ 4,883,869,602 cycles # 2.824 GHz
+ 12,460,671,114 instructions # 2.55 insn per cycle
+ 1.730241315 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10670) (512y: 29) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
@@ -180,20 +180,20 @@ Relative difference = 2.837296634927675e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.791142e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.810133e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.810133e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.511323e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.525628e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.525628e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.116611 sec
- 4,073,012,903 cycles # 1.922 GHz
- 6,360,730,501 instructions # 1.56 insn per cycle
- 2.121591447 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1390) (512y: 66) (512z: 9967)
+TOTAL : 2.193124 sec
+ 4,069,613,051 cycles # 1.854 GHz
+ 6,350,163,242 instructions # 1.56 insn per cycle
+ 2.197268123 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1400) (512y: 67) (512z: 9966)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
index a4f7f78bc7..abafa479c3 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
@@ -35,23 +35,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-06-16_23:26:59
+DATE: 2023-07-18_23:30:22
-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP=
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.163660e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.517064e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.519281e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.239780e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.542155e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.544417e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.535969 sec
- 2,257,409,350 cycles # 2.913 GHz
- 3,148,870,456 instructions # 1.39 insn per cycle
- 0.835585235 seconds time elapsed
+TOTAL : 0.509580 sec
+ 2,192,454,861 cycles # 2.954 GHz
+ 3,384,298,240 instructions # 1.54 insn per cycle
+ 0.802945215 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -59,17 +59,17 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP=
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.637869e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.154701e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.156020e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.729630e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.165698e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.167125e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.270916 sec
- 10,746,580,301 cycles # 3.041 GHz
- 21,720,295,313 instructions # 2.02 insn per cycle
- 3.590840027 seconds time elapsed
+TOTAL : 3.209061 sec
+ 10,443,839,106 cycles # 3.019 GHz
+ 24,645,837,453 instructions # 2.36 insn per cycle
+ 3.518663731 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -79,20 +79,20 @@ Relative difference = 2.837296512218831e-07
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.007137e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.008416e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.008416e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.938188e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.939108e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.939108e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.182065 sec
- 25,337,840,293 cycles # 3.096 GHz
- 78,726,782,076 instructions # 3.11 insn per cycle
- 8.187223661 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 4807) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 8.471313 sec
+ 25,846,239,910 cycles # 3.050 GHz
+ 78,714,529,048 instructions # 3.05 insn per cycle
+ 8.475445839 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 4798) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -105,19 +105,19 @@ Relative difference = 2.8372990776517314e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.676580e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.680802e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.680802e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.654348e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.657703e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.657703e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.471527 sec
- 12,958,437,919 cycles # 2.896 GHz
- 39,245,010,758 instructions # 3.03 insn per cycle
- 4.476404809 seconds time elapsed
+TOTAL : 4.496959 sec
+ 13,076,441,166 cycles # 2.906 GHz
+ 39,231,499,959 instructions # 3.00 insn per cycle
+ 4.501151358 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13100) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
@@ -131,19 +131,19 @@ Relative difference = 2.837299079287849e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.510079e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.532022e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.532022e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.406339e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.423842e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.423842e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.938058 sec
- 5,528,373,338 cycles # 2.847 GHz
- 13,827,020,883 instructions # 2.50 insn per cycle
- 1.943269358 seconds time elapsed
+TOTAL : 1.960439 sec
+ 5,550,772,902 cycles # 2.827 GHz
+ 13,813,947,600 instructions # 2.49 insn per cycle
+ 1.964408897 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10973) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
@@ -157,19 +157,19 @@ Relative difference = 2.837296634927675e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.746577e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.776421e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.776421e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.481900e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.504023e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.504023e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.693197 sec
- 4,886,719,743 cycles # 2.881 GHz
- 12,471,368,394 instructions # 2.55 insn per cycle
- 1.698247712 seconds time elapsed
+TOTAL : 1.739600 sec
+ 4,888,239,642 cycles # 2.805 GHz
+ 12,461,742,004 instructions # 2.55 insn per cycle
+ 1.743840420 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10670) (512y: 29) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
@@ -183,20 +183,20 @@ Relative difference = 2.837296634927675e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.653939e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.671885e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.671885e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.571336e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.585205e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.585205e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.156844 sec
- 4,112,325,475 cycles # 1.906 GHz
- 6,362,849,586 instructions # 1.55 insn per cycle
- 2.161761478 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1390) (512y: 66) (512z: 9967)
+TOTAL : 2.175833 sec
+ 4,068,802,627 cycles # 1.867 GHz
+ 6,350,264,991 instructions # 1.56 insn per cycle
+ 2.179973296 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1400) (512y: 67) (512z: 9966)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
index f6164c5cd9..dbcf951307 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
@@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-06-16_22:57:00
+DATE: 2023-07-18_22:45:49
-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.505261e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.540833e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.543305e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.484189e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.512941e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.515884e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.552999 sec
- 2,261,605,886 cycles # 2.864 GHz
- 3,081,160,015 instructions # 1.36 insn per cycle
- 0.848098050 seconds time elapsed
+TOTAL : 0.525119 sec
+ 2,277,820,090 cycles # 2.954 GHz
+ 3,332,750,506 instructions # 1.46 insn per cycle
+ 0.836036844 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.144541e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.178169e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.179509e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.142184e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.176352e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.177740e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.060795 sec
- 10,085,184,977 cycles # 3.025 GHz
- 22,493,985,254 instructions # 2.23 insn per cycle
- 3.393105734 seconds time elapsed
+TOTAL : 3.035367 sec
+ 9,924,089,083 cycles # 3.018 GHz
+ 23,233,623,044 instructions # 2.34 insn per cycle
+ 3.348425246 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
@@ -76,20 +76,20 @@ Relative difference = 2.837296512218831e-07
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.940758e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.941897e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.941897e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.932718e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.933696e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.933696e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.462419 sec
- 25,229,958,045 cycles # 2.982 GHz
- 78,471,131,282 instructions # 3.11 insn per cycle
- 8.467976850 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 4138) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 8.497912 sec
+ 25,777,112,944 cycles # 3.033 GHz
+ 78,466,665,546 instructions # 3.04 insn per cycle
+ 8.502791973 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 4170) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
@@ -102,20 +102,20 @@ Relative difference = 2.8372990776517314e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.727132e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.731470e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.731470e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.597924e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.601066e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.601066e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.410452 sec
- 12,894,155,265 cycles # 2.921 GHz
- 39,184,570,946 instructions # 3.04 insn per cycle
- 4.415732132 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:12872) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.570165 sec
+ 13,100,547,026 cycles # 2.867 GHz
+ 39,170,316,646 instructions # 2.99 insn per cycle
+ 4.575595642 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:12885) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
@@ -128,19 +128,19 @@ Relative difference = 2.837299079287849e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.691843e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.716729e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.716729e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.390403e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.407788e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.407788e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.898008 sec
- 5,540,010,361 cycles # 2.913 GHz
- 13,917,256,443 instructions # 2.51 insn per cycle
- 1.903063407 seconds time elapsed
+TOTAL : 1.967187 sec
+ 5,567,550,322 cycles # 2.829 GHz
+ 13,905,495,574 instructions # 2.50 insn per cycle
+ 1.972227716 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11079) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 2.837296634927675e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.729941e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.760433e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.760433e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.298393e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.319835e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.319835e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.696750 sec
- 4,936,264,808 cycles # 2.903 GHz
- 12,569,278,770 instructions # 2.55 insn per cycle
- 1.701843841 seconds time elapsed
+TOTAL : 1.776189 sec
+ 4,927,895,040 cycles # 2.773 GHz
+ 12,557,484,324 instructions # 2.55 insn per cycle
+ 1.781515054 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10689) (512y: 180) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe
@@ -180,20 +180,20 @@ Relative difference = 2.837296634927675e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.790703e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.810947e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.810947e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.516389e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.530329e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.530329e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.116619 sec
- 4,079,630,480 cycles # 1.924 GHz
- 6,456,227,558 instructions # 1.58 insn per cycle
- 2.121736786 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1301) (512y: 170) (512z:10055)
+TOTAL : 2.195638 sec
+ 4,074,235,200 cycles # 1.856 GHz
+ 6,445,069,906 instructions # 1.58 insn per cycle
+ 2.200826665 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1302) (512y: 170) (512z:10055)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt
index 19fe3b6889..7f1889b172 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt
@@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-06-16_23:12:02
+DATE: 2023-07-18_23:15:54
-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.228389e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.252132e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.253939e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.251649e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.277334e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.279317e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.557644 sec
- 2,313,651,554 cycles # 2.916 GHz
- 3,231,773,253 instructions # 1.40 insn per cycle
- 0.852166017 seconds time elapsed
+TOTAL : 0.536329 sec
+ 2,253,929,687 cycles # 2.954 GHz
+ 3,457,143,318 instructions # 1.53 insn per cycle
+ 0.822389443 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.762297e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.789061e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.790098e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.752729e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.780873e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.782025e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.347281 sec
- 10,991,558,985 cycles # 3.045 GHz
- 24,428,456,620 instructions # 2.22 insn per cycle
- 3.667854463 seconds time elapsed
+TOTAL : 3.324142 sec
+ 10,808,418,598 cycles # 3.022 GHz
+ 25,158,541,697 instructions # 2.33 insn per cycle
+ 3.635743762 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2
@@ -76,20 +76,20 @@ Relative difference = 2.837296513854949e-07
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.470440e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.471090e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.471090e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.386990e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.387472e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.387472e+02 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 36.698448 sec
- 113,650,760,924 cycles # 3.097 GHz
- 145,145,803,355 instructions # 1.28 insn per cycle
- 36.703372680 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:21749) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 37.393795 sec
+ 113,596,849,149 cycles # 3.038 GHz
+ 145,178,851,492 instructions # 1.28 insn per cycle
+ 37.398172394 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:21790) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -102,19 +102,19 @@ Relative difference = 2.8372991823632784e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.344679e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.348354e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.348354e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.199113e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.201670e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.201670e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.914072 sec
- 14,660,206,602 cycles # 2.982 GHz
- 37,434,709,642 instructions # 2.55 insn per cycle
- 4.919337364 seconds time elapsed
+TOTAL : 5.135922 sec
+ 14,655,728,105 cycles # 2.852 GHz
+ 37,424,135,547 instructions # 2.55 insn per cycle
+ 5.140131665 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:67993) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe
@@ -128,19 +128,19 @@ Relative difference = 2.8372990661989057e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.882298e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.901243e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.901243e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.704464e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.719214e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.719214e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.091187 sec
- 6,050,971,713 cycles # 2.889 GHz
- 12,923,420,886 instructions # 2.14 insn per cycle
- 2.096398672 seconds time elapsed
+TOTAL : 2.138785 sec
+ 6,043,034,186 cycles # 2.822 GHz
+ 12,913,049,362 instructions # 2.14 insn per cycle
+ 2.142766618 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:46338) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 2.837296715097453e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.423590e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.450741e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.450741e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.425836e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.447500e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.447500e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.751970 sec
- 5,013,534,941 cycles # 2.856 GHz
- 11,332,565,506 instructions # 2.26 insn per cycle
- 1.757089218 seconds time elapsed
+TOTAL : 1.749535 sec
+ 4,997,190,169 cycles # 2.851 GHz
+ 11,318,212,652 instructions # 2.26 insn per cycle
+ 1.753835753 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40036) (512y: 188) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe
@@ -180,19 +180,19 @@ Relative difference = 2.837296715097453e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.963573e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.983731e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.983731e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.771438e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.786530e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.786530e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.070157 sec
- 3,940,063,125 cycles # 1.900 GHz
- 5,797,816,506 instructions # 1.47 insn per cycle
- 2.075319073 seconds time elapsed
+TOTAL : 2.120548 sec
+ 3,944,759,858 cycles # 1.858 GHz
+ 5,787,397,751 instructions # 1.47 insn per cycle
+ 2.124873095 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1924) (512y: 317) (512z:38936)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt
index 5e839cdef4..382af9d5f9 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt
@@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-06-16_23:13:09
+DATE: 2023-07-18_23:17:01
-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.263284e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.287600e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.289479e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.270360e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.295867e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.297816e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.554858 sec
- 2,295,580,731 cycles # 2.899 GHz
- 3,230,493,146 instructions # 1.41 insn per cycle
- 0.849020118 seconds time elapsed
+TOTAL : 0.530823 sec
+ 2,270,683,029 cycles # 2.966 GHz
+ 3,480,847,783 instructions # 1.53 insn per cycle
+ 0.823108291 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.794287e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.821112e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.822179e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.795934e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.824765e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.825972e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.303147 sec
- 10,830,294,927 cycles # 3.034 GHz
- 23,889,615,566 instructions # 2.21 insn per cycle
- 3.626334184 seconds time elapsed
+TOTAL : 3.278040 sec
+ 10,535,389,599 cycles # 2.985 GHz
+ 22,867,797,759 instructions # 2.17 insn per cycle
+ 3.589361332 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2
@@ -76,20 +76,20 @@ Relative difference = 2.837296513854949e-07
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.442294e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.442931e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.442931e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.327519e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.327978e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.327978e+02 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 36.931797 sec
- 114,071,883,560 cycles # 3.089 GHz
- 145,758,435,006 instructions # 1.28 insn per cycle
- 36.937096359 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:22580) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 37.907398 sec
+ 114,467,345,138 cycles # 3.020 GHz
+ 145,678,466,655 instructions # 1.27 insn per cycle
+ 37.911583534 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:22539) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
@@ -102,19 +102,19 @@ Relative difference = 2.83729918072716e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.286130e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.289440e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.289440e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.113098e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.115510e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.115510e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 5.001270 sec
- 14,944,293,116 cycles # 2.986 GHz
- 37,593,481,677 instructions # 2.52 insn per cycle
- 5.006579543 seconds time elapsed
+TOTAL : 5.277417 sec
+ 15,032,371,179 cycles # 2.847 GHz
+ 37,583,654,408 instructions # 2.50 insn per cycle
+ 5.281735273 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:68265) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe
@@ -128,19 +128,19 @@ Relative difference = 2.8372990661989057e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.014357e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.033871e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.033871e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.856730e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.870923e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.870923e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.057405 sec
- 5,959,840,539 cycles # 2.891 GHz
- 12,824,533,137 instructions # 2.15 insn per cycle
- 2.062870435 seconds time elapsed
+TOTAL : 2.096869 sec
+ 5,955,250,625 cycles # 2.836 GHz
+ 12,810,330,644 instructions # 2.15 insn per cycle
+ 2.101131538 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:45687) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 2.8372967134613354e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.500841e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.528151e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.528151e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.109775e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.130403e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.130403e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.743665 sec - 5,035,330,638 cycles # 2.888 GHz - 11,343,590,938 instructions # 2.25 insn per cycle - 1.748819840 seconds time elapsed +TOTAL : 1.809492 sec + 5,019,187,943 cycles # 2.769 GHz + 11,330,622,254 instructions # 2.26 insn per cycle + 1.813662613 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:39850) (512y: 138) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe @@ -180,19 +180,19 @@ Relative difference = 2.8372967134613354e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.021482e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.042248e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.042248e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.681987e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.696729e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.696729e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.055496 sec - 3,937,439,973 cycles # 1.912 GHz - 5,773,024,249 instructions # 1.47 insn per cycle - 2.060744461 seconds time elapsed +TOTAL : 2.144494 sec + 3,933,032,947 cycles # 1.833 GHz + 5,761,298,930 instructions # 1.46 insn per cycle + 2.148708938 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1589) (512y: 251) (512z:38642) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 0b7bc69d3c..1795f8ff40 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -35,61 +35,61 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-06-16_22:57:36 +DATE: 2023-07-18_22:46:24 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.285069e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.329665e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.335265e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.328648e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.379805e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.385954e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.508351 sec - 2,077,856,827 cycles # 2.833 GHz - 2,725,078,892 instructions # 1.31 insn per cycle - 0.790826956 seconds time elapsed +TOTAL : 0.481770 sec + 2,063,713,574 cycles # 2.942 GHz + 2,968,264,700 instructions # 1.44 insn per cycle + 0.786196143 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.539433e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.615009e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.618092e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.555201e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.627905e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.631075e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.761585 sec - 5,789,825,690 cycles # 2.887 GHz - 11,967,255,873 instructions # 2.07 insn per cycle - 2.065101803 seconds time elapsed +TOTAL : 1.726449 sec + 5,877,053,961 cycles # 3.004 GHz + 12,562,645,632 instructions # 2.14 insn per cycle + 2.015090243 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262662035525971E-004 -Relative difference = 2.8340413651595734e-05 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.965658e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.966651e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.966651e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.991990e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.992959e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.992959e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.353642 sec - 24,461,782,795 cycles # 2.928 GHz - 78,146,447,581 instructions # 3.19 insn per cycle - 8.359034707 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3563) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.244032 sec + 25,060,848,142 cycles # 3.040 GHz + 78,146,011,721 instructions # 3.12 insn per cycle + 8.249060073 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3567) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 4.998523613136231e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.592457e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.607446e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.607446e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.168098e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.181458e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.181458e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 2.176903 sec - 6,304,553,002 cycles # 2.897 GHz - 20,090,800,645 instructions # 3.19 insn per cycle - 2.181656558 seconds time elapsed +TOTAL : 2.299749 sec + 6,520,623,214 cycles # 2.835 GHz + 20,082,651,077 instructions # 3.08 insn per cycle + 2.305297557 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 8.545443743731147e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.706358e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.713844e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.713844e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.669486e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.676783e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.676783e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.970457 sec - 2,815,185,653 cycles # 2.890 GHz - 7,033,969,834 instructions # 2.50 insn per cycle - 0.975199568 seconds time elapsed +TOTAL : 0.992775 sec + 2,820,421,702 cycles # 2.838 GHz + 7,025,142,998 instructions # 2.49 insn per cycle + 0.997552890 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11257) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 1.0552292094680926e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.944671e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.953973e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.953973e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.918377e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.927775e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.927775e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.852685 sec - 2,474,785,553 cycles # 2.891 GHz - 6,331,079,508 instructions # 2.56 insn per cycle - 0.857424788 seconds time elapsed +TOTAL : 0.865223 sec + 2,468,464,156 cycles # 2.849 GHz + 6,321,993,471 instructions # 2.56 insn per cycle + 0.870455783 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10915) (512y: 32) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 1.0552292094680926e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.497348e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.503099e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.503099e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.476790e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.482202e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.482202e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.105154 sec - 2,042,054,263 cycles # 1.843 GHz - 3,246,636,997 instructions # 1.59 insn per cycle - 1.110173718 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1682) (512y: 40) (512z:10085) +TOTAL : 1.121528 sec + 2,033,493,855 cycles # 1.813 GHz + 3,238,669,184 instructions # 1.59 insn per cycle + 1.130214849 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1692) (512y: 40) (512z:10084) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index f56715ce11..a1b4674f19 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-06-16_23:22:43 +DATE: 2023-07-18_23:26:14 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.645163e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.314203e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.314203e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.625816e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.338657e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.338657e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.495037 sec - 2,044,288,346 cycles # 2.860 GHz - 2,702,609,067 instructions # 1.32 insn per cycle - 0.772230608 seconds time elapsed +TOTAL : 0.466248 sec + 2,031,849,388 cycles # 2.955 GHz + 2,961,771,125 instructions # 1.46 insn per cycle + 0.744065339 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -68,41 +68,41 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.239966e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.484650e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.484650e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641709e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.924795 sec - 6,601,631,800 cycles # 3.040 GHz - 13,123,912,596 instructions # 1.99 insn per cycle - 2.228980169 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.228267e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.484145e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.484145e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 +TOTAL : 1.904403 sec + 6,415,900,458 cycles # 2.985 GHz + 12,491,083,247 instructions # 1.95 insn per cycle + 2.207825533 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262662035525971E-004 -Relative difference = 2.8340413651595734e-05 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.043992e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.045065e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.045065e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.988726e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.989680e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.989680e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.036711 sec - 24,503,345,133 cycles # 3.048 GHz - 78,146,893,496 instructions # 3.19 insn per cycle - 8.041734648 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3563) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.257291 sec + 25,075,343,992 cycles # 3.036 GHz + 78,149,648,975 instructions # 3.12 insn per cycle + 8.261339290 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3567) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -116,19 +116,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.476480e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.491910e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.491910e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.234102e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.248233e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.248233e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 2.205721 sec - 6,317,208,814 cycles # 2.860 GHz - 20,099,654,048 instructions # 3.18 insn per cycle - 2.210968010 seconds time elapsed +TOTAL : 2.277702 sec + 6,528,357,276 cycles # 2.862 GHz + 20,090,895,428 instructions # 3.08 insn per cycle + 2.281625361 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -143,19 +143,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.696185e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.703554e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.703554e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.681618e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.688734e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.688734e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.978597 sec - 2,817,888,819 cycles # 2.869 GHz - 7,044,186,426 instructions # 2.50 insn per cycle - 0.983797034 seconds time elapsed +TOTAL : 0.985910 sec + 2,820,830,278 cycles # 2.853 GHz + 7,034,559,684 instructions # 2.49 insn per cycle + 0.989751099 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11257) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -170,19 +170,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.943265e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.953322e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.953322e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.906724e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.915957e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.915957e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.855535 sec - 2,477,342,849 cycles # 2.883 GHz - 6,340,941,160 instructions # 2.56 insn per cycle - 0.860485836 seconds time elapsed +TOTAL : 0.870831 sec + 2,474,414,540 cycles # 2.833 GHz + 6,331,645,863 instructions # 2.56 insn per cycle + 0.875020673 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10915) (512y: 32) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -197,20 +197,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.553687e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.559969e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.559969e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.527095e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.532953e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.532953e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.067416 sec - 2,044,224,175 cycles # 1.909 GHz - 3,257,025,834 instructions # 1.59 insn per cycle - 1.072360210 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1682) (512y: 40) (512z:10085) +TOTAL : 1.084935 sec + 2,044,118,908 cycles # 1.878 GHz + 3,248,151,332 instructions # 1.59 insn per cycle + 1.089089621 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1692) (512y: 40) (512z:10084) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index f758b117fb..719a82996d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -35,61 +35,61 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-06-16_23:33:36 +DATE: 2023-07-18_23:36:49 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.328620e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.382342e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.387489e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.309200e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.362073e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.368445e+05 ) sec^-1 MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 -TOTAL : 0.490468 sec - 2,053,312,645 cycles # 2.896 GHz - 2,688,522,220 instructions # 1.31 insn per cycle - 0.768623655 seconds time elapsed +TOTAL : 0.465657 sec + 2,008,960,497 cycles # 2.915 GHz + 2,922,930,156 instructions # 1.45 insn per cycle + 0.746661542 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.549643e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.618843e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.621847e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.564625e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.637906e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.641309e+05 ) sec^-1 MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 -TOTAL : 1.827390 sec - 6,222,758,821 cycles # 3.019 GHz - 12,654,178,829 instructions # 2.03 insn per cycle - 2.131020283 seconds time elapsed +TOTAL : 1.801689 sec + 6,057,350,087 cycles # 2.985 GHz + 12,604,320,980 instructions # 2.08 insn per cycle + 2.085381680 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262662035525971E-004 -Relative difference = 2.8340413651595734e-05 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.065245e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.066355e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.066355e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.980035e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.981007e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.981007e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 7.951720 sec - 24,455,995,614 cycles # 3.075 GHz - 78,143,504,623 instructions # 3.20 insn per cycle - 7.956774266 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3563) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.292562 sec + 25,056,318,775 cycles # 3.021 GHz + 78,146,905,788 instructions # 3.12 insn per cycle + 8.296157417 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3567) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 4.998523613136231e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.603380e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.618762e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.618762e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.078241e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.091572e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.091572e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 2.174453 sec - 6,308,707,547 cycles # 2.902 GHz - 20,091,203,739 instructions # 3.18 insn per cycle - 2.179216274 seconds time elapsed +TOTAL : 2.326411 sec + 6,529,397,700 cycles # 2.804 GHz + 20,082,220,128 instructions # 3.08 insn per cycle + 2.330014631 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 8.545443743731147e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.709770e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.717220e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.717220e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.661437e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.668659e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.668659e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.969715 sec - 2,816,258,030 cycles # 2.894 GHz - 7,033,592,275 instructions # 2.50 insn per cycle - 0.974263363 seconds time elapsed +TOTAL : 0.996754 sec + 2,817,743,008 cycles # 2.819 GHz + 7,024,061,229 instructions # 2.49 insn per cycle + 1.000375913 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11257) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 1.0552292094680926e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.920768e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.930607e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.930607e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.828370e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.836790e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.836790e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.864540 sec - 2,472,809,203 cycles # 2.851 GHz - 6,329,435,201 instructions # 2.56 insn per cycle - 0.869035951 seconds time elapsed +TOTAL : 0.906892 sec + 2,470,317,560 cycles # 2.714 GHz + 6,320,014,190 instructions # 2.56 insn per cycle + 0.910836635 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10915) (512y: 32) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 1.0552292094680926e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.537958e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.544260e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.544260e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.501715e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.507361e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.507361e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214982e-01 +- 3.255524e-01 ) GeV^-4 -TOTAL : 1.077016 sec - 2,042,333,122 cycles # 1.891 GHz - 3,245,054,769 instructions # 1.59 insn per cycle - 1.081439983 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1682) (512y: 40) (512z:10085) +TOTAL : 1.102043 sec + 2,037,558,095 cycles # 1.844 GHz + 3,235,905,404 instructions # 1.59 insn per cycle + 1.106015383 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1692) (512y: 40) (512z:10084) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index 34a43cea0a..c704840cf9 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -35,61 +35,61 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-06-16_23:30:33 +DATE: 2023-07-18_23:33:51 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.323907e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.378161e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.383334e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.328531e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.379355e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.385023e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.487701 sec - 2,049,188,758 cycles # 2.905 GHz - 2,686,122,587 instructions # 1.31 insn per cycle - 0.764918156 seconds time elapsed +TOTAL : 0.462858 sec + 1,980,722,765 cycles # 2.938 GHz + 2,892,819,621 instructions # 1.46 insn per cycle + 0.731290269 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.548245e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.617363e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.620322e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.569950e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.641943e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.645139e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.774012 sec - 6,089,970,636 cycles # 3.032 GHz - 11,896,059,014 instructions # 1.95 insn per cycle - 2.066787545 seconds time elapsed +TOTAL : 1.750978 sec + 5,918,608,993 cycles # 2.996 GHz + 12,162,380,865 instructions # 2.05 insn per cycle + 2.034792845 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262662035525971E-004 -Relative difference = 2.8340413651595734e-05 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.064063e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.065180e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.065180e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.985988e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.986970e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.986970e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.960763 sec - 24,464,649,749 cycles # 3.074 GHz - 78,145,698,591 instructions # 3.19 insn per cycle - 7.965669739 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3563) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.266136 sec + 25,080,094,182 cycles # 3.033 GHz + 78,145,984,417 instructions # 3.12 insn per cycle + 8.270023006 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3567) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 4.998523613136231e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.640949e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.656696e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.656696e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.949211e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.961352e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.961352e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 2.155286 sec
- 6,306,014,277 cycles # 2.921 GHz
- 20,090,085,412 instructions # 3.19 insn per cycle
- 2.160045070 seconds time elapsed
+TOTAL : 2.367907 sec
+ 6,521,570,008 cycles # 2.751 GHz
+ 20,082,135,071 instructions # 3.08 insn per cycle
+ 2.371862221 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13755) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -128,19 +128,19 @@ Relative difference = 8.545443743731147e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.710584e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.718001e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.718001e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.675991e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.682969e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.682969e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.967745 sec
- 2,810,549,219 cycles # 2.896 GHz
- 7,034,117,328 instructions # 2.50 insn per cycle
- 0.972224772 seconds time elapsed
+TOTAL : 0.986561 sec
+ 2,813,138,713 cycles # 2.843 GHz
+ 7,024,839,984 instructions # 2.50 insn per cycle
+ 0.990202328 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11257) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 1.0552292094680926e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.945349e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.955042e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.955042e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.895238e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.904313e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.904313e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.852055 sec
- 2,468,399,437 cycles # 2.884 GHz
- 6,331,090,642 instructions # 2.56 insn per cycle
- 0.856970624 seconds time elapsed
+TOTAL : 0.873329 sec
+ 2,465,162,367 cycles # 2.812 GHz
+ 6,321,674,271 instructions # 2.56 insn per cycle
+ 0.877369473 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10915) (512y: 32) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -180,20 +180,20 @@ Relative difference = 1.0552292094680926e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.543083e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.549272e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.549272e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.520633e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.526563e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.526563e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.072075 sec
- 2,041,169,827 cycles # 1.898 GHz
- 3,247,254,694 instructions # 1.59 insn per cycle
- 1.077160788 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1682) (512y: 40) (512z:10085)
+TOTAL : 1.086574 sec
+ 2,034,950,607 cycles # 1.867 GHz
+ 3,237,826,564 instructions # 1.59 insn per cycle
+ 1.090692761 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1692) (512y: 40) (512z:10084)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
index 513c5a26b1..e4746ed953 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
@@ -35,23 +35,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-06-16_23:27:35
+DATE: 2023-07-18_23:30:58
-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP=
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.708420e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.347094e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.352007e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.747461e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.395267e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.400893e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4
-TOTAL : 0.490976 sec
- 2,048,446,840 cycles # 2.889 GHz
- 2,674,620,255 instructions # 1.31 insn per cycle
- 0.767937575 seconds time elapsed
+TOTAL : 0.467603 sec
+ 2,018,915,063 cycles # 2.937 GHz
+ 2,900,155,764 instructions # 1.44 insn per cycle
+ 0.746177156 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -59,40 +59,40 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP=
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.421166e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.615921e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.619172e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.641709e+00 +- 4.994249e+00 ) GeV^-4
-TOTAL : 1.859235 sec
- 6,236,141,578 cycles # 2.977 GHz
- 12,723,940,591 instructions # 2.04 insn per cycle
- 2.154378517 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 7.411123e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.627941e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.631179e+05 ) sec^-1
+MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4
+TOTAL : 1.838772 sec
+ 6,138,854,502 cycles # 2.968 GHz
+ 13,085,152,978 instructions # 2.13 insn per cycle
+ 2.127958322 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 6.626454e-04
-Avg ME (F77/CUDA) = 6.6262662035525971E-004
-Relative difference = 2.8340413651595734e-05
+Avg ME (F77/CUDA) = 6.6262659968156085E-004
+Relative difference = 2.8371612387547027e-05
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.071302e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.072413e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.072413e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.967533e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.968499e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.968499e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 7.929179 sec
- 24,461,908,171 cycles # 3.086 GHz
- 78,144,542,835 instructions # 3.19 insn per cycle
- 7.934172762 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 3563) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 8.343835 sec
+ 25,087,465,619 cycles # 3.007 GHz
+ 78,146,272,502 instructions # 3.11 insn per cycle
+ 8.347778703 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 3567) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -105,19 +105,19 @@ Relative difference = 4.998523613136231e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.571029e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.586100e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.586100e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.178005e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.191017e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.191017e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 2.174925 sec
- 6,301,926,717 cycles # 2.893 GHz
- 20,090,235,453 instructions # 3.19 insn per cycle
- 2.179846514 seconds time elapsed
+TOTAL : 2.292891 sec
+ 6,523,692,358 cycles # 2.841 GHz
+ 20,081,837,659 instructions # 3.08 insn per cycle
+ 2.296967736 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13755) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -131,19 +131,19 @@ Relative difference = 8.545443743731147e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.701876e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.709090e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.709090e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.656324e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.663221e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.663221e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.972776 sec
- 2,805,674,310 cycles # 2.875 GHz
- 7,034,136,735 instructions # 2.51 insn per cycle
- 0.977241135 seconds time elapsed
+TOTAL : 0.998170 sec
+ 2,815,673,342 cycles # 2.813 GHz
+ 7,024,525,707 instructions # 2.49 insn per cycle
+ 1.001880395 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11257) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -157,19 +157,19 @@ Relative difference = 1.0552292094680926e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.937974e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.948232e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.948232e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.902878e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.912112e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.912112e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.855178 sec
- 2,469,042,209 cycles # 2.877 GHz
- 6,331,047,669 instructions # 2.56 insn per cycle
- 0.859915572 seconds time elapsed
+TOTAL : 0.869920 sec
+ 2,463,531,596 cycles # 2.822 GHz
+ 6,321,449,141 instructions # 2.57 insn per cycle
+ 0.873633029 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10915) (512y: 32) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -183,20 +183,20 @@ Relative difference = 1.0552292094680926e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.553740e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.559951e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.559951e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.485973e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.491473e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.491473e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.064606 sec
- 2,037,786,255 cycles # 1.909 GHz
- 3,246,591,583 instructions # 1.59 insn per cycle
- 1.069170450 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1682) (512y: 40) (512z:10085)
+TOTAL : 1.111691 sec
+ 2,049,922,439 cycles # 1.839 GHz
+ 3,237,844,553 instructions # 1.58 insn per cycle
+ 1.115320688 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1692) (512y: 40) (512z:10084)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
index 24986b526d..e664b8e4bd 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
@@ -35,61 +35,61 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-06-16_22:58:06
+DATE: 2023-07-18_22:46:53
-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.278810e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.338048e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.343904e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.361034e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.414963e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.420865e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.508627 sec
- 2,112,782,804 cycles # 2.878 GHz
- 2,766,936,523 instructions # 1.31 insn per cycle
- 0.791513495 seconds time elapsed
+TOTAL : 0.480571 sec
+ 2,065,234,577 cycles # 2.945 GHz
+ 2,977,941,339 instructions # 1.44 insn per cycle
+ 0.778873111 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.568420e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.630751e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.634459e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.540141e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.613831e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.617005e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.749439 sec
- 6,009,356,386 cycles # 3.024 GHz
- 12,104,468,348 instructions # 2.01 insn per cycle
- 2.046648448 seconds time elapsed
+TOTAL : 1.721792 sec
+ 5,879,791,948 cycles # 3.006 GHz
+ 12,366,829,498 instructions # 2.10 insn per cycle
+ 2.012637973 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 6.626454e-04
-Avg ME (F77/CUDA) = 6.6262662035525971E-004
-Relative difference = 2.8340413651595734e-05
+Avg ME (F77/CUDA) = 6.6262659968156085E-004
+Relative difference = 2.8371612387547027e-05
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.095981e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.097105e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.097105e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.996506e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.997490e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.997490e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 7.835166 sec
- 24,337,509,935 cycles # 3.105 GHz
- 77,896,565,911 instructions # 3.20 insn per cycle
- 7.840097785 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 3071) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 8.224439 sec
+ 24,894,773,723 cycles # 3.027 GHz
+ 77,890,953,353 instructions # 3.13 insn per cycle
+ 8.229244629 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 3075) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
@@ -102,19 +102,19 @@ Relative difference = 5.65798569465384e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.647482e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.663046e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.663046e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.351207e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.365129e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.365129e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 2.153739 sec
- 6,249,075,693 cycles # 2.898 GHz
- 20,045,520,110 instructions # 3.21 insn per cycle
- 2.158442622 seconds time elapsed
+TOTAL : 2.241696 sec
+ 6,482,790,377 cycles # 2.890 GHz
+ 20,037,202,220 instructions # 3.09 insn per cycle
+ 2.246950151 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13454) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe
@@ -128,19 +128,19 @@ Relative difference = 8.454838403082277e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.690515e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.698040e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.698040e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.632955e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.639361e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.639361e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.979063 sec
- 2,866,637,980 cycles # 2.916 GHz
- 7,146,127,186 instructions # 2.49 insn per cycle
- 0.984190288 seconds time elapsed
+TOTAL : 1.014405 sec
+ 2,879,568,161 cycles # 2.836 GHz
+ 7,137,270,355 instructions # 2.48 insn per cycle
+ 1.019344865 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11820) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 1.0602318832827381e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.866768e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.875905e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.875905e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.850016e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.858529e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.858529e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.887507 sec
- 2,573,527,962 cycles # 2.887 GHz
- 6,441,681,549 instructions # 2.50 insn per cycle
- 0.892352069 seconds time elapsed
+TOTAL : 0.895799 sec
+ 2,560,614,550 cycles # 2.854 GHz
+ 6,432,788,868 instructions # 2.51 insn per cycle
+ 0.900682497 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11516) (512y: 24) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe
@@ -180,20 +180,20 @@ Relative difference = 1.0602318832827381e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.528254e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.534227e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.534227e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.413332e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.418459e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.418459e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.082245 sec
- 2,091,652,812 cycles # 1.925 GHz
- 3,367,447,588 instructions # 1.61 insn per cycle
- 1.087515058 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2221) (512y: 32) (512z:10146)
+TOTAL : 1.170694 sec
+ 2,089,713,145 cycles # 1.783 GHz
+ 3,358,950,774 instructions # 1.61 insn per cycle
+ 1.175989368 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2222) (512y: 32) (512z:10133)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
index 2114debcba..cfa67fb72a 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
@@ -35,61 +35,61 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index 2114debcba..cfa67fb72a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -35,61 +35,61 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-06-16_23:14:17 +DATE: 2023-07-18_23:18:10 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.564508e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.602710e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.606907e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.555562e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.595852e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.600337e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.511467 sec - 2,169,852,681 cycles # 2.909 GHz - 2,868,849,541 instructions # 1.32 insn per cycle - 0.803186385 seconds time elapsed +TOTAL : 0.489898 sec + 2,117,871,861 cycles # 2.944 GHz + 3,083,406,149 instructions # 1.46 insn per cycle + 0.779309462 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.693782e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.750237e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.752633e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.706941e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.767146e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.769793e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.885046 sec
- 6,430,536,190 cycles # 3.024 GHz
- 13,410,448,299 instructions # 2.09 insn per cycle
- 2.186495562 seconds time elapsed
+TOTAL : 1.859952 sec
+ 6,265,344,549 cycles # 2.990 GHz
+ 13,643,028,015 instructions # 2.18 insn per cycle
+ 2.154934603 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 6.626454e-04
-Avg ME (F77/CUDA) = 6.6262662649554244E-004
-Relative difference = 2.833114733400458e-05
+Avg ME (F77/CUDA) = 6.6262660579844562E-004
+Relative difference = 2.836238137986709e-05
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.896574e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.897472e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.897472e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.751534e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.752342e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.752342e+02 ) sec^-1
 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4
-TOTAL : 27.823875 sec
- 86,121,024,305 cycles # 3.095 GHz
- 136,130,940,684 instructions # 1.58 insn per cycle
- 27.829007329 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:15932) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 28.523012 sec
+ 86,401,262,633 cycles # 3.029 GHz
+ 136,144,748,047 instructions # 1.58 insn per cycle
+ 28.527089944 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:15917) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -102,19 +102,19 @@ Relative difference = 4.9411338183416744e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.179629e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.192771e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.192771e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.937203e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.949756e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.949756e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4
-TOTAL : 2.295614 sec
- 6,686,096,543 cycles # 2.912 GHz
- 19,271,682,618 instructions # 2.88 insn per cycle
- 2.300291912 seconds time elapsed
+TOTAL : 2.373012 sec
+ 6,730,013,186 cycles # 2.833 GHz
+ 19,262,616,828 instructions # 2.86 insn per cycle
+ 2.376797941 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:69534) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe
@@ -128,19 +128,19 @@ Relative difference = 2.6057152933832753e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.531625e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.537735e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.537735e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.428038e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.433299e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.433299e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 1.080134 sec
- 3,114,255,706 cycles # 2.875 GHz
- 6,664,280,015 instructions # 2.14 insn per cycle
- 1.084789912 seconds time elapsed
+TOTAL : 1.156685 sec
+ 3,104,589,238 cycles # 2.678 GHz
+ 6,652,896,903 instructions # 2.14 insn per cycle
+ 1.160693007 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:47803) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 1.9003789248133364e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.841445e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.850149e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.850149e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.832430e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.840977e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.840977e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 0.899888 sec
- 2,571,109,745 cycles # 2.845 GHz
- 5,850,913,153 instructions # 2.28 insn per cycle
- 0.904914307 seconds time elapsed
+TOTAL : 0.903107 sec
+ 2,567,602,694 cycles # 2.833 GHz
+ 5,841,410,689 instructions # 2.28 insn per cycle
+ 0.906868909 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:41536) (512y: 13) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe
@@ -180,19 +180,19 @@ Relative difference = 1.9003789248133364e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.550620e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.556902e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.556902e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.510516e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.516306e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.516306e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 1.067211 sec
- 2,037,305,474 cycles # 1.903 GHz
- 3,373,327,271 instructions # 1.66 insn per cycle
- 1.071975743 seconds time elapsed
+TOTAL : 1.094312 sec
+ 2,033,089,410 cycles # 1.852 GHz
+ 3,364,403,279 instructions # 1.65 insn per cycle
+ 1.098401868 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4191) (512y: 5) (512z:44245)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
index 8e3ac4399e..51142074c9 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
@@ -35,61 +35,61 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-06-16_23:15:08
+DATE: 2023-07-18_23:19:00
-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.536252e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.573442e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.577601e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.573163e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.617115e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.621634e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.513649 sec
- 2,145,771,085 cycles # 2.879 GHz
- 2,862,547,641 instructions # 1.33 insn per cycle
- 0.805011702 seconds time elapsed
+TOTAL : 0.492997 sec
+ 2,057,678,092 cycles # 2.877 GHz
+ 3,062,518,407 instructions # 1.49 insn per cycle
+ 0.776272674 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.581647e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.635703e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.638045e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.585370e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.643537e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.646114e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.893721 sec
- 6,500,188,004 cycles # 3.036 GHz
- 12,690,177,194 instructions # 1.95 insn per cycle
- 2.199182146 seconds time elapsed
+TOTAL : 1.867448 sec
+ 6,275,513,404 cycles # 2.981 GHz
+ 12,807,707,433 instructions # 2.04 insn per cycle
+ 2.162298283 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 6.626454e-04
-Avg ME (F77/CUDA) = 6.6262662649554244E-004
-Relative difference = 2.833114733400458e-05
+Avg ME (F77/CUDA) = 6.6262660579844562E-004
+Relative difference = 2.836238137986709e-05
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.890367e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.891225e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.891225e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.733524e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.734378e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.734378e+02 ) sec^-1
 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4
-TOTAL : 27.851215 sec
- 85,856,337,659 cycles # 3.083 GHz
- 136,047,957,548 instructions # 1.58 insn per cycle
- 27.856280727 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:15933) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 28.613127 sec
+ 86,709,704,728 cycles # 3.030 GHz
+ 136,080,123,804 instructions # 1.57 insn per cycle
+ 28.617179548 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:15955) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
@@ -102,19 +102,19 @@ Relative difference = 2.8211244692003953e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.106679e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.119068e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.119068e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.957648e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.970744e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.970744e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.059963e+00 +- 2.367792e+00 ) GeV^-4
-TOTAL : 2.316749 sec
- 6,781,000,176 cycles # 2.924 GHz
- 19,316,989,732 instructions # 2.85 insn per cycle
- 2.321493873 seconds time elapsed
+TOTAL : 2.365219 sec
+ 6,808,787,197 cycles # 2.875 GHz
+ 19,308,338,488 instructions # 2.84 insn per cycle
+ 2.369448248 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:69471) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe
@@ -128,19 +128,19 @@ Relative difference = 3.0732494532034946e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.581478e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.587987e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.587987e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.541847e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.547836e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.547836e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 1.045728 sec
- 3,042,131,837 cycles # 2.898 GHz
- 6,594,840,346 instructions # 2.17 insn per cycle
- 1.050898181 seconds time elapsed
+TOTAL : 1.071572 sec
+ 3,039,419,289 cycles # 2.827 GHz
+ 6,585,482,226 instructions # 2.17 insn per cycle
+ 1.075716909 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:46795) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 1.9674022283284887e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.870529e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.879536e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.879536e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.826611e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.835439e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.835439e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 0.885640 sec
- 2,578,942,088 cycles # 2.899 GHz
- 5,854,799,048 instructions # 2.27 insn per cycle
- 0.890682953 seconds time elapsed
+TOTAL : 0.905723 sec
+ 2,575,686,106 cycles # 2.834 GHz
+ 5,845,383,132 instructions # 2.27 insn per cycle
+ 0.909457035 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:41080) (512y: 9) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe
@@ -180,19 +180,19 @@ Relative difference = 1.9674022283284887e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.554834e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.561058e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.561058e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.496491e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.502073e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.502073e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 1.063798 sec
- 2,024,256,431 cycles # 1.896 GHz
- 3,083,285,385 instructions # 1.52 insn per cycle
- 1.068833353 seconds time elapsed
+TOTAL : 1.104130 sec
+ 2,023,795,233 cycles # 1.829 GHz
+ 3,074,048,274 instructions # 1.52 insn per cycle
+ 1.107904891 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3372) (512y: 17) (512z:39424)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
index b408e7e538..09ad13168b 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
@@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-06-16_22:58:34
+DATE: 2023-07-18_22:47:21
-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.488561e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.521964e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.524427e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.480391e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.508672e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.510984e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.554107 sec
- 2,279,290,212 cycles # 2.876 GHz
- 3,137,052,799 instructions # 1.38 insn per cycle
- 0.851081146 seconds time elapsed
+TOTAL : 0.525952 sec
+ 2,267,677,525 cycles # 2.954 GHz
+ 3,392,628,719 instructions # 1.50 insn per cycle
+ 0.831861393 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.134416e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.168392e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.169702e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.145439e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.179718e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.181134e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.065812 sec
- 10,122,925,933 cycles # 3.040 GHz
- 20,842,538,183 instructions # 2.06 insn per cycle
- 3.387502173 seconds time elapsed
+TOTAL : 3.052698 sec
+ 9,942,137,930 cycles # 3.004 GHz
+ 22,214,757,723 instructions # 2.23 insn per cycle
+ 3.366177476 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
@@ -76,20 +76,20 @@ Relative difference = 2.659538381540814e-07
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.978031e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.979322e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.979322e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.913139e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.914080e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.914080e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.302095 sec
- 25,693,038,802 cycles # 3.094 GHz
- 79,192,524,178 instructions # 3.08 insn per cycle
- 8.307297645 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 4706) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 8.583367 sec
+ 26,165,234,964 cycles # 3.048 GHz
+ 79,196,950,918 instructions # 3.03 insn per cycle
+ 8.588687377 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 4744) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -102,19 +102,19 @@ Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.739227e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.743408e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.743408e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.608213e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.611317e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.611317e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.404759 sec - 12,783,705,279 cycles # 2.903 GHz - 38,505,609,809 instructions # 3.01 insn per cycle - 4.410187217 seconds time elapsed +TOTAL : 4.557137 sec + 12,919,114,954 cycles # 2.834 GHz + 38,494,869,204 instructions # 2.98 insn per cycle + 4.563393267 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13076) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe @@ -128,20 +128,20 @@ Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.779933e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.804166e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.804166e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.504111e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.520835e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.520835e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.879373 sec - 5,476,712,305 cycles # 2.908 GHz - 13,620,238,606 instructions # 2.49 insn per cycle - 1.884513914 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10865) (512y: 0) (512z: 0) +TOTAL : 1.940434 sec + 5,519,643,775 cycles # 2.843 GHz + 13,603,840,523 instructions # 2.46 insn per cycle + 1.945814702 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10864) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -154,20 +154,20 @@ Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.768885e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.800184e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.800184e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.534658e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.558180e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.558180e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.690734 sec - 4,867,518,430 cycles # 2.873 GHz - 12,266,678,987 instructions # 2.52 insn per cycle - 1.696100975 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10583) (512y: 20) (512z: 0) +TOTAL : 1.733857 sec + 4,855,194,781 cycles # 2.802 GHz + 12,251,398,833 instructions # 2.52 insn per cycle + 1.738916853 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10582) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.632753e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.650795e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.650795e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.497075e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.510848e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.510848e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.159522 sec - 4,131,175,813 cycles # 1.911 GHz - 6,362,042,970 instructions # 1.54 insn per cycle - 2.164878251 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1422) (512y: 57) (512z: 9944) +TOTAL : 2.200743 sec + 4,095,535,430 cycles # 1.861 GHz + 6,349,609,236 instructions # 1.55 insn per cycle + 2.206197767 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1432) (512y: 58) (512z: 9948) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index a0ec009068..563afc5c04 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-06-16_22:59:10 +DATE: 2023-07-18_22:47:57 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.495757e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.531890e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.534521e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.517820e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.547651e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.549887e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.553147 sec - 2,267,397,908 cycles # 2.871 GHz - 3,111,867,480 instructions # 1.37 insn per cycle - 0.849472244 seconds time elapsed +TOTAL : 0.522538 sec + 2,246,754,399 cycles # 2.960 GHz + 3,448,544,369 instructions # 1.53 insn per cycle + 0.825847412 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.149132e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.183375e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.184726e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.146188e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.180771e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.182209e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.059498 sec - 10,031,596,373 cycles # 3.007 GHz - 20,916,916,227 instructions # 2.09 insn per cycle - 3.392815949 seconds time elapsed +TOTAL : 3.039730 sec + 9,826,496,967 cycles # 2.983 GHz + 21,321,023,396 instructions # 2.17 insn per cycle + 3.352938907 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -76,20 +76,20 @@ Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.978511e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.979800e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.979800e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.904551e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.905470e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.905470e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.301620 sec - 25,591,925,031 cycles # 3.083 GHz - 79,211,596,551 instructions # 3.10 insn per cycle - 8.306715675 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4378) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.624365 sec + 26,132,562,164 cycles # 3.030 GHz + 79,214,497,550 instructions # 3.03 insn per cycle + 8.629231042 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4393) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -102,20 +102,20 @@ Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.744925e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.749316e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.749316e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.575647e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.579016e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.579016e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.389824 sec - 12,788,106,524 cycles # 2.911 GHz - 38,452,416,346 instructions # 3.01 insn per cycle - 4.395123954 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12869) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.598691 sec + 12,863,248,226 cycles # 2.798 GHz + 38,444,583,186 instructions # 2.99 insn per cycle + 4.603847034 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12865) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -128,20 +128,20 @@ Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.672327e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.695790e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.695790e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.431266e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.448537e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.448537e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.902528 sec - 5,504,597,953 cycles # 2.888 GHz - 13,712,972,642 instructions # 2.49 insn per cycle - 1.907990624 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10970) (512y: 0) (512z: 0) +TOTAL : 1.958382 sec + 5,559,709,976 cycles # 2.839 GHz + 13,699,466,614 instructions # 2.46 insn per cycle + 1.963270291 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10972) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -154,20 +154,20 @@ Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.759017e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.790204e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.790204e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.562589e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.585411e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.585411e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.691699 sec - 4,899,459,456 cycles # 2.890 GHz - 12,367,541,245 instructions # 2.52 insn per cycle - 1.696806589 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10604) (512y: 176) (512z: 0) +TOTAL : 1.726022 sec + 4,896,080,544 cycles # 2.835 GHz + 12,352,304,340 instructions # 2.52 insn per cycle + 1.731411385 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10602) (512y: 176) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.611473e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.629905e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.629905e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.437650e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.451120e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.451120e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.165271 sec - 4,135,051,390 cycles # 1.906 GHz - 6,441,618,375 instructions # 1.56 insn per cycle - 2.170469997 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1326) (512y: 167) (512z:10035) +TOTAL : 2.216802 sec + 4,112,308,046 cycles # 1.855 GHz + 6,429,070,692 instructions # 1.56 insn per cycle + 2.222267042 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1327) (512y: 167) (512z:10033) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index f1471a9806..92bc2c7e06 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-06-16_23:01:26 +DATE: 2023-07-18_22:50:08 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.072909e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.073330e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.073439e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.069245e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.069666e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.069812e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.450063 sec - 8,437,943,756 cycles # 3.031 GHz - 18,613,140,327 instructions # 2.21 insn per cycle - 2.841214646 seconds time elapsed +TOTAL : 2.455774 sec + 8,198,003,417 cycles # 2.990 GHz + 17,441,039,582 instructions # 2.13 insn per cycle + 2.848552676 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.216282e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.218427e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.218617e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.194008e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.196202e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.196458e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.025391 sec - 13,269,588,593 cycles # 3.045 GHz - 30,835,242,734 instructions # 2.32 insn per cycle - 4.414633876 seconds time elapsed +TOTAL : 4.004219 sec + 12,906,126,385 cycles # 2.980 GHz + 30,705,078,901 instructions # 2.38 insn per cycle + 4.386333995 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -76,20 +76,20 @@ Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.235283e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.235601e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.235601e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.231825e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.232056e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.232056e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.415522 sec - 19,160,739,671 cycles # 2.987 GHz - 54,057,163,618 instructions # 2.82 insn per cycle - 6.420564139 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32342) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.433436 sec + 19,221,070,763 cycles # 2.989 GHz + 54,051,803,217 instructions # 2.81 insn per cycle + 6.438508634 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32352) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.620106e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.620225e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.620225e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.597062e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.597152e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.597152e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.271379 sec - 9,921,721,414 cycles # 3.034 GHz - 27,083,114,570 instructions # 2.73 insn per cycle - 3.276452626 seconds time elapsed +TOTAL : 3.316447 sec + 10,011,910,485 cycles # 3.022 GHz + 27,077,067,301 instructions # 2.70 insn per cycle + 3.322601212 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96346) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.556228e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.556831e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.556831e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.498130e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.498535e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.498535e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.491065 sec - 4,312,322,670 cycles # 2.886 GHz - 9,666,677,538 instructions # 2.24 insn per cycle - 1.495685259 seconds time elapsed +TOTAL : 1.520366 sec + 4,291,649,047 cycles # 2.828 GHz + 9,660,073,588 instructions # 2.25 insn per cycle + 1.527692767 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83998) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.969415e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.970156e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.970156e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.919923e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.920457e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.920457e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.336444 sec - 3,832,216,841 cycles # 2.859 GHz - 8,617,148,111 instructions # 2.25 insn per cycle - 1.341165118 seconds time elapsed +TOTAL : 1.358214 sec + 3,834,698,928 cycles # 2.828 GHz + 8,610,372,933 instructions # 2.25 insn per cycle + 1.364092946 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83696) (512y: 30) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.679839e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.680629e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.680629e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.591087e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.591651e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.591651e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.441719 sec - 2,716,250,084 cycles # 1.880 GHz - 4,337,046,897 instructions # 1.60 insn per cycle - 1.446804135 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1874) (512y: 67) (512z:82924) +TOTAL : 1.476912 sec + 2,718,672,476 cycles # 1.838 GHz + 4,330,529,264 instructions # 1.59 insn per cycle + 1.489166453 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1884) (512y: 68) (512z:82923) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 1c0184de14..da5c4db165 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-06-16_23:23:12 +DATE: 2023-07-18_23:26:42 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.061497e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.062582e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.062582e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.067859e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.068768e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.068768e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.433988 sec - 8,336,076,363 cycles # 3.005 GHz - 17,014,813,881 instructions # 2.04 insn per cycle - 2.833188841 seconds time elapsed +TOTAL : 2.367000 sec + 8,097,817,270 cycles # 3.013 GHz + 18,646,982,810 instructions # 2.30 insn per cycle + 2.744137164 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -68,17 +68,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.176810e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.216166e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.216166e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.159334e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.190870e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.190870e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.016312 sec - 13,237,795,425 cycles # 3.038 GHz - 28,853,451,424 instructions # 2.18 insn per cycle - 4.417206461 seconds time elapsed +TOTAL : 4.000218 sec + 12,996,175,307 cycles # 3.006 GHz + 31,650,599,303 instructions # 2.44 insn per cycle + 4.379696037 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -89,20 +89,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.426091e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.426409e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.426409e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.583909e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.584107e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.584107e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.283292 sec - 19,229,856,491 cycles # 3.063 GHz - 54,057,430,365 instructions # 2.81 insn per cycle - 6.287771357 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32342) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.962766 sec + 19,348,646,238 cycles # 2.778 GHz + 54,053,167,203 instructions # 2.79 insn per cycle + 6.966338951 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32352) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -116,19 +116,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.625803e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.625919e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.625919e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.594374e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.594459e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.594459e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.253969 sec - 9,967,840,687 cycles # 3.060 GHz - 27,083,402,734 instructions # 2.72 insn per cycle - 3.258914943 seconds time elapsed +TOTAL : 3.316146 sec + 10,062,188,532 cycles # 3.032 GHz + 27,076,347,840 instructions # 2.69 insn per cycle + 3.320140253 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96346) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe @@ -143,19 +143,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.555664e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.556218e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.556218e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.507457e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.507869e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.507869e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.491530 sec - 4,286,204,468 cycles # 2.869 GHz - 9,667,785,714 instructions # 2.26 insn per cycle - 1.496036399 seconds time elapsed +TOTAL : 1.510254 sec + 4,286,831,616 cycles # 2.833 GHz + 9,659,769,469 instructions # 2.25 insn per cycle + 1.513835398 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83998) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe @@ -170,19 +170,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.941069e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.941824e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.941824e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.929207e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.929712e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.929712e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.348647 sec - 3,839,497,436 cycles # 2.840 GHz - 8,618,118,436 instructions # 2.24 insn per cycle - 1.353595580 seconds time elapsed +TOTAL : 1.349459 sec + 3,830,512,570 cycles # 2.834 GHz + 8,610,115,617 instructions # 2.25 insn per cycle + 1.353051659 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83696) (512y: 30) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe @@ -197,20 +197,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.666454e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.667164e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.667164e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.588003e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.588528e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.588528e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.447325 sec - 2,707,628,155 cycles # 1.868 GHz - 4,338,013,213 instructions # 1.60 insn per cycle - 1.451919961 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1874) (512y: 67) (512z:82924) +TOTAL : 1.481818 sec + 2,705,909,679 cycles # 1.822 GHz + 4,330,220,559 instructions # 1.60 insn per cycle + 1.485587826 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1884) (512y: 68) (512z:82923) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index 1144007b2b..92d23c3eb7 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-06-16_23:02:29 +DATE: 2023-07-18_22:51:11 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.065553e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.065928e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.066074e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.073016e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.073401e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.073545e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.452549 sec - 8,383,872,432 cycles # 3.005 GHz - 18,091,785,273 instructions # 2.16 insn per cycle - 2.848067694 seconds time elapsed +TOTAL : 2.461762 sec + 8,235,337,655 cycles # 2.992 GHz + 17,886,315,664 instructions # 2.17 insn per cycle + 2.874745866 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.225084e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.227042e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.227224e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.214723e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.216938e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.217196e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.032176 sec - 13,272,438,956 cycles # 3.042 GHz - 28,712,791,562 instructions # 2.16 insn per cycle - 4.421079188 seconds time elapsed +TOTAL : 4.007618 sec + 12,832,818,033 cycles # 2.963 GHz + 28,156,306,212 instructions # 2.19 insn per cycle + 4.389866635 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -76,20 +76,20 @@ Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.506737e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.507047e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.507047e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.755069e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.755270e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.755270e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.213859 sec - 19,274,588,808 cycles # 3.101 GHz - 54,076,478,702 instructions # 2.81 insn per cycle - 6.218544506 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32261) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.820022 sec + 19,341,258,763 cycles # 2.837 GHz + 54,050,764,807 instructions # 2.79 insn per cycle + 6.824842050 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:31958) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.638242e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.638356e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.638356e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.595686e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.595766e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.595766e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.237531 sec - 9,921,784,579 cycles # 3.067 GHz - 27,077,809,738 instructions # 2.73 insn per cycle - 3.242137028 seconds time elapsed +TOTAL : 3.318479 sec + 10,050,715,238 cycles # 3.032 GHz + 27,071,034,271 instructions # 2.69 insn per cycle + 3.323917457 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96273) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe @@ -128,20 +128,20 @@ Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.488911e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.489451e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.489451e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.443811e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.444242e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.444242e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.519935 sec - 4,376,850,073 cycles # 2.874 GHz - 9,677,370,071 instructions # 2.21 insn per cycle - 1.524967874 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84092) (512y: 0) (512z: 0) +TOTAL : 1.543975 sec + 4,371,174,841 cycles # 2.835 GHz + 9,670,718,608 instructions # 2.21 insn per cycle + 1.549228190 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84158) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -154,20 +154,20 @@ Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.924415e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.925092e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.925092e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.846767e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.847284e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.847284e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.350849 sec - 3,841,551,477 cycles # 2.839 GHz - 8,626,542,730 instructions # 2.25 insn per cycle - 1.355459933 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83791) (512y: 180) (512z: 0) +TOTAL : 1.382320 sec + 3,806,727,433 cycles # 2.760 GHz + 8,619,338,106 instructions # 2.26 insn per cycle + 1.387195733 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83696) (512y: 180) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.671815e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.672525e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.672525e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.624534e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.625071e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.625071e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.450398 sec - 2,709,666,226 cycles # 1.865 GHz - 4,344,182,027 instructions # 1.60 insn per cycle - 1.455504193 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1787) (512y: 166) (512z:83071) +TOTAL : 1.468920 sec + 2,705,485,886 cycles # 1.846 GHz + 4,337,157,031 instructions # 1.60 insn per cycle + 1.473526428 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1788) (512y: 166) (512z:83063) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 4ef1b474b5..97cdc10864 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -35,61 +35,61 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-06-16_23:03:32 +DATE: 2023-07-18_22:52:14 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.805716e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.806625e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.806872e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.785287e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.786309e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.786769e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.680704 sec - 5,915,505,957 cycles # 3.011 GHz - 12,343,112,691 instructions # 2.09 insn per cycle - 2.022067086 seconds time elapsed +TOTAL : 1.700960 sec + 5,786,328,792 cycles # 2.984 GHz + 12,183,728,265 instructions # 2.11 insn per cycle + 2.046788451 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.312297e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.312960e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.313034e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.338994e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.339786e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.339941e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.956042 sec - 6,779,291,735 cycles # 3.014 GHz - 14,381,222,230 instructions # 2.12 insn per cycle - 2.309303925 seconds time elapsed +TOTAL : 1.917438 sec + 6,439,640,169 cycles # 2.930 GHz + 13,998,441,228 instructions # 2.17 insn per cycle + 2.254021552 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.849636e-03 -Avg ME (F77/CUDA) = 9.8712405367932642E-003 -Relative difference = 0.002193435046052877 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.908521e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.908794e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.908794e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.620419e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.620672e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.620672e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.935380 sec - 18,280,225,429 cycles # 3.079 GHz - 53,644,572,574 instructions # 2.93 insn per cycle - 5.940879711 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20329) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.135818 sec + 18,512,552,038 cycles # 3.019 GHz + 53,641,945,364 instructions # 2.90 insn per cycle + 6.140857892 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20295) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -97,24 +97,24 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087550399E-003 -Relative difference = 2.119779305548787e-08 +Avg ME (F77/C++) = 9.8479612087551509E-003 +Relative difference = 2.119780432912131e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.628533e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.628996e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.628996e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.465466e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.465860e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.465860e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.461345 sec - 4,500,201,036 cycles # 3.073 GHz - 13,763,590,198 instructions # 3.06 insn per cycle - 1.466106403 seconds time elapsed +TOTAL : 1.533445 sec + 4,609,129,011 cycles # 3.010 GHz + 13,757,208,915 instructions # 2.98 insn per cycle + 1.538063941 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96927) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 3.848767971092077e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.026530e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.028270e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.028270e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.805526e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.807173e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.807173e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826769e-06 ) GeV^-6 -TOTAL : 0.764097 sec - 2,191,663,686 cycles # 2.869 GHz - 4,871,112,775 instructions # 2.22 insn per cycle - 0.768775887 seconds time elapsed +TOTAL : 0.788809 sec + 2,211,972,639 cycles # 2.821 GHz + 4,863,560,484 instructions # 2.20 insn per cycle + 0.793786183 seconds time 
elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84275) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 3.9425359136432956e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.934823e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.937028e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.937028e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.746508e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.748551e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.748551e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826769e-06 ) GeV^-6 -TOTAL : 0.671145 sec - 1,934,376,171 cycles # 2.867 GHz - 4,342,607,330 instructions # 2.24 insn per cycle - 0.675706420 seconds time elapsed +TOTAL : 0.692554 sec + 1,935,906,859 cycles # 2.807 GHz + 4,335,582,992 instructions # 2.24 insn per cycle + 0.697192865 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83944) (512y: 33) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 3.9425359136432956e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.285767e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.288040e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.288040e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.195008e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.197249e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.197249e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826768e-06 ) GeV^-6 -TOTAL : 0.730681 sec - 1,366,589,345 cycles # 1.861 GHz - 2,193,139,079 instructions # 1.60 insn per cycle - 0.735224545 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2170) (512y: 41) (512z:83044) +TOTAL : 0.744700 sec + 1,361,609,819 cycles # 1.835 GHz + 2,186,530,497 instructions # 1.61 insn per cycle + 0.749913629 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2180) (512y: 41) (512z:83043) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 6f1772cda6..928eec4df2 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-06-16_23:24:16 +DATE: 2023-07-18_23:27:45 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.655466e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.657233e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.657233e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.815514e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.817339e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.817339e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 1.660443 sec - 5,832,575,389 cycles # 3.001 GHz - 12,080,610,113 instructions # 2.07 insn per cycle - 2.000524939 seconds time elapsed +TOTAL : 1.597944 sec + 5,581,307,276 cycles # 2.978 GHz + 11,919,657,124 instructions # 2.14 insn per cycle + 1.930728577 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -68,41 +68,41 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.306235e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.319271e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.319271e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.271403e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.283853e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.283853e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 -TOTAL : 1.939786 sec - 6,710,577,294 cycles # 3.010 GHz - 14,122,488,130 instructions # 2.10 insn per cycle - 2.286693438 seconds time elapsed +TOTAL : 1.899162 sec + 6,519,942,412 cycles # 2.998 GHz + 14,543,846,826 instructions # 2.23 insn per cycle + 2.231011569 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.849636e-03 -Avg ME (F77/CUDA) = 9.8712405367932642E-003 -Relative difference = 0.002193435046052877 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.841510e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.841775e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.841775e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.663821e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.664071e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.664071e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.977425 sec - 18,376,830,240 cycles # 3.073 GHz - 53,645,367,177 instructions # 2.92 insn per cycle - 5.982025088 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20329) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.097901 sec + 18,359,932,945 cycles # 3.010 GHz + 53,640,128,637 instructions # 2.92 insn per cycle + 6.101763303 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20295) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -110,25 +110,25 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087550399E-003 -Relative difference = 2.119779305548787e-08 +Avg ME (F77/C++) = 9.8479612087551509E-003 +Relative difference = 2.119780432912131e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.623748e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.624254e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.624254e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.496425e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.496832e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.496832e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.463452 sec - 4,497,433,282 cycles # 3.068 GHz - 13,764,580,046 instructions # 3.06 insn per cycle - 1.468085626 seconds time elapsed +TOTAL : 1.515582 sec + 4,606,909,599 cycles # 3.033 GHz + 13,756,787,975 instructions # 2.99 insn per cycle + 1.519522272 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96927) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe @@ -143,19 +143,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.987628e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.989331e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.989331e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.875615e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.877209e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.877209e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826769e-06 ) GeV^-6 -TOTAL : 0.762831 sec - 2,199,477,118 cycles # 2.869 GHz - 4,871,433,971 instructions # 2.21 insn per cycle - 0.767671601 seconds time elapsed +TOTAL : 0.773540 sec + 2,184,612,339 cycles # 2.815 GHz + 4,863,407,104 instructions # 2.23 insn per cycle + 0.777199304 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84275) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe @@ -170,19 +170,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.924776e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.926917e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.926917e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.845183e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.847160e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.847160e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826769e-06 ) GeV^-6 -TOTAL : 0.672826 sec - 1,945,457,623 cycles # 2.877 GHz - 4,343,708,064 instructions # 2.23 insn per cycle - 0.677656533 seconds time elapsed +TOTAL : 0.678592 sec + 1,931,890,794 cycles # 2.835 GHz + 4,335,529,546 instructions # 2.24 insn per cycle + 0.682138638 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83944) (512y: 33) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe @@ -197,20 +197,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.377484e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.379728e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.379728e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.233858e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.236035e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.236035e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826768e-06 ) GeV^-6 -TOTAL : 0.723682 sec - 1,379,747,419 cycles # 1.897 GHz - 2,194,225,604 instructions # 1.59 insn per cycle - 0.728534863 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2170) (512y: 41) (512z:83044) +TOTAL : 0.735875 sec + 1,362,688,180 cycles # 1.845 GHz + 2,186,191,755 instructions # 1.60 insn per cycle + 0.739511514 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2180) (512y: 41) (512z:83043) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index af71691b16..30b8493f61 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -35,61 +35,61 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-06-16_23:04:19 +DATE: 2023-07-18_22:53:01 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.674461e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.675325e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.675581e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.762140e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.763059e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.763409e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.698247 sec - 5,886,551,862 cycles # 2.951 GHz - 11,210,609,464 instructions # 1.90 insn per cycle - 2.052173907 seconds time elapsed +TOTAL : 1.665282 sec + 5,832,695,656 cycles # 2.986 GHz + 12,267,200,402 instructions # 2.10 insn per cycle + 2.011078808 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.286680e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.287310e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.287386e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.288186e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.288997e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.289092e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.961982 sec - 6,772,975,134 cycles # 3.012 GHz - 13,750,292,328 instructions # 2.03 insn per cycle - 2.305639635 seconds time elapsed +TOTAL : 1.931538 sec + 6,620,092,279 cycles # 2.996 GHz + 14,343,804,186 instructions # 2.17 insn per cycle + 2.265769331 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.849636e-03 -Avg ME (F77/CUDA) = 9.8712405367932608E-003 -Relative difference = 0.0021934350460525243 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.000555e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.000832e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.000832e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.736091e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.736341e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.736341e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.871994 sec - 18,105,311,919 cycles # 3.082 GHz - 53,664,310,633 instructions # 2.96 insn per cycle - 5.876674372 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20543) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.049248 sec + 18,311,029,839 cycles # 3.026 GHz + 53,621,886,907 instructions # 2.93 insn per cycle + 6.053266845 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20240) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -97,25 +97,25 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087571129E-003 -Relative difference = 2.119800355536229e-08 +Avg ME (F77/C++) = 9.8479612087572898E-003 +Relative difference = 2.1198021522715588e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.589372e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.589815e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.589815e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.476755e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.477150e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.477150e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.476565 sec - 4,561,855,617 cycles # 3.083 GHz - 13,756,278,948 instructions # 3.02 insn per cycle - 1.481667004 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96740) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.523805 sec + 4,595,954,892 cycles # 3.010 GHz + 13,748,613,866 instructions # 2.99 insn per cycle + 1.527785688 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96684) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -128,20 +128,20 @@ Relative difference = 3.849071936588079e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.993067e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.994817e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.994817e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.004655e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.006299e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.006299e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826769e-06 ) GeV^-6 -TOTAL : 0.761742 sec - 2,188,633,514 cycles # 2.862 GHz - 4,877,048,177 instructions # 2.23 insn per cycle - 0.766373792 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84908) (512y: 0) (512z: 0) +TOTAL : 0.759115 sec + 2,147,216,619 cycles # 2.818 GHz + 4,869,213,337 instructions # 2.27 insn per cycle + 0.762864770 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84897) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -154,20 +154,20 @@ Relative difference = 3.9425546409167914e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.971095e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.973270e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.973270e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.805699e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.807659e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.807659e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826769e-06 ) GeV^-6 -TOTAL : 0.668601 sec - 1,935,360,358 cycles # 2.879 GHz - 4,348,847,241 instructions # 2.25 insn per cycle - 0.673510965 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84638) (512y: 22) (512z: 0) +TOTAL : 0.681297 sec + 1,924,788,541 cycles # 2.811 GHz + 4,340,748,369 instructions # 2.26 insn per cycle + 0.685335487 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84599) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -180,20 +180,20 @@ Relative difference = 3.9425546409167914e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.103833e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.106308e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.106308e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.200580e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.202747e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.202747e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826768e-06 ) GeV^-6 -TOTAL : 0.749331 sec - 1,373,934,827 cycles # 1.825 GHz - 2,200,572,670 instructions # 1.60 insn per cycle - 0.754227181 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2740) (512y: 23) (512z:83143) +TOTAL : 0.738614 sec + 1,363,086,584 cycles # 1.837 GHz + 2,192,070,777 instructions # 1.61 insn per cycle + 0.742546673 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2739) (512y: 23) (512z:83176) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 3a9dd35695..a30ca07357 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-06-16_23:05:06 +DATE: 2023-07-18_22:53:47 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.693625e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.694288e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.694409e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.696272e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.696884e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.697052e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.196059 sec - 7,620,057,050 cycles # 3.016 GHz - 15,936,153,152 instructions # 2.09 insn per cycle - 2.583018062 seconds time elapsed +TOTAL : 2.175114 sec + 7,423,433,219 cycles # 2.973 GHz + 16,360,609,172 instructions # 2.20 insn per cycle + 2.553706141 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.108976e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.109249e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.109272e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.112322e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.112640e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112677e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.430946 sec - 11,408,477,495 cycles # 3.034 GHz - 26,313,998,938 instructions # 2.31 insn per cycle - 3.820144649 seconds time elapsed +TOTAL : 3.402785 sec + 11,079,983,197 cycles # 2.966 GHz + 26,069,232,135 instructions # 2.35 insn per cycle + 3.792325716 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -76,20 +76,20 @@ Relative difference = 3.1385249252060663e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.362446e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.362744e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.362744e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.568753e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.568979e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.568979e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.325318 sec - 19,430,907,237 cycles # 3.071 GHz - 54,292,603,982 instructions # 2.79 insn per cycle - 6.329867372 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:31977) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.985657 sec + 19,580,314,386 cycles # 2.803 GHz + 54,288,249,136 instructions # 2.77 insn per cycle + 6.989636952 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:31981) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.619143e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.619255e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.619255e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.560655e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.560735e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.560735e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.268613 sec - 9,472,675,959 cycles # 2.895 GHz - 26,115,271,176 instructions # 2.76 insn per cycle - 3.273561039 seconds time elapsed +TOTAL : 3.389034 sec + 9,743,077,746 cycles # 2.873 GHz + 26,108,000,956 instructions # 2.68 insn per cycle + 3.392740513 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:95919) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.790808e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.791484e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.791484e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.708056e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.708516e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.708516e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.400732 sec - 4,075,771,818 cycles # 2.902 GHz - 9,331,333,269 instructions # 2.29 insn per cycle - 1.406221756 seconds time elapsed +TOTAL : 1.429005 sec + 4,080,366,942 cycles # 2.850 GHz + 9,329,572,618 instructions # 2.29 insn per cycle + 1.433075771 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83766) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.155112e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.155865e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.155865e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.154770e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.155334e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.155334e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.276847 sec - 3,682,881,798 cycles # 2.877 GHz - 8,307,430,129 instructions # 2.26 insn per cycle - 1.281346988 seconds time elapsed +TOTAL : 1.276074 sec + 3,634,666,906 cycles # 2.842 GHz + 8,305,590,665 instructions # 2.29 insn per cycle + 1.280094579 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83502) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.785211e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.786021e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.786021e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.730920e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.731496e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.731496e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.403459 sec - 2,639,508,347 cycles # 1.878 GHz - 4,234,654,161 instructions # 1.60 insn per cycle - 1.408336015 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1911) (512y: 57) (512z:82637) +TOTAL : 1.420967 sec + 2,634,262,273 cycles # 1.850 GHz + 4,226,115,956 instructions # 1.60 insn per cycle + 1.424616315 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1921) (512y: 58) (512z:82636) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
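Each build's log also ends with a SIMD symbol tally for the compiled CPPProcess.o, and the hunks above show small shifts in those counts between the two compiler versions (e.g. avx2: 1911 -> 1921 for the 512z mixed-precision build). A small hypothetical helper — again an illustration, not repository code — that turns one such line into a dict so two builds can be compared programmatically:

import re

def symbol_mix(line):
    """Parse an '=Symbols in CPPProcess.o=' log line into {isa: count}."""
    return {k.lstrip('~'): int(v) for k, v in re.findall(r'\((~?\w+):\s*(\d+)\)', line)}

line = '=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1921) (512y: 58) (512z:82636)'
print(symbol_mix(line))  # {'sse4': 0, 'avx2': 1921, '512y': 58, '512z': 82636}

The regex tolerates the log's slightly irregular spacing ('(~sse4: 0)' vs '(512z:82636)').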
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt
index cd996fa793..876a0f0095 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt
@@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2023-06-16_23:06:07
+DATE: 2023-07-18_22:54:48
-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.688054e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.688638e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.688761e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.682609e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.683142e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.683311e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 2.196300 sec
- 7,637,726,731 cycles # 3.023 GHz
- 15,838,474,776 instructions # 2.07 insn per cycle
- 2.583156463 seconds time elapsed
+TOTAL : 2.176991 sec
+ 7,484,781,827 cycles # 2.995 GHz
+ 15,629,838,380 instructions # 2.09 insn per cycle
+ 2.555996760 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.108172e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.108543e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.108568e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.109144e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.109457e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.109491e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 3.433735 sec
- 11,452,674,028 cycles # 3.032 GHz
- 23,961,349,118 instructions # 2.09 insn per cycle
- 3.834644270 seconds time elapsed
+TOTAL : 3.407213 sec
+ 11,182,166,317 cycles # 2.996 GHz
+ 26,393,726,074 instructions # 2.36 insn per cycle
+ 3.788702297 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
@@ -76,20 +76,20 @@ Relative difference = 3.1385249252060663e-07
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.781184e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.781448e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.781448e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.636770e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.636993e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.636993e+01 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.780445 sec
- 19,564,693,523 cycles # 2.884 GHz
- 54,298,011,506 instructions # 2.78 insn per cycle
- 6.785104515 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:32420) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.920565 sec
+ 19,676,257,089 cycles # 2.843 GHz
+ 54,278,198,702 instructions # 2.76 insn per cycle
+ 6.924591734 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:32135) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
@@ -102,19 +102,19 @@ Relative difference = 3.457988134687711e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.707967e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.708090e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.708090e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.582580e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.582671e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.582671e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 3.103057 sec
- 9,582,705,373 cycles # 3.085 GHz
- 26,028,889,965 instructions # 2.72 insn per cycle
- 3.107702792 seconds time elapsed
+TOTAL : 3.342366 sec
+ 9,474,102,869 cycles # 2.833 GHz
+ 26,022,233,908 instructions # 2.75 insn per cycle
+ 3.346342144 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:95694) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe
@@ -128,20 +128,20 @@ Relative difference = 3.5610570575237004e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.703281e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.703858e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.703858e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.660841e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.661337e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.661337e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 1.431633 sec
- 4,120,363,711 cycles # 2.872 GHz
- 9,310,908,558 instructions # 2.26 insn per cycle
- 1.437026482 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83565) (512y: 0) (512z: 0)
+TOTAL : 1.447230 sec
+ 4,092,458,511 cycles # 2.822 GHz
+ 9,308,826,047 instructions # 2.27 insn per cycle
+ 1.451338201 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83499) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
@@ -154,20 +154,20 @@ Relative difference = 3.613714310412983e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.240493e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.241279e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.241279e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.110235e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.110790e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.110790e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 1.251163 sec
- 3,616,491,450 cycles # 2.883 GHz
- 8,302,817,812 instructions # 2.30 insn per cycle
- 1.256135099 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83185) (512y: 170) (512z: 0)
+TOTAL : 1.290318 sec
+ 3,656,642,519 cycles # 2.828 GHz
+ 8,300,905,499 instructions # 2.27 insn per cycle
+ 1.294253720 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83152) (512y: 170) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
@@ -180,20 +180,20 @@ Relative difference = 3.613714310412983e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.753912e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.754634e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.754634e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.703813e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.704352e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.704352e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 1.412974 sec
- 2,639,045,649 cycles # 1.864 GHz
- 4,230,988,059 instructions # 1.60 insn per cycle
- 1.417985866 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1428) (512y: 156) (512z:82779)
+TOTAL : 1.432011 sec
+ 2,635,765,100 cycles # 1.836 GHz
+ 4,223,288,334 instructions # 1.60 insn per cycle
+ 1.436037765 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1429) (512y: 156) (512z:82786)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt
index ac1881c14d..fea83ea522 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt
@@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2023-06-16_22:59:46
+DATE: 2023-07-18_22:48:33
-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.470677e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.403049e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.845489e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.719457e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.472965e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.858342e+07 ) sec^-1
 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2
-TOTAL : 0.475414 sec
- 2,007,563,669 cycles # 2.838 GHz
- 2,452,577,465 instructions # 1.22 insn per cycle
- 0.764443076 seconds time elapsed
+TOTAL : 0.450160 sec
+ 1,949,025,471 cycles # 2.932 GHz
+ 2,655,591,019 instructions # 1.36 insn per cycle
+ 0.737059925 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.377326e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.547761e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.992085e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.347425e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.590288e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.062522e+07 ) sec^-1
 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2
-TOTAL : 0.557136 sec
- 2,348,204,579 cycles # 2.922 GHz
- 2,941,788,306 instructions # 1.25 insn per cycle
- 0.862619515 seconds time elapsed
+TOTAL : 0.531537 sec
+ 2,272,333,411 cycles # 2.949 GHz
+ 3,136,805,865 instructions # 1.38 insn per cycle
+ 0.828090573 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -76,19 +76,19 @@ Relative difference = 2.984467216677476e-07
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.175937e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.209277e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.209277e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.115690e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.140019e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.140019e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2
-TOTAL : 1.417777 sec
- 4,418,078,401 cycles # 3.108 GHz
- 12,858,365,147 instructions # 2.91 insn per cycle
- 1.423319552 seconds time elapsed
+TOTAL : 1.490593 sec
+ 4,533,884,133 cycles # 3.034 GHz
+ 12,816,471,728 instructions # 2.83 insn per cycle
+ 1.495888033 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 733) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe
@@ -102,19 +102,19 @@ Relative difference = 2.9844565299804477e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.113749e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.225511e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.225511e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.063298e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.146504e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.146504e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2
-TOTAL : 0.797724 sec
- 2,444,856,977 cycles # 3.051 GHz
- 7,068,441,253 instructions # 2.89 insn per cycle
- 0.808817770 seconds time elapsed
+TOTAL : 0.816326 sec
+ 2,478,405,968 cycles # 3.027 GHz
+ 7,027,255,753 instructions # 2.84 insn per cycle
+ 0.821401180 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3093) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe
@@ -128,19 +128,19 @@ Relative difference = 2.9844565299804477e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.944151e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.349910e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.349910e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.945941e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.258834e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.258834e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2
-TOTAL : 0.438420 sec
- 1,274,694,276 cycles # 2.885 GHz
- 2,842,409,497 instructions # 2.23 insn per cycle
- 0.443229097 seconds time elapsed
+TOTAL : 0.439436 sec
+ 1,248,930,157 cycles # 2.833 GHz
+ 2,800,292,605 instructions # 2.24 insn per cycle
+ 0.444398293 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2725) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 2.9844659193456305e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.355894e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.841805e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.841805e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.388007e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.771114e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.771114e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2
-TOTAL : 0.399144 sec
- 1,170,164,950 cycles # 2.906 GHz
- 2,702,440,747 instructions # 2.31 insn per cycle
- 0.403916574 seconds time elapsed
+TOTAL : 0.397928 sec
+ 1,143,495,589 cycles # 2.859 GHz
+ 2,662,566,955 instructions # 2.33 insn per cycle
+ 0.403205791 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2530) (512y: 54) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe
@@ -180,20 +180,20 @@ Relative difference = 2.9844659193456305e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.020461e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.251605e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.251605e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.988696e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.161799e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.161799e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2
-TOTAL : 0.567519 sec
- 1,146,857,475 cycles # 2.007 GHz
- 1,675,706,744 instructions # 1.46 insn per cycle
- 0.572452940 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1055) (512y: 78) (512z: 2135)
+TOTAL : 0.574523 sec
+ 1,116,040,350 cycles # 1.937 GHz
+ 1,636,926,421 instructions # 1.47 insn per cycle
+ 0.579497288 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1064) (512y: 79) (512z: 2134)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt
index 9d2980d703..2108f76071 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt
@@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2023-06-16_23:21:34
+DATE: 2023-07-18_23:25:06
-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP=
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
 WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.070393e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.287586e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.287586e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.585157e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.158790e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.158790e+07 ) sec^-1
 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2
-TOTAL : 0.504575 sec
- 2,103,581,583 cycles # 2.905 GHz
- 2,735,769,174 instructions # 1.30 insn per cycle
- 0.783099451 seconds time elapsed
+TOTAL : 0.474113 sec
+ 2,022,739,618 cycles # 2.944 GHz
+ 2,903,368,607 instructions # 1.44 insn per cycle
+ 0.744339953 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -68,17 +68,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
 WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.869815e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.409855e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.409855e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.243278e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.304833e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.304833e+07 ) sec^-1
 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2
-TOTAL : 0.796207 sec
- 3,159,417,542 cycles # 2.977 GHz
- 4,392,482,036 instructions # 1.39 insn per cycle
- 1.122081608 seconds time elapsed
+TOTAL : 0.753906 sec
+ 2,980,726,577 cycles # 2.963 GHz
+ 4,446,396,110 instructions # 1.49 insn per cycle
+ 1.064431625 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -89,19 +89,19 @@ OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.166207e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.199063e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.199063e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.117698e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.142216e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.142216e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2
-TOTAL : 1.434034 sec
- 4,450,678,360 cycles # 3.095 GHz
- 12,865,964,603 instructions # 2.89 insn per cycle
- 1.439725535 seconds time elapsed
+TOTAL : 1.494974 sec
+ 4,566,562,470 cycles # 3.047 GHz
+ 12,825,171,048 instructions # 2.81 insn per cycle
+ 1.499401026 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 733) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe
@@ -116,19 +116,19 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.121625e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.232697e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.232697e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.063593e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.147355e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.147355e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2
-TOTAL : 0.801253 sec
- 2,477,560,136 cycles # 3.079 GHz
- 7,119,440,241 instructions # 2.87 insn per cycle
- 0.812534403 seconds time elapsed
+TOTAL : 0.823092 sec
+ 2,512,237,038 cycles # 3.038 GHz
+ 7,078,628,755 instructions # 2.82 insn per cycle
+ 0.827558458 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3093) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe
@@ -143,19 +143,19 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.934946e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.330898e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.330898e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.904279e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.214110e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.214110e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2
-TOTAL : 0.446709 sec
- 1,309,681,066 cycles # 2.901 GHz
- 2,895,588,054 instructions # 2.21 insn per cycle
- 0.458145800 seconds time elapsed
+TOTAL : 0.449381 sec
+ 1,289,599,454 cycles # 2.847 GHz
+ 2,851,042,796 instructions # 2.21 insn per cycle
+ 0.453830437 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2725) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe
@@ -170,19 +170,19 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.311032e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.799335e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.799335e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.139907e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.488963e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.488963e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2
-TOTAL : 0.409974 sec
- 1,200,991,662 cycles # 2.902 GHz
- 2,753,820,763 instructions # 2.29 insn per cycle
- 0.420684326 seconds time elapsed
+TOTAL : 0.425111 sec
+ 1,205,515,884 cycles # 2.813 GHz
+ 2,709,191,392 instructions # 2.25 insn per cycle
+ 0.429101442 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2530) (512y: 54) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe
@@ -197,20 +197,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.052439e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.289835e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.289835e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.990198e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.165668e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.165668e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2
-TOTAL : 0.568236 sec
- 1,177,198,332 cycles # 2.056 GHz
- 1,715,837,168 instructions # 1.46 insn per cycle
- 0.573431193 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1055) (512y: 78) (512z: 2135)
+TOTAL : 0.578701 sec
+ 1,149,012,692 cycles # 1.974 GHz
+ 1,675,958,965 instructions # 1.46 insn per cycle
+ 0.582686476 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1064) (512y: 79) (512z: 2134)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt
index 0ac8c3da88..76472ab9ff 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt
@@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2023-06-16_23:00:03
+DATE: 2023-07-18_22:48:49
-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.441903e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.284655e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.707498e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.689816e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.318719e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.685904e+07 ) sec^-1
 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2
-TOTAL : 0.476149 sec
- 2,004,075,040 cycles # 2.858 GHz
- 2,447,450,357 instructions # 1.22 insn per cycle
- 0.758991361 seconds time elapsed
+TOTAL : 0.449232 sec
+ 1,950,934,078 cycles # 2.940 GHz
+ 2,676,952,219 instructions # 1.37 insn per cycle
+ 0.728652348 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.358848e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.448514e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.881423e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.327000e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.433359e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.907754e+07 ) sec^-1
 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2
-TOTAL : 0.560373 sec
- 2,330,695,105 cycles # 2.896 GHz
- 2,941,791,611 instructions # 1.26 insn per cycle
- 0.864662823 seconds time elapsed
+TOTAL : 0.531318 sec
+ 2,278,354,923 cycles # 2.955 GHz
+ 3,168,740,218 instructions # 1.39 insn per cycle
+ 0.827852908 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
@@ -76,19 +76,19 @@ Relative difference = 2.984467216677476e-07
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.185072e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.219142e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.219142e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.128479e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.153366e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.153366e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2
-TOTAL : 1.405955 sec
- 4,365,481,367 cycles # 3.097 GHz
- 12,734,304,560 instructions # 2.92 insn per cycle
- 1.410932077 seconds time elapsed
+TOTAL : 1.474456 sec
+ 4,481,962,559 cycles # 3.034 GHz
+ 12,694,148,363 instructions # 2.83 insn per cycle
+ 1.479503853 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 687) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe
@@ -102,19 +102,19 @@ Relative difference = 2.9844565299804477e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.160833e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.277513e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.277513e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.100853e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.187095e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.187095e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2
-TOTAL : 0.781135 sec
- 2,409,696,110 cycles # 3.070 GHz
- 6,927,148,000 instructions # 2.87 insn per cycle
- 0.792294841 seconds time elapsed
+TOTAL : 0.802839 sec
+ 2,438,392,527 cycles # 3.028 GHz
+ 6,892,382,300 instructions # 2.83 insn per cycle
+ 0.807803393 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2942) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe
@@ -128,19 +128,19 @@ Relative difference = 2.9844565299804477e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.625524e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.958602e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.958602e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.609054e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.875068e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.875068e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2
-TOTAL : 0.474614 sec
- 1,386,813,698 cycles # 2.899 GHz
- 3,036,853,128 instructions # 2.19 insn per cycle
- 0.484841120 seconds time elapsed
+TOTAL : 0.477284 sec
+ 1,366,926,685 cycles # 2.847 GHz
+ 2,996,684,473 instructions # 2.19 insn per cycle
+ 0.482654991 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2831) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe
@@ -154,20 +154,20 @@ Relative difference = 2.9844659193456305e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.841557e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.216736e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.216736e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.844644e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.136562e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.136562e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2
-TOTAL : 0.449897 sec
- 1,316,149,525 cycles # 2.902 GHz
- 2,929,799,564 instructions # 2.23 insn per cycle
- 0.464962802 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2576) (512y: 207) (512z: 0)
+TOTAL : 0.449528 sec
+ 1,285,397,954 cycles # 2.843 GHz
+ 2,886,405,649 instructions # 2.25 insn per cycle
+ 0.454867457 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2577) (512y: 207) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
@@ -180,19 +180,19 @@ Relative difference = 2.9844659193456305e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.868510e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.078585e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.078585e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.819450e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.979721e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.979721e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2
-TOTAL : 0.597358 sec
- 1,194,467,791 cycles # 1.987 GHz
- 1,906,072,402 instructions # 1.60 insn per cycle
- 0.602593356 seconds time elapsed
+TOTAL : 0.606788 sec
+ 1,164,670,276 cycles # 1.911 GHz
+ 1,864,104,765 instructions # 1.60 insn per cycle
+ 0.611858192 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1002) (512y: 185) (512z: 2242)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt
index 85404bb68d..97148e4c3d 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt
@@ -35,60 +35,60 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2023-06-16_23:00:20
+DATE: 2023-07-18_22:49:05
-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.984648e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.208639e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.373355e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.373634e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.221480e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.361117e+08 ) sec^-1
 MeanMatrixElemValue = ( 3.402886e+01 +- 1.677500e+01 ) GeV^-2
-TOTAL : 0.467463 sec
- 1,994,268,934 cycles # 2.883 GHz
- 2,406,150,579 instructions # 1.21 insn per cycle
- 0.748794310 seconds time elapsed
+TOTAL : 0.447566 sec
+ 1,887,922,658 cycles # 2.843 GHz
+ 2,552,301,791 instructions # 1.35 insn per cycle
+ 0.727957341 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 168
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.108799e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.846646e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.967345e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.249817e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.823800e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.984824e+08 ) sec^-1
 MeanMatrixElemValue = ( 4.166198e+02 +- 2.517590e+02 ) GeV^-2
-TOTAL : 0.507796 sec
- 2,130,658,759 cycles # 2.870 GHz
- 2,630,610,519 instructions # 1.23 insn per cycle
- 0.801554505 seconds time elapsed
+TOTAL : 0.480874 sec
+ 2,072,392,252 cycles # 2.934 GHz
+ 2,836,354,199 instructions # 1.37 insn per cycle
+ 0.765550273 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 5.619520e-01
-Avg ME (F77/CUDA) = 0.56225629188472226
-Relative difference = 0.0005414908830687532
+Avg ME (F77/CUDA) = 0.56225629328206139
+Relative difference = 0.0005414933696496947
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.199190e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.228570e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.228570e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.146917e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.173201e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.173201e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.422773e+01 +- 1.683421e+01 ) GeV^-2
-TOTAL : 1.388677 sec
- 4,265,371,799 cycles # 3.066 GHz
- 12,765,139,156 instructions # 2.99 insn per cycle
- 1.393497705 seconds time elapsed
+TOTAL : 1.449015 sec
+ 4,399,695,435 cycles # 3.030 GHz
+ 12,749,156,633 instructions # 2.90 insn per cycle
+ 1.453676873 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 701) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe
@@ -102,19 +102,19 @@ Relative difference = 1.714833339642312e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.419636e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.674540e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.674540e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.277386e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.502877e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.502877e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.422772e+01 +- 1.683421e+01 ) GeV^-2
-TOTAL : 0.499936 sec
- 1,537,781,005 cycles # 3.051 GHz
- 4,119,226,953 instructions # 2.68 insn per cycle
- 0.510812610 seconds time elapsed
+TOTAL : 0.520848 sec
+ 1,570,490,325 cycles # 3.002 GHz
+ 4,102,313,565 instructions # 2.61 insn per cycle
+ 0.525858184 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3693) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe
@@ -128,19 +128,19 @@ Relative difference = 4.327561348062349e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.327767e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.544279e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.544279e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.220214e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.344626e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.344626e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2
-TOTAL : 0.245130 sec
- 718,016,333 cycles # 2.885 GHz
- 1,661,722,651 instructions # 2.31 insn per cycle
- 0.255967125 seconds time elapsed
+TOTAL : 0.249034 sec
+ 711,985,333 cycles # 2.837 GHz
+ 1,644,466,719 instructions # 2.31 insn per cycle
+ 0.254172664 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3109) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 7.389204774233901e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.836550e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.271069e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.271069e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.760904e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.087758e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.087758e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2
-TOTAL : 0.230305 sec
- 678,464,193 cycles # 2.902 GHz
- 1,594,224,390 instructions # 2.35 insn per cycle
- 0.240921400 seconds time elapsed
+TOTAL : 0.233357 sec
+ 666,761,003 cycles # 2.840 GHz
+ 1,576,999,933 instructions # 2.37 insn per cycle
+ 0.238145886 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2954) (512y: 14) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe
@@ -180,20 +180,20 @@ Relative difference = 7.389204774233901e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.868158e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.618302e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.618302e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.650528e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.337145e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.337145e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2
-TOTAL : 0.302538 sec
- 641,516,414 cycles # 2.091 GHz
- 1,069,291,782 instructions # 1.67 insn per cycle
- 0.314821042 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1530) (512y: 33) (512z: 2277)
+TOTAL : 0.314517 sec
+ 634,920,525 cycles # 2.005 GHz
+ 1,053,554,769 instructions # 1.66 insn per cycle
+ 0.319856249 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1459) (512y: 27) (512z: 2277)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt
index 0cb2b651aa..98d232673d 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt
@@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2023-06-16_23:21:51
+DATE: 2023-07-18_23:25:22
-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP=
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
 WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.619996e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.273991e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.273991e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.603113e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.129465e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.129465e+07 ) sec^-1
 MeanMatrixElemValue = ( 3.419752e+01 +- 1.682900e+01 ) GeV^-2
-TOTAL : 0.476940 sec
- 2,047,260,675 cycles # 2.896 GHz
- 2,612,418,216 instructions # 1.28 insn per cycle
- 0.764236233 seconds time elapsed
+TOTAL : 0.453868 sec
+ 1,966,832,335 cycles # 2.966 GHz
+ 2,819,531,761 instructions # 1.43 insn per cycle
+ 0.720764500 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -68,40 +68,40 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
 WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.197562e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.898248e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.898248e+07 ) sec^-1
-MeanMatrixElemValue = ( 4.349381e+02 +- 2.541442e+02 ) GeV^-2
-TOTAL : 0.646628 sec
- 2,586,558,663 cycles # 2.919 GHz
- 3,548,806,657 instructions # 1.37 insn per cycle
- 0.945018609 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 5.114307e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.636350e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.636350e+07 ) sec^-1
+MeanMatrixElemValue = ( 4.349385e+02 +- 2.541442e+02 ) GeV^-2
+TOTAL : 0.622676 sec
+ 2,539,495,983 cycles # 2.965 GHz
+ 3,758,648,716 instructions # 1.48 insn per cycle
+ 0.916087470 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 5.619520e-01
-Avg ME (F77/CUDA) = 0.56225629188472226
-Relative difference = 0.0005414908830687532
+Avg ME (F77/CUDA) = 0.56225629328206139
+Relative difference = 0.0005414933696496947
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.208772e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.238538e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.238538e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.152553e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.179376e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.179376e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.422773e+01 +- 1.683421e+01 ) GeV^-2
-TOTAL : 1.380519 sec
- 4,277,155,433 cycles # 3.092 GHz
- 12,769,582,721 instructions # 2.99 insn per cycle
- 1.385373229 seconds time elapsed
+TOTAL : 1.444845 sec
+ 4,413,243,936 cycles # 3.048 GHz
+ 12,753,279,090 instructions # 2.89 insn per cycle
+ 1.448548188 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 701) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe
@@ -116,19 +116,19 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.430847e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.687591e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.687591e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.302069e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.527386e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.527386e+05 ) sec^-1
 MeanMatrixElemValue = ( 3.422772e+01 +- 1.683421e+01 ) GeV^-2
-TOTAL : 0.502214 sec
- 1,557,572,574 cycles # 3.074 GHz
- 4,167,570,923 instructions # 2.68 insn per cycle
- 0.514108531 seconds time elapsed
+TOTAL : 0.520045 sec
+ 1,587,034,298 cycles # 3.031 GHz
+ 4,150,131,626 instructions # 2.62 insn per cycle
+ 0.524112370 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3693) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe
@@ -143,19 +143,19 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
 WARNING!
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.366004e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.582913e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.582913e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.200645e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.321913e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.321913e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.247132 sec - 735,373,188 cycles # 2.931 GHz - 1,698,741,216 instructions # 2.31 insn per cycle - 0.251948722 seconds time elapsed +TOTAL : 0.251605 sec + 730,914,549 cycles # 2.869 GHz + 1,680,419,285 instructions # 2.30 insn per cycle + 0.255420475 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3109) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe @@ -170,19 +170,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.814926e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.180518e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.180518e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.755367e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.077189e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.077189e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.234804 sec - 694,783,748 cycles # 2.909 GHz - 1,631,382,774 instructions # 2.35 insn per cycle - 0.239954292 seconds time elapsed +TOTAL : 0.235526 sec + 686,239,324 cycles # 2.873 GHz + 1,613,330,403 instructions # 2.35 insn per cycle + 0.239552259 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2954) (512y: 14) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe @@ -197,20 +197,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.811317e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.536250e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.536250e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.659534e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.342405e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.342405e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.309567 sec - 660,659,148 cycles # 2.110 GHz - 1,111,031,208 instructions # 1.68 insn per cycle - 0.314367440 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1530) (512y: 33) (512z: 2277) +TOTAL : 0.315940 sec + 653,116,203 cycles # 2.045 GHz + 1,094,435,423 instructions # 1.68 insn per cycle + 0.320064065 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1459) (512y: 27) (512z: 2277) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 1ed4c388b7..b12d09a085 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -35,60 +35,60 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-06-16_23:00:36 +DATE: 2023-07-18_22:49:20 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.864221e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.200538e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.353895e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.401447e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.229959e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.374243e+08 ) sec^-1 MeanMatrixElemValue = ( 3.402886e+01 +- 1.677500e+01 ) GeV^-2 -TOTAL : 0.468538 sec - 1,986,332,099 cycles # 2.873 GHz - 2,412,092,912 instructions # 1.21 insn per cycle - 0.749649403 seconds time elapsed +TOTAL : 0.443674 sec + 1,962,721,667 cycles # 2.933 GHz + 2,652,007,103 instructions # 1.35 insn per cycle + 0.733269571 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 161 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.066200e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.816896e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.937623e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.262517e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.821886e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.970952e+08 ) sec^-1 MeanMatrixElemValue = ( 4.166198e+02 +- 2.517590e+02 ) GeV^-2 -TOTAL : 0.506737 sec - 2,138,564,502 cycles # 2.876 GHz - 2,655,004,925 instructions # 1.24 insn per cycle - 0.801248487 seconds time elapsed +TOTAL : 0.484607 sec + 2,037,651,018 cycles # 2.866 GHz + 2,772,082,982 instructions # 1.36 insn per cycle + 0.769299729 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 5.619520e-01 -Avg ME (F77/CUDA) = 0.56225629188472226 -Relative difference = 0.0005414908830687532 +Avg ME (F77/CUDA) = 0.56225629328206139 +Relative difference = 0.0005414933696496947 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.218143e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.248221e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.248221e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.155470e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.181789e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.181789e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422773e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 1.366158 sec - 4,229,850,435 cycles # 3.088 GHz - 12,672,250,194 instructions # 3.00 insn per cycle - 1.370859371 seconds time elapsed +TOTAL : 1.438342 sec + 4,366,679,772 cycles # 3.029 GHz + 12,656,692,314 instructions # 2.90 insn per cycle + 1.443081537 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 648) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe @@ -102,20 +102,20 @@ Relative difference = 1.714833339642312e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.810631e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.132301e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.132301e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.651322e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.932846e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.932846e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422772e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 0.450376 sec - 1,402,013,020 cycles # 3.084 GHz - 4,014,545,759 instructions # 2.86 insn per cycle - 0.461135794 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3449) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.470070 sec + 1,425,098,128 cycles # 3.024 GHz + 3,997,006,730 instructions # 2.80 insn per cycle + 0.474958236 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3448) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -128,19 +128,19 @@ Relative difference = 4.327561348062349e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.649698e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.339892e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.339892e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.509776e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.152960e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.152960e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.310950 sec - 917,414,969 cycles # 2.911 GHz - 1,944,404,967 instructions # 2.12 insn per cycle - 0.321816236 seconds time elapsed +TOTAL : 0.318346 sec + 911,647,543 cycles # 2.842 GHz + 1,926,936,353 instructions # 2.11 insn per cycle + 0.323109774 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3708) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 7.389204774233901e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = 
SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.731171e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.478338e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.478338e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.770278e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.474762e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.474762e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.306986 sec - 888,396,059 cycles # 2.858 GHz - 1,866,890,226 instructions # 2.10 insn per cycle - 0.318202607 seconds time elapsed +TOTAL : 0.304991 sec + 875,477,557 cycles # 2.849 GHz + 1,849,411,774 instructions # 2.11 insn per cycle + 0.309728070 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3561) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 7.389204774233901e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.528801e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.968883e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.968883e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.411391e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.826212e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.826212e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.384950 sec - 796,333,894 cycles # 2.047 GHz - 1,364,853,040 instructions # 1.71 insn per cycle - 0.390179000 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2080) (512y: 25) (512z: 2631) +TOTAL : 0.394437 sec + 787,402,709 cycles # 1.986 GHz + 1,347,443,854 instructions # 1.71 insn per cycle + 0.399399367 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2000) (512y: 19) (512z: 2631) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index a86220883d..04fec1d98e 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-06-16_23:00:52 +DATE: 2023-07-18_22:49:35 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.481426e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.487957e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.943892e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.709586e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.452250e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.859261e+07 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.474499 sec - 2,055,268,717 cycles # 2.894 GHz - 2,489,636,002 instructions # 1.21 insn per cycle - 0.767461965 seconds time elapsed +TOTAL : 0.450283 sec + 1,941,801,386 cycles # 2.921 GHz + 2,647,661,073 instructions # 1.36 insn per cycle + 0.729442761 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.380889e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.587624e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.036549e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.371849e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.637490e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.116945e+07 ) sec^-1 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2 -TOTAL : 0.556289 sec - 2,350,861,575 cycles # 2.922 GHz - 2,945,489,310 instructions # 1.25 insn per cycle - 0.862355038 seconds time elapsed +TOTAL : 0.539348 sec + 2,289,544,773 cycles # 2.936 GHz + 3,186,092,891 instructions # 1.39 insn per cycle + 0.838776921 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -76,19 +76,19 @@ Relative difference = 2.782658397826986e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.161099e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.194017e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.194017e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.114215e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.138593e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.138593e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 1.441531 sec - 4,441,671,913 cycles # 3.079 GHz - 12,830,518,319 instructions # 2.89 insn per cycle - 1.448679311 seconds time elapsed +TOTAL : 1.492939 sec + 4,550,289,046 cycles # 3.040 GHz + 12,789,410,611 instructions # 2.81 insn per cycle + 1.498177791 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 708) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe @@ -102,19 +102,19 @@ Relative difference = 2.608483884671339e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 
11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.046596e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.158375e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.158375e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.053669e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.136903e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.136903e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.825457 sec - 2,446,304,474 cycles # 2.949 GHz - 6,976,512,368 instructions # 2.85 insn per cycle - 0.837405524 seconds time elapsed +TOTAL : 0.822011 sec + 2,476,568,425 cycles # 3.006 GHz + 6,934,481,519 instructions # 2.80 insn per cycle + 0.827155036 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe @@ -128,20 +128,20 @@ Relative difference = 2.608483884671339e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.998918e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.420645e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.420645e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.012591e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.333162e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.333162e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.433627 sec - 1,261,017,634 cycles # 2.881 GHz - 2,809,295,715 instructions # 2.23 insn per cycle - 0.438687115 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2872) (512y: 0) (512z: 0) +TOTAL : 0.433321 sec + 1,239,207,460 cycles # 2.848 GHz + 2,766,742,055 instructions # 2.23 insn per cycle + 0.438634153 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2871) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -154,19 +154,19 @@ Relative difference = 2.777561258016791e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.490422e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.009169e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.009169e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.497352e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.897319e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.897319e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.388524 sec - 1,143,580,715 cycles # 2.915 GHz - 2,667,697,890 instructions # 2.33 insn per cycle - 0.393399213 seconds time elapsed +TOTAL : 0.388181 sec + 1,109,768,744 cycles # 2.845 GHz + 2,623,649,502 instructions # 2.36 insn per cycle + 0.393502237 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2679) (512y: 60) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 2.777561258016791e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.897672e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.110284e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.110284e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.876331e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.037245e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.037245e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.591572 sec - 1,191,668,768 cycles # 2.001 GHz - 1,710,006,963 instructions # 1.43 insn per cycle - 0.596717394 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1441) (512y: 85) (512z: 2182) +TOTAL : 0.595472 sec + 1,159,182,207 cycles # 1.940 GHz + 1,669,017,207 instructions # 1.44 insn per cycle + 0.600912637 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1450) (512y: 86) (512z: 2181) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index d5360b871e..2b5a0b6c6d 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-06-16_23:01:09 +DATE: 2023-07-18_22:49:51 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.438523e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.296220e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.712443e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.689669e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.339263e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.704986e+07 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.473737 sec - 2,022,437,137 cycles # 2.902 GHz - 2,442,479,633 instructions # 1.21 insn per cycle - 0.756210762 seconds time elapsed +TOTAL : 0.450242 sec + 1,949,757,721 cycles # 2.931 GHz + 2,686,904,735 instructions # 1.38 insn per cycle + 0.728612654 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.355982e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.498538e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.934566e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.331006e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.447818e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.909061e+07 ) sec^-1 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2 -TOTAL : 0.556362 sec - 2,332,917,861 cycles # 2.900 GHz - 2,935,857,970 instructions # 1.26 insn per cycle - 0.862169599 seconds time elapsed +TOTAL : 0.533944 sec + 2,280,719,611 cycles # 2.953 GHz + 3,146,800,171 instructions # 1.38 insn per cycle + 0.829596740 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -76,19 +76,19 @@ Relative difference = 2.782658397826986e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.170795e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.204914e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.204914e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.125391e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.150066e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.150066e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 1.422352 sec - 4,386,288,323 cycles # 3.077 GHz - 12,708,814,441 instructions # 2.90 insn per cycle - 1.427223028 seconds time elapsed +TOTAL : 1.478266 sec + 4,503,135,010 cycles # 3.039 GHz + 12,670,550,662 instructions # 2.81 insn per cycle + 1.483617158 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 659) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe @@ -102,19 +102,19 @@ Relative difference = 2.608483884671339e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 
11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.148376e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.266083e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.266083e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.086785e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.174219e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.174219e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.786032 sec - 2,382,887,401 cycles # 3.016 GHz - 6,777,982,529 instructions # 2.84 insn per cycle - 0.796666620 seconds time elapsed +TOTAL : 0.809175 sec + 2,426,229,799 cycles # 2.989 GHz + 6,736,787,469 instructions # 2.78 insn per cycle + 0.814615473 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3010) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe @@ -128,20 +128,20 @@ Relative difference = 2.608483884671339e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.604898e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.935937e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.935937e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.296004e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.526619e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.526619e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.478024 sec - 1,400,367,251 cycles # 2.905 GHz - 2,987,020,671 instructions # 2.13 insn per cycle - 0.488917156 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3010) (512y: 0) (512z: 0) +TOTAL : 0.521966 sec + 1,383,320,776 cycles # 2.637 GHz + 2,945,181,934 instructions # 2.13 insn per cycle + 0.526914313 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3009) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -154,19 +154,19 @@ Relative difference = 2.777561258016791e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.857659e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.247357e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.247357e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.859313e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.151231e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.151231e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.448265 sec - 1,311,795,258 cycles # 2.900 GHz - 2,871,509,338 instructions # 2.19 insn per cycle - 0.453504742 seconds time elapsed +TOTAL : 0.448380 sec + 1,279,235,194 cycles # 2.838 GHz + 2,829,550,880 instructions # 2.21 insn per cycle + 0.453442975 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2738) (512y: 216) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe @@ -180,19 +180,19 @@ Relative difference = 2.777561258016791e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.874954e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.085752e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.085752e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.833583e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.996794e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.996794e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.594897 sec - 1,202,081,331 cycles # 2.008 GHz - 1,864,012,772 instructions # 1.55 insn per cycle - 0.599852743 seconds time elapsed +TOTAL : 0.604455 sec + 1,180,479,629 cycles # 1.945 GHz + 1,824,076,834 instructions # 1.55 insn per cycle + 0.609837052 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1344) (512y: 191) (512z: 2311) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe From 18ffff2215507e7b30debf1c1ecffd3a47b7ba6d Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 19 Jul 2023 09:14:28 +0200 Subject: [PATCH 387/509] [jthip] rerun 15 tmad alltees (for cuda/c++) after including HIP, all looks ok STARTED AT Tue 
Jul 18 11:37:16 PM CEST 2023 ENDED AT Wed Jul 19 03:52:57 AM CEST 2023 Note, the ggttgg tests fail as expected --- .../log_eemumu_mad_d_inl0_hrd0.txt | 176 ++++++------- .../log_eemumu_mad_f_inl0_hrd0.txt | 184 +++++++------- .../log_eemumu_mad_m_inl0_hrd0.txt | 180 +++++++------- .../log_ggtt_mad_d_inl0_hrd0.txt | 174 ++++++------- .../log_ggtt_mad_f_inl0_hrd0.txt | 182 +++++++------- .../log_ggtt_mad_m_inl0_hrd0.txt | 178 ++++++------- .../log_ggttg_mad_d_inl0_hrd0.txt | 224 ++++++++--------- .../log_ggttg_mad_f_inl0_hrd0.txt | 232 ++++++++--------- .../log_ggttg_mad_m_inl0_hrd0.txt | 234 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 42 ++-- .../log_ggttgg_mad_f_inl0_hrd0.txt | 42 ++-- .../log_ggttgg_mad_m_inl0_hrd0.txt | 44 ++-- .../log_ggttggg_mad_d_inl0_hrd0.txt | 180 +++++++------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 188 +++++++------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 174 ++++++------- 15 files changed, 1217 insertions(+), 1217 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 0ab7ae8748..3ba8c07f53 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -4,26 +4,26 @@ CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
@@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-06-16_23:34:34 +DATE: 2023-07-18_23:37:40 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.0320s - [COUNTERS] Fortran Overhead ( 0 ) : 0.0199s - [COUNTERS] Fortran MEs ( 1 ) : 0.0121s for 8192 events => throughput is 6.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.0338s + [COUNTERS] Fortran Overhead ( 0 ) : 0.0220s + [COUNTERS] Fortran MEs ( 1 ) : 0.0118s for 8192 events => throughput is 6.96E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,8 +83,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1743s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1627s + [COUNTERS] PROGRAM TOTAL : 0.1766s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1650s [COUNTERS] Fortran MEs ( 1 ) : 0.0116s for 8192 events => throughput is 7.07E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4602s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3321s - [COUNTERS] Fortran MEs ( 1 ) : 0.1281s for 90112 events => throughput is 7.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4526s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3272s + [COUNTERS] Fortran MEs ( 1 ) : 0.1254s for 90112 events => throughput is 7.19E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1857s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1798s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 8192 events => throughput is 1.39E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1852s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1792s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4020s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3381s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0639s for 90112 events => throughput is 1.41E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4048s + 
[COUNTERS] Fortran Overhead ( 0 ) : 0.3392s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0656s for 90112 events => throughput is 1.37E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -179,14 +179,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813628E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.336870e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.321006e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.374247e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.339172e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1985s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1956s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.92E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1790s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1760s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.73E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3681s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3369s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0312s for 90112 events => throughput is 2.89E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3712s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3377s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0335s for 90112 events => throughput is 2.69E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -255,14 +255,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813628E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.728154e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.636316e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.907973e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.734490e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1716s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1699s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.87E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1853s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1836s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.82E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3546s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3373s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 90112 events => throughput is 5.21E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3724s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3536s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0187s for 90112 events => throughput is 4.81E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -331,14 +331,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813656E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.040980e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.167177e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.910901e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.517494e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1706s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1692s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0014s for 8192 events => throughput is 5.70E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1775s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1760s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0015s for 8192 events => throughput is 5.41E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3592s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3430s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0162s for 90112 events => throughput is 5.56E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3530s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3366s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0164s for 90112 events => throughput is 5.49E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -407,14 +407,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813656E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.443652e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.516918e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.102617e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.847457e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1741s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1723s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.43E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1782s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1766s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0016s for 8192 events => throughput is 5.09E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3577s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3393s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0184s for 90112 events => throughput is 4.89E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3585s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3398s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0187s for 90112 events => throughput is 4.83E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -483,14 +483,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813656E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.745967e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.863366e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.387840e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.240767e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.6235s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6230s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.65E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6019s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6014s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.67E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7970s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7922s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.87E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7631s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7582s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.85E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -559,43 +559,43 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813628E-002 OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.108671e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.246491e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.375732e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.345799e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.977014e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.752050e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.036419e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.022855e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.981844e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.756155e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.078134e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.083354e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.944271e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.751711e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.976379e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.022807e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt 
b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 6e3da73554..f44c753bdf 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -2,30 +2,30 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/e CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
@@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-06-16_23:34:51 +DATE: 2023-07-18_23:37:56 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.0316s - [COUNTERS] Fortran Overhead ( 0 ) : 0.0200s - [COUNTERS] Fortran MEs ( 1 ) : 0.0115s for 8192 events => throughput is 7.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.0306s + [COUNTERS] Fortran Overhead ( 0 ) : 0.0189s + [COUNTERS] Fortran MEs ( 1 ) : 0.0117s for 8192 events => throughput is 6.98E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1738s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1623s - [COUNTERS] Fortran MEs ( 1 ) : 0.0115s for 8192 events => throughput is 7.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1813s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1694s + [COUNTERS] Fortran MEs ( 1 ) : 0.0119s for 8192 events => throughput is 6.89E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4892s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3640s - [COUNTERS] Fortran MEs ( 1 ) : 0.1253s for 90112 events => throughput is 7.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4607s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3351s + [COUNTERS] Fortran MEs ( 1 ) : 0.1256s for 90112 events => throughput is 7.17E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166140620297] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1765s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1709s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0056s for 8192 events => throughput is 1.45E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1867s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1809s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0058s for 8192 events => throughput is 1.41E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501907784661565E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4045s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3421s - [COUNTERS] CudaCpp MEs 
( 2 ) : 0.0625s for 90112 events => throughput is 1.44E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4048s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3394s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0654s for 90112 events => throughput is 1.38E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -179,14 +179,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501907784661565E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.366086e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.328859e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.411236e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.356177e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165549479658] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1681s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1666s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0016s for 8192 events => throughput is 5.27E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1757s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1739s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.77E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905692857932E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3520s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3347s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0172s for 90112 events => throughput is 5.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3550s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3368s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0182s for 90112 events => throughput is 4.94E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -255,14 +255,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501905692857932E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.178390e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.042708e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.517565e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.405752e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165569099927] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1682s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1674s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.72E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1804s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1795s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.00E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905658047333E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3407s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3314s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 90112 events => throughput is 9.61E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3621s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3525s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0096s for 90112 events => throughput is 9.36E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -331,14 +331,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501905658047333E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.065328e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.526379e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.184488e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.122017e+07 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165569099927] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1673s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1665s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.03E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.1898s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1889s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.31E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905658047333E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3433s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3345s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0087s for 90112 events => throughput is 1.03E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.3555s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3463s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0093s for 90112 events => throughput is 9.73E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -407,14 +407,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501905658047333E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.074978e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.962410e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.114652e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.882310e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166431914253] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1711s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1701s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 8.67E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1889s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1878s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0011s for 8192 events => throughput is 7.26E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501909358591468E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3454s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3353s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0102s for 90112 events => throughput is 8.87E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3574s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3468s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0106s for 90112 events => throughput is 8.50E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -483,14 +483,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501909358591468E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.877305e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.208516e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.149185e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.080579e+07 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166796068879] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.6303s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6298s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.72E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5963s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5958s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.74E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501910316213061E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7969s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7923s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 90112 events => throughput is 1.98E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7633s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7588s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 90112 events => throughput is 1.99E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -559,43 +559,43 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501910316213061E-002 OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.510070e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.595045e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.150527e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.237305e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.548906e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.038428e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.466611e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.513799e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.695373e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.050612e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.743025e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.771650e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.786521e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.393159e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.372126e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.502447e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt 
b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 08471d1c00..0bbeab2435 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -2,23 +2,23 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/e CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' @@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-06-16_23:35:08 +DATE: 2023-07-18_23:38:12 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.0313s - [COUNTERS] Fortran Overhead ( 0 ) : 0.0197s - [COUNTERS] Fortran MEs ( 1 ) : 0.0115s for 8192 events => throughput is 7.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.0308s + [COUNTERS] Fortran Overhead ( 0 ) : 0.0189s + [COUNTERS] Fortran MEs ( 1 ) : 0.0120s for 8192 events => throughput is 6.83E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1766s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1651s - [COUNTERS] Fortran MEs ( 1 ) : 0.0116s for 8192 events => throughput is 7.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1824s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1710s + [COUNTERS] Fortran MEs ( 1 ) : 0.0113s for 8192 events => throughput is 7.24E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4702s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3402s - [COUNTERS] Fortran MEs ( 1 ) : 0.1300s for 90112 events => throughput is 6.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4718s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3410s + [COUNTERS] Fortran MEs ( 1 ) : 0.1308s for 90112 events => throughput is 6.89E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211728] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1796s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1734s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1923s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1857s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.25E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 
1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4089s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3434s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0655s for 90112 events => throughput is 1.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4100s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3412s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0688s for 90112 events => throughput is 1.31E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -179,14 +179,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919915927155E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.316240e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.265020e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.353376e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.281271e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211728] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1732s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1703s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.85E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1791s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1762s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.79E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3857s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3519s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0338s for 90112 events => throughput is 2.66E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3721s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3394s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0327s for 90112 events => throughput is 2.75E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -255,14 +255,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919915927155E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.780240e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.694313e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.942864e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.848827e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1715s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1699s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0016s for 8192 events => throughput is 5.22E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1760s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1744s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0016s for 8192 events => throughput is 5.15E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3543s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3369s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0174s for 90112 events => throughput is 5.17E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3516s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3346s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0170s for 90112 events => throughput is 5.30E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -331,14 +331,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919908700741E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.231827e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.287208e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.881656e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.870563e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1708s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1692s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0015s for 8192 events => throughput is 5.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1776s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1761s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0015s for 8192 events => throughput is 5.31E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3556s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3393s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0164s for 90112 events => throughput is 5.51E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3545s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3385s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0161s for 90112 events => throughput is 5.61E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -407,14 +407,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919908700741E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.595835e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.660074e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.329466e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.277308e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1706s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1690s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0016s for 8192 events => throughput is 5.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1786s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1769s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.95E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3594s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3416s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0178s for 90112 events => throughput is 5.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3740s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3541s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0200s for 90112 events => throughput is 4.51E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -483,14 +483,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919908700741E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.773234e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.595633e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.345326e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.210377e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169066587255] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.6290s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6285s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.67E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5975s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5970s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.63E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919911173610E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7937s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7888s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.83E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7608s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7560s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.89E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -559,43 +559,43 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919911173610E-002 OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical

 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.341832e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.178199e+07 ) sec^-1

 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.367767e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.475523e+08 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.004324e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.774650e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.022759e+09 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.034605e+09 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.001795e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.742018e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.064728e+09 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.080919e+09 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.982012e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.757270e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.000317e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.990514e+08 ) sec^-1

 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt
index 9da4005516..b8ffb59aa2 100644
--- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt
@@ -3,25 +3,25 @@ CUDACPP_BUILDDIR='.'
+
 make USEBUILDDIR=1 AVX=none
 make USEBUILDDIR=1 AVX=sse4
-
 make USEBUILDDIR=1 AVX=avx2
 make USEBUILDDIR=1 AVX=512y
+
+make USEBUILDDIR=1 AVX=512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-
-make USEBUILDDIR=1 AVX=512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
@@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/

 OMP_NUM_THREADS=

-DATE: 2023-06-16_23:35:24
+DATE: 2023-07-18_23:38:28

-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx

 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [] fbridge_mode=0
- [COUNTERS] PROGRAM TOTAL : 0.2030s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1493s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0538s for 8192 events => throughput is 1.52E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2086s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1537s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0549s for 8192 events => throughput is 1.49E+05 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3135s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2598s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0536s for 8192 events => throughput is 1.53E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3201s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2645s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0557s for 8192 events => throughput is 1.47E+05 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.7753s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.1856s
- [COUNTERS] Fortran MEs ( 1 ) : 0.5897s for 90112 events => throughput is 1.53E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.7911s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.1844s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.6067s for 90112 events => throughput is 1.49E+05 events/s

 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3502s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3091s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0411s for 8192 events => throughput is 1.99E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3551s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3121s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0430s for 8192 events => throughput is 1.90E+05 events/s

 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.6970s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2435s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.4535s for 90112 events => throughput is 1.99E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.7071s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2348s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.4724s for 90112 events => throughput is 1.91E+05 events/s

 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -179,14 +179,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775379) differ b
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.992097e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.927052e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.951938e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.914151e+05 ) sec^-1

 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3095s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2872s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0223s for 8192 events => throughput is 3.68E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3154s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2918s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0236s for 8192 events => throughput is 3.47E+05 events/s

 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.4568s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2139s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2429s for 90112 events => throughput is 3.71E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.4706s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2169s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2537s for 90112 events => throughput is 3.55E+05 events/s

 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -255,14 +255,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775379) differ b
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.737410e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.493125e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.630060e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.573105e+05 ) sec^-1

 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2885s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2762s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0123s for 8192 events => throughput is 6.65E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2920s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2792s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.41E+05 events/s

 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.3388s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2039s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1350s for 90112 events => throughput is 6.68E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3508s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2114s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1395s for 90112 events => throughput is 6.46E+05 events/s

 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -331,14 +331,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775393) differ b
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.378851e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.192968e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.500232e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.297627e+05 ) sec^-1

 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2876s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2765s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0111s for 8192 events => throughput is 7.39E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2894s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2780s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0114s for 8192 events => throughput is 7.19E+05 events/s

 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.3666s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2422s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1244s for 90112 events => throughput is 7.24E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3333s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2059s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1274s for 90112 events => throughput is 7.07E+05 events/s

 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -407,14 +407,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775393) differ b
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.995934e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.981238e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.995046e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.001926e+05 ) sec^-1

 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3014s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2827s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0186s for 8192 events => throughput is 4.39E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3041s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2853s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0187s for 8192 events => throughput is 4.37E+05 events/s

 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.4130s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2131s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1999s for 90112 events => throughput is 4.51E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.4179s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2121s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2058s for 90112 events => throughput is 4.38E+05 events/s

 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -483,14 +483,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775393) differ b
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.352063e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.159932e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.264252e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.111311e+05 ) sec^-1

 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,8 +513,8 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.7320s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7314s
+ [COUNTERS] PROGRAM TOTAL : 0.6947s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.6941s
  [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.38E+07 events/s

 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -546,9 +546,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.6728s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6663s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.39E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6422s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6357s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.38E+07 events/s

 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -559,43 +559,43 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775386) differ b
 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.883699e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.080408e+07 ) sec^-1

 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.569644e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.652930e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.587002e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.002488e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.074126e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.077157e+08 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.570667e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.019063e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.153037e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.153703e+08 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.562530e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.006798e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.066634e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.037386e+07 ) sec^-1

 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
index 01adf8925b..475170c8e7 100644
--- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
@@ -2,30 +2,30 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g
 CUDACPP_BUILDDIR='.'

-
 make USEBUILDDIR=1 AVX=none
+
+
 make USEBUILDDIR=1 AVX=sse4
 make USEBUILDDIR=1 AVX=avx2
-
 make USEBUILDDIR=1 AVX=512y
+
+make USEBUILDDIR=1 AVX=512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-
-make USEBUILDDIR=1 AVX=512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
@@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/

 OMP_NUM_THREADS=

-DATE: 2023-06-16_23:35:50
+DATE: 2023-07-18_23:38:53

-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx

 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [] fbridge_mode=0
- [COUNTERS] PROGRAM TOTAL : 0.2007s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1470s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0537s for 8192 events => throughput is 1.52E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2033s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1483s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0550s for 8192 events => throughput is 1.49E+05 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3500s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2965s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0535s for 8192 events => throughput is 1.53E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3181s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2631s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0550s for 8192 events => throughput is 1.49E+05 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.7703s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.1795s
- [COUNTERS] Fortran MEs ( 1 ) : 0.5908s for 90112 events => throughput is 1.53E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.7940s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.1856s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.6085s for 90112 events => throughput is 1.48E+05 events/s

 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690706211693573] fbridge_mode=1
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3412s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3013s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0398s for 8192 events => throughput is 2.06E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3506s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3088s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0418s for 8192 events => throughput is 1.96E+05 events/s

 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -166,9 +166,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223782418787778] fbridge_mode=1
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.6722s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2335s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.4387s for 90112 events => throughput is 2.05E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6937s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2350s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.4586s for 90112 events => throughput is 1.96E+05 events/s

 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -179,14 +179,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782418787778) differ b
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.020933e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.981514e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.051565e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.971620e+05 ) sec^-1

 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +209,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690707641465352] fbridge_mode=1
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2975s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2808s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0167s for 8192 events => throughput is 4.90E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2964s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2807s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0157s for 8192 events => throughput is 5.23E+05 events/s

 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -242,9 +242,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223786452345514] fbridge_mode=1
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.3782s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2071s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1711s for 90112 events => throughput is 5.27E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3819s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2092s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1727s for 90112 events => throughput is 5.22E+05 events/s

 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -255,14 +255,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223786452345514) differ b
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.159675e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.002821e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.216268e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.987254e+05 ) sec^-1

 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +285,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690698819656767] fbridge_mode=1
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2780s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2713s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 8192 events => throughput is 1.23E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2792s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2723s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 8192 events => throughput is 1.18E+06 events/s

 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -318,9 +318,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223782736292961] fbridge_mode=1
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.2805s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2059s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0747s for 90112 events => throughput is 1.21E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3015s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2231s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0784s for 90112 events => throughput is 1.15E+06 events/s

 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -331,14 +331,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782736292961) differ b
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.150882e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.149527e+06 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.149098e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.153994e+06 ) sec^-1

 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +361,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690698819656767] fbridge_mode=1
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2758s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2693s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 8192 events => throughput is 1.27E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2801s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2738s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 8192 events => throughput is 1.28E+06 events/s

 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -394,9 +394,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223782736292961] fbridge_mode=1
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.2778s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2090s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0688s for 90112 events => throughput is 1.31E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.2660s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.1954s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0705s for 90112 events => throughput is 1.28E+06 events/s

 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -407,14 +407,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782736292961) differ b
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.195362e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.250744e+06 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.247695e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.249123e+06 ) sec^-1

 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +437,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690703490151122] fbridge_mode=1
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2869s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2776s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.73E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2877s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2778s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0099s for 8192 events => throughput is 8.26E+05 events/s

 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -470,9 +470,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223787021597481] fbridge_mode=1
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.3127s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2100s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1027s for 90112 events => throughput is 8.77E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3074s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2030s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1044s for 90112 events => throughput is 8.63E+05 events/s

 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -483,14 +483,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223787021597481) differ b
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.078289e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.010195e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.098096e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.128602e+05 ) sec^-1

 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +513,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690703397697980] fbridge_mode=1
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.7221s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7216s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.46E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.6976s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.6971s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.49E+07 events/s

 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -546,9 +546,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223786763175951] fbridge_mode=1
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.6678s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6624s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 90112 events => throughput is 1.65E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6292s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6236s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 90112 events => throughput is 1.63E+07 events/s

 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -559,43 +559,43 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223786763175951) differ b
 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.070407e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.208611e+07 ) sec^-1

 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.904132e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.988221e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.021955e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.833331e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.726468e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.760079e+08 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.093398e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.803014e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.841474e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.849470e+08 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.753742e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.374876e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.383547e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.461091e+07 ) sec^-1

 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
index bcc7cef90a..3216bfef7c 100644
--- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
@@ -3,25 +3,25 @@ CUDACPP_BUILDDIR='.'
+
 make USEBUILDDIR=1 AVX=none
 make USEBUILDDIR=1 AVX=sse4
-
 make USEBUILDDIR=1 AVX=avx2
 make USEBUILDDIR=1 AVX=512y
+
+make USEBUILDDIR=1 AVX=512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-
-make USEBUILDDIR=1 AVX=512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
@@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/

 OMP_NUM_THREADS=

-DATE: 2023-06-16_23:36:15
+DATE: 2023-07-18_23:39:17

-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx

 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [] fbridge_mode=0
- [COUNTERS] PROGRAM TOTAL : 0.2020s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1481s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0539s for 8192 events => throughput is 1.52E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2029s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1480s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0549s for 8192 events => throughput is 1.49E+05 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3175s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2632s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0543s for 8192 events => throughput is 1.51E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3192s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2633s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0559s for 8192 events => throughput is 1.47E+05 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.7727s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.1825s
- [COUNTERS] Fortran MEs ( 1 ) : 0.5903s for 90112 events => throughput is 1.53E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.7890s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.1845s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.6045s for 90112 events => throughput is 1.49E+05 events/s

 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3466s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3052s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0414s for 8192 events => throughput is 1.98E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3521s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3084s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0437s for 8192 events => throughput is 1.87E+05 events/s

 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.6933s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2360s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.4573s for 90112 events => throughput is 1.97E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.7142s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2357s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.4786s for 90112 events => throughput is 1.88E+05 events/s

 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -179,14 +179,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223783635280988) differ b
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.952836e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.900675e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.944744e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.903091e+05 ) sec^-1

 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3071s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2854s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0217s for 8192 events => throughput is 3.77E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3140s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2916s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0224s for 8192 events => throughput is 3.65E+05 events/s

 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.4646s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2169s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2477s for 90112 events => throughput is 3.64E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.4613s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2119s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2494s for 90112 events => throughput is 3.61E+05 events/s

 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -255,14 +255,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223783635280988) differ b
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.726713e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.514697e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.678276e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.531616e+05 ) sec^-1

 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2880s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2760s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0120s for 8192 events => throughput is 6.83E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2903s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2774s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0129s for 8192 events => throughput is 6.34E+05 events/s

 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.3686s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2290s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1396s for 90112 events => throughput is 6.46E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3401s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2030s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1371s for 90112 events => throughput is 6.57E+05 events/s

 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -331,14 +331,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223783652032040) differ b
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.474630e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.270392e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.428506e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.382141e+05 ) sec^-1

 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2848s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2738s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0110s for 8192 events => throughput is 7.42E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3051s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2930s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0120s for 8192 events => throughput is 6.81E+05 events/s

 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.3245s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2028s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1217s for 90112 events => throughput is 7.40E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3268s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2029s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1238s for 90112 events => throughput is 7.28E+05 events/s

 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -407,14 +407,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223783652032040) differ b
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.064546e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.042794e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.181463e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.134968e+05 ) sec^-1

 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3000s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2818s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0182s for 8192 events => throughput is 4.50E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3025s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2836s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0190s for 8192 events => throughput is 4.32E+05 events/s

 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.4141s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2163s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1978s for 90112 events => throughput is 4.55E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.4160s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2108s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2053s for 90112 events => throughput is 4.39E+05 events/s

 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -483,14 +483,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223783652032040) differ b
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.215527e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.195485e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.203869e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.105555e+05 ) sec^-1

 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +513,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690708266690699] fbridge_mode=1
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.7253s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7248s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.40E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.6959s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.6953s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.41E+07 events/s

 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -546,9 +546,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223782303744791] fbridge_mode=1
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.6584s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6519s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.39E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6281s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6217s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s

 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -559,43 +559,43 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782303744791) differ b
 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.858157e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.072602e+07 ) sec^-1

 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.585975e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.628306e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.575302e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.002417e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.048555e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.056552e+08 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.576352e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.008120e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.132080e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.133647e+08 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.576975e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.014244e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.969589e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.974355e+07 ) sec^-1

 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
index 52294c86ed..8868aa1905 100644
--- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
@@ -7,18 +7,19 @@ make USEBUILDDIR=1 AVX=none
 make USEBUILDDIR=1 AVX=sse4
 make USEBUILDDIR=1 AVX=avx2
+
 make USEBUILDDIR=1 AVX=512y
+make USEBUILDDIR=1 AVX=512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-
-make USEBUILDDIR=1 AVX=512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
@@ -27,15 +28,14 @@ make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'

 OMP_NUM_THREADS=

-DATE: 2023-06-16_23:36:41
+DATE: 2023-07-18_23:39:42

-On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg

 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.0972 [] fbridge_mode=0
- [COUNTERS] PROGRAM TOTAL : 0.5432s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1981s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3452s for 8192 events => throughput is 2.37E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5221s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1707s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3515s for 8192 events => throughput is 2.33E+04 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -81,11 +81,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7195719386171234E-002] fbridge_mode=0
+ [XSECTION] Cross section = 0.0972 [9.7196357922470805E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 40 events (found 738 events)
- [COUNTERS] PROGRAM TOTAL : 0.5864s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2409s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3455s for 8192 events => throughput is 2.37E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5998s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2469s + [COUNTERS] Fortran MEs ( 1 ) : 0.3529s for 8192 events => throughput is 2.32E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -106,11 +106,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872844967921E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.1533s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3710s - [COUNTERS] Fortran MEs ( 1 ) : 3.7823s for 90112 events => throughput is 2.38E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.2309s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3729s + [COUNTERS] Fortran MEs ( 1 ) : 3.8580s for 90112 events => throughput is 2.34E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -131,15 +131,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195719386171234E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196357922470791E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.8655s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5504s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3151s for 8192 events => throughput is 2.60E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8963s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5674s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3288s for 8192 events => throughput is 2.49E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195719386171234E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196357922470791E-002) differ by less than 2E-14 (1.1102230246251565e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -164,29 +164,29 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872844967963E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.2253s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7057s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.5196s for 90112 events => throughput is 2.56E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3437s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7009s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.6428s for 90112 events => throughput is 2.47E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310872844967963E-002) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655597E-002) differ by less than 2E-14 (4.440892098500626e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.634187e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.551954e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.657523e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.532896e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -207,15 +207,15 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195719386171234E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.5827s + [COUNTERS] PROGRAM TOTAL : 0.5879s [COUNTERS] Fortran Overhead ( 0 ) : 0.4172s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1654s for 8192 events => throughput is 4.95E+04 events/s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1707s for 8192 events => throughput is 4.80E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195719386171234E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196357922470777E-002) differ by less than 2E-14 (3.3306690738754696e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -240,29 +240,29 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872844967921E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 3.4871s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5641s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.9229s for 90112 events => throughput is 4.69E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.4725s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5669s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.9056s for 90112 events => throughput is 4.73E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310872844967921E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.964351e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.885132e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.061720e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.909068e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -283,15 +283,15 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195719386171206E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.4029s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3246s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0783s for 8192 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4230s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3342s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0888s for 8192 events => throughput is 9.23E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195719386171206E-002) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196357922470750E-002) differ by less than 2E-14 (5.551115123125783e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -316,29 +316,29 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872844967907E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.3698s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4879s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8819s for 90112 events => throughput is 1.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3458s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4550s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8908s for 90112 events => throughput is 1.01E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310872844967907E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655541E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.047204e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.016436e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.053567e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.023538e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -359,15 +359,15 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195719386171206E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.3886s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3177s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0709s for 8192 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3872s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3152s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0720s for 8192 events => throughput is 1.14E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195719386171206E-002) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196357922470750E-002) differ by less than 2E-14 (5.551115123125783e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -392,29 +392,29 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872844967907E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.2440s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4672s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7768s for 90112 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2612s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4623s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7989s for 90112 events => throughput is 1.13E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310872844967907E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655541E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.177490e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.171611e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.191999e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.155393e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -435,15 +435,15 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195719386171234E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.4369s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3403s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0966s for 8192 events => throughput is 8.48E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4445s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3448s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0997s for 8192 events => throughput is 8.22E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195719386171234E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196357922470750E-002) differ by less than 2E-14 (5.551115123125783e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -468,29 +468,29 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872844967907E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.5408s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4804s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0604s for 90112 events => throughput is 8.50E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.5775s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4738s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1036s for 90112 events => throughput is 8.17E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310872844967907E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655541E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.369247e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.663037e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.461216e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.136831e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -511,15 +511,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195719386171220E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.7104s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7087s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.77E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6809s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6792s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.76E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195719386171220E-002) differ by less than 2E-14 (1.1102230246251565e-16) +OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196357922470764E-002) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -544,58 +544,58 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872844967977E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8754s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8562s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0193s for 90112 events => throughput is 4.68E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.8266s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8074s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0191s for 90112 events => throughput is 4.71E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310872844967977E-002) differ by less than 2E-14 (6.661338147750939e-16) +OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655597E-002) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.108652e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.265641e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.056830e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.394415e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.357483e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.627373e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.238541e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.236143e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.098545e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.636530e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.247043e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.245580e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.065031e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.631703e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.797581e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.755728e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index ac98f0132b..08af34efd8 100644 --- 
a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -4,38 +4,38 @@ CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - -make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2023-06-16_23:37:23 +DATE: 2023-07-18_23:40:23 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.5130s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1657s - [COUNTERS] Fortran MEs ( 1 ) : 0.3473s for 8192 events => throughput is 2.36E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5187s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1672s + [COUNTERS] Fortran MEs ( 1 ) : 0.3515s for 8192 events => throughput is 2.33E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +81,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195719386171234E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0972 [9.7196357922470805E-002] fbridge_mode=0 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.6055s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2536s - [COUNTERS] Fortran MEs ( 1 ) : 0.3519s for 8192 events => throughput is 2.33E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5925s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2438s + [COUNTERS] Fortran MEs ( 1 ) : 0.3487s for 8192 events => throughput is 2.35E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -106,11 +106,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872844967921E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.1398s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3678s - [COUNTERS] Fortran MEs ( 1 ) : 3.7720s for 90112 events => throughput is 2.39E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.2118s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3751s + [COUNTERS] Fortran MEs ( 1 ) : 3.8366s for 90112 events => throughput is 2.35E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -131,15 +131,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195711188152623E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196349725192449E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.8594s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5459s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3135s for 8192 events => throughput is 2.61E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8860s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5612s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3248s for 8192 events => throughput is 2.52E+04 
events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195711188152623E-002) differ by less than 4E-4 (8.434546971969326e-08) +OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196349725192449E-002) differ by less than 4E-4 (8.433729958845504e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -164,29 +164,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310861450156910E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310860682799649E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.1084s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6927s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.4157s for 90112 events => throughput is 2.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3128s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7147s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.5981s for 90112 events => throughput is 2.50E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310861450156910E-002) differ by less than 4E-4 (1.401388352029187e-07) +OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310860682799649E-002) differ by less than 4E-4 (1.4013938864909647e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.731642e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.584606e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.740748e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.572586e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -207,15 +207,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195687405490658E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196325939550202E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.4313s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3404s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0909s for 8192 events => throughput is 9.02E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4348s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3390s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0958s for 8192 events => throughput is 8.55E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195687405490658E-002) differ by less than 4E-4 (3.290338378425517e-07) +OK! 
xsec from fortran (9.7196357922470805E-002) and cpp (9.7196325939550202E-002) differ by less than 4E-4 (3.290547226919571e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -240,29 +240,29 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310854844234101E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310854076870026E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.5514s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4872s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0642s for 90112 events => throughput is 8.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.5193s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4678s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0516s for 90112 events => throughput is 8.57E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310854844234101E-002) differ by less than 4E-4 (2.2138163313645265e-07) +OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310854076870026E-002) differ by less than 4E-4 (2.213822711816249e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.703250e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.660520e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.982646e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.723830e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -283,15 +283,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195715140566227E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196353680794059E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.3260s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2868s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0392s for 8192 events => throughput is 2.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3269s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2857s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195715140566227E-002) differ by less than 4E-4 (4.368098749374383e-08) +OK! 
xsec from fortran (9.7196357922470805E-002) and cpp (9.7196353680794059E-002) differ by less than 4E-4 (4.364028483028193e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -316,29 +316,29 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310851236127482E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310850468770415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8684s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4327s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4357s for 90112 events => throughput is 2.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8622s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4138s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4484s for 90112 events => throughput is 2.01E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310851236127482E-002) differ by less than 4E-4 (2.6575585387877965e-07) +OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310850468770415E-002) differ by less than 4E-4 (2.657564061037121e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.029319e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.036815e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.080772e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.040467e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -359,15 +359,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195715140566227E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196353680794059E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.3161s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2805s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0356s for 8192 events => throughput is 2.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3180s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2816s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0365s for 8192 events => throughput is 2.25E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195715140566227E-002) differ by less than 4E-4 (4.368098749374383e-08) +OK! 
xsec from fortran (9.7196357922470805E-002) and cpp (9.7196353680794059E-002) differ by less than 4E-4 (4.364028483028193e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -392,29 +392,29 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310851236127482E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310850468770415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8280s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4335s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3946s for 90112 events => throughput is 2.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8119s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4094s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4025s for 90112 events => throughput is 2.24E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310851236127482E-002) differ by less than 4E-4 (2.6575585387877965e-07) +OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310850468770415E-002) differ by less than 4E-4 (2.657564061037121e-07) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.320491e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.259149e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.339938e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.266261e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -435,15 +435,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195727520443878E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196366042348534E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.3397s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2926s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0470s for 8192 events => throughput is 1.74E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3443s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2957s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0486s for 8192 events => throughput is 1.69E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195727520443878E-002) differ by less than 4E-4 (8.368961812443843e-08) +OK! 
xsec from fortran (9.7196357922470805E-002) and cpp (9.7196366042348534E-002) differ by less than 4E-4 (8.354096703300229e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -468,29 +468,29 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310861771879989E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310861004511001E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.9470s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4325s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5145s for 90112 events => throughput is 1.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9758s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4329s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5428s for 90112 events => throughput is 1.66E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310861771879989E-002) differ by less than 4E-4 (1.3618213090538234e-07) +OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310861004511001E-002) differ by less than 4E-4 (1.36182828569531e-07) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.739989e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.633621e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.725640e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.671653e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -511,15 +511,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195710869056637E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196349366365994E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.7105s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7096s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.53E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6848s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6840s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.62E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195710869056637E-002) differ by less than 4E-4 (8.762849490473457e-08) +OK! 
xsec from fortran (9.7196357922470805E-002) and cpp (9.7196349366365994E-002) differ by less than 4E-4 (8.802906814597833e-08) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -544,58 +544,58 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310865716831132E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310864949473968E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8990s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8894s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0097s for 90112 events => throughput is 9.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.8094s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8000s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 90112 events => throughput is 9.54E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310865716831132E-002) differ by less than 4E-4 (8.766523518222158e-08) +OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310864949473968E-002) differ by less than 4E-4 (8.766578696306482e-08) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.264826e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.312152e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.844127e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.855833e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.461458e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.626006e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.280416e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.302964e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.442329e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.660515e+07 ) 
sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.431551e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.514362e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.330311e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.487452e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.602408e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.622543e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index de3b503603..8f3227653b 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,41 +1,41 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' +make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=512y + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - -make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2023-06-16_23:38:00 +DATE: 2023-07-18_23:41:00 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.5113s + [COUNTERS] PROGRAM TOTAL : 0.5157s [COUNTERS] Fortran Overhead ( 0 ) : 0.1671s - [COUNTERS] Fortran MEs ( 1 ) : 0.3442s for 8192 events => throughput is 2.38E+04 events/s + [COUNTERS] Fortran MEs ( 1 ) : 0.3487s for 8192 events => throughput is 2.35E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +81,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195719386171234E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0972 [9.7196357922470805E-002] fbridge_mode=0 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.6040s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2440s - [COUNTERS] Fortran MEs ( 1 ) : 0.3600s for 8192 events => throughput is 2.28E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6183s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2503s + [COUNTERS] Fortran MEs ( 1 ) : 0.3680s for 8192 events => throughput is 2.23E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -106,11 +106,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872844967921E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.1624s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3666s - [COUNTERS] Fortran MEs ( 1 ) : 3.7958s for 90112 events => throughput is 2.37E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.2542s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3824s + [COUNTERS] Fortran MEs ( 1 ) : 3.8718s for 90112 events => throughput is 2.33E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -131,15 +131,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195720226233587E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358763382007E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.8805s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5588s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3217s for 8192 events => throughput is 2.55E+04 events/s + [COUNTERS] 
PROGRAM TOTAL : 0.9109s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5735s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3374s for 8192 events => throughput is 2.43E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195720226233587E-002) differ by less than 2E-4 (8.642997428864874e-09) +OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196358763382007E-002) differ by less than 2E-4 (8.651674043846924e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -164,29 +164,29 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310873602323142E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872835011053E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.2705s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7115s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.5590s for 90112 events => throughput is 2.53E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.4152s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7023s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.7128s for 90112 events => throughput is 2.43E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310873602323142E-002) differ by less than 2E-4 (9.314316651298782e-09) +OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872835011053E-002) differ by less than 2E-4 (9.31432020401246e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.610260e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.446575e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.619292e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.509101e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -207,15 +207,15 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195720267415450E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358804670396E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.5849s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4204s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1645s for 8192 events => throughput is 4.98E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5806s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4114s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1692s for 8192 events => throughput is 4.84E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (9.7195719386171234E-002) and cpp (9.7195720267415450E-002) differ by less than 2E-4 (9.066697836956905e-09) +OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196358804670396E-002) differ by less than 2E-4 (9.076467577529002e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -240,29 +240,29 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310873604102080E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872836789727E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 3.4135s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5611s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8524s for 90112 events => throughput is 4.86E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.4625s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5672s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8953s for 90112 events => throughput is 4.75E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310873604102080E-002) differ by less than 2E-4 (9.33619492826665e-09) +OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872836789727E-002) differ by less than 2E-4 (9.336195150311255e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.764708e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.922036e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.772730e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.953512e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -283,15 +283,15 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195720049465126E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358586501358E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.4005s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3219s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0786s for 8192 events => throughput is 1.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4053s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3251s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0802s for 8192 events => throughput is 1.02E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195720049465126E-002) differ by less than 2E-4 (6.824311782338555e-09) +OK! 
xsec from fortran (9.7196357922470805E-002) and cpp (9.7196358586501358E-002) differ by less than 2E-4 (6.831845977828266e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -316,29 +316,29 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310873476230255E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872708918333E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.3277s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4694s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8583s for 90112 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3314s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4457s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8857s for 90112 events => throughput is 1.02E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310873476230255E-002) differ by less than 2E-4 (7.76356601228656e-09) +OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872708918333E-002) differ by less than 2E-4 (7.763571563401683e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.054740e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.032189e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.057335e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.032333e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -359,15 +359,15 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195720049465126E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358586501358E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.3809s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3126s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0683s for 8192 events => throughput is 1.20E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3878s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3172s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0706s for 8192 events => throughput is 1.16E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195720049465126E-002) differ by less than 2E-4 (6.824311782338555e-09) +OK! 
xsec from fortran (9.7196357922470805E-002) and cpp (9.7196358586501358E-002) differ by less than 2E-4 (6.831845977828266e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -392,29 +392,29 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310873476230255E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872708918333E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.2562s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4769s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7793s for 90112 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2108s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4354s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7754s for 90112 events => throughput is 1.16E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310873476230255E-002) differ by less than 2E-4 (7.76356601228656e-09) +OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872708918333E-002) differ by less than 2E-4 (7.763571563401683e-09) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.193684e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.188297e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.221094e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.181150e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -435,15 +435,15 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195720220276491E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358757578441E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.4495s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3476s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1019s for 8192 events => throughput is 8.04E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4510s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3480s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1030s for 8192 events => throughput is 7.95E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195720220276491E-002) differ by less than 2E-4 (8.581707788835047e-09) +OK! 
xsec from fortran (9.7196357922470805E-002) and cpp (9.7196358757578441E-002) differ by less than 2E-4 (8.591964251181139e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -468,29 +468,29 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310873571012007E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872803699391E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.6068s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4960s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1108s for 90112 events => throughput is 8.11E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6100s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4743s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1357s for 90112 events => throughput is 7.93E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310873571012007E-002) differ by less than 2E-4 (8.92923734951978e-09) +OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872803699391E-002) differ by less than 2E-4 (8.929234462939917e-09) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.078718e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.916056e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.044468e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.993339e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -511,15 +511,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7195719566775987E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358102981245E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.7145s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7128s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.63E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6779s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6762s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.71E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195719566775987E-002) differ by less than 2E-4 (1.858155407319373e-09) +OK! 
xsec from fortran (9.7196357922470805E-002) and cpp (9.7196358102981245E-002) differ by less than 2E-4 (1.8571728599425796e-09) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -544,58 +544,58 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872835946929E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872068634174E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8693s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8501s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 90112 events => throughput is 4.69E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.8296s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8104s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 90112 events => throughput is 4.70E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310872835946929E-002) differ by less than 2E-4 (1.1094447582848943e-10) +OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872068634174E-002) differ by less than 2E-4 (1.1094924978749532e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.091918e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.183642e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.629216e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.584600e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.133810e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.650819e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.233339e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.235546e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.048619e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.634710e+06 
) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.238445e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.245773e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.073517e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.629817e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.781744e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.727220e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 56b03784ad..bd446b1ac3 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -2,28 +2,28 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
@@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-06-16_23:38:42 +DATE: 2023-07-18_23:41:41 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 4.3757s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2161s - [COUNTERS] Fortran MEs ( 1 ) : 4.1595s for 8192 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4496s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2184s + [COUNTERS] Fortran MEs ( 1 ) : 4.2312s for 8192 events => throughput is 1.94E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 49 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 4.4480s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2999s - [COUNTERS] Fortran MEs ( 1 ) : 4.1481s for 8192 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5168s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3053s + [COUNTERS] Fortran MEs ( 1 ) : 4.2114s for 8192 events => throughput is 1.95E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -106,11 +106,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748610604E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 [UNWEIGHT] Wrote 204 events (found 1633 events) - [COUNTERS] PROGRAM TOTAL : 47.5995s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8973s - [COUNTERS] Fortran MEs ( 1 ) : 45.7021s for 90112 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 48.2267s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8750s + [COUNTERS] Fortran MEs ( 1 ) : 46.3517s for 90112 events => throughput is 1.94E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 49 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 8.5650s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3703s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.1947s for 8192 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.8983s + [COUNTERS] Fortran Overhead ( 0 ) : 4.5345s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.3638s for 8192 events => throughput is 1.88E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index fbd2ca3bdb..7f03742433 100644 --- 
a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -2,26 +2,26 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 + +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-06-16_23:39:50 +DATE: 2023-07-18_23:42:50 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 4.4077s + [COUNTERS] PROGRAM TOTAL : 4.4279s [COUNTERS] Fortran Overhead ( 0 ) : 0.2143s - [COUNTERS] Fortran MEs ( 1 ) : 4.1934s for 8192 events => throughput is 1.95E+03 events/s + [COUNTERS] Fortran MEs ( 1 ) : 4.2136s for 8192 events => throughput is 1.94E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 49 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 4.4393s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3039s - [COUNTERS] Fortran MEs ( 1 ) : 4.1354s for 8192 events => throughput is 1.98E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5251s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3044s + [COUNTERS] Fortran MEs ( 1 ) : 4.2207s for 8192 events => throughput is 1.94E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** 
-------------------- @@ -106,11 +106,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748610604E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 [UNWEIGHT] Wrote 204 events (found 1633 events) - [COUNTERS] PROGRAM TOTAL : 47.6726s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8981s - [COUNTERS] Fortran MEs ( 1 ) : 45.7745s for 90112 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 48.4173s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8860s + [COUNTERS] Fortran MEs ( 1 ) : 46.5313s for 90112 events => throughput is 1.94E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277396515517582E-004] fbridge_mode=1 [UNWEIGHT] Wrote 49 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 8.4058s - [COUNTERS] Fortran Overhead ( 0 ) : 4.2739s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.1319s for 8192 events => throughput is 1.98E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.6591s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4291s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2300s for 8192 events => throughput is 1.94E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index c64d8630ed..3e6a0c1d61 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -3,23 +3,22 @@ CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -28,14 +27,15 @@ make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2023-06-16_23:40:57 +DATE: 2023-07-18_23:43:58 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 4.4170s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2154s - [COUNTERS] Fortran MEs ( 1 ) : 4.2016s for 8192 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4648s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2148s + [COUNTERS] Fortran MEs ( 1 ) : 4.2501s for 8192 events => throughput is 1.93E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 49 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 4.6794s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3042s - [COUNTERS] Fortran MEs ( 1 ) : 4.3752s for 8192 events => throughput is 1.87E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5404s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3038s + [COUNTERS] Fortran MEs ( 1 ) : 4.2366s for 8192 events => throughput is 1.93E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -106,11 +106,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748610604E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 [UNWEIGHT] Wrote 204 events (found 1633 events) - [COUNTERS] PROGRAM TOTAL : 47.7432s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9035s - [COUNTERS] Fortran MEs ( 1 ) : 45.8397s for 90112 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 48.3786s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8864s + [COUNTERS] Fortran MEs ( 1 ) : 46.4922s for 90112 events => throughput is 1.94E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277432965013E-004] fbridge_mode=1 [UNWEIGHT] Wrote 49 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 8.6986s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4272s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.2714s for 8192 events => throughput is 1.92E+03 events/s + [COUNTERS] PROGRAM TOTAL : 9.0570s + [COUNTERS] Fortran Overhead ( 0 ) 
: 4.6177s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.4393s for 8192 events => throughput is 1.85E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 39ca5692cf..c2298517a8 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -4,28 +4,28 @@ CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' - make USEBUILDDIR=1 AVX=512y + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' - -make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
@@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-06-16_23:42:06 +DATE: 2023-07-18_23:45:07 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 96.2551s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4152s - [COUNTERS] Fortran MEs ( 1 ) : 95.8399s for 8192 events => throughput is 8.55E+01 events/s + [COUNTERS] PROGRAM TOTAL : 99.3861s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4217s + [COUNTERS] Fortran MEs ( 1 ) : 98.9644s for 8192 events => throughput is 8.28E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435808E-006] fbridge_mode=0 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 96.4720s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4731s - [COUNTERS] Fortran MEs ( 1 ) : 95.9989s for 8192 events => throughput is 8.53E+01 events/s + [COUNTERS] PROGRAM TOTAL : 99.4917s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4846s + [COUNTERS] Fortran MEs ( 1 ) : 99.0072s for 8192 events => throughput is 8.27E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 1059.0573s - [COUNTERS] Fortran Overhead ( 0 ) : 4.0327s - [COUNTERS] Fortran MEs ( 1 ) : 1055.0245s for 90112 events => throughput is 8.54E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1092.0393s + [COUNTERS] Fortran Overhead ( 0 ) : 4.0525s + [COUNTERS] Fortran MEs ( 1 ) : 1087.9868s for 90112 events => throughput is 8.28E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 215.3874s - [COUNTERS] Fortran Overhead ( 0 ) : 96.8318s - [COUNTERS] CudaCpp MEs ( 2 ) : 118.5557s for 8192 events => throughput is 6.91E+01 events/s + [COUNTERS] PROGRAM TOTAL : 222.5426s + [COUNTERS] Fortran Overhead ( 0 ) : 102.9425s + [COUNTERS] CudaCpp MEs ( 2 ) : 119.6001s for 8192 events => throughput is 6.85E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813953E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 1357.2262s - [COUNTERS] Fortran 
Overhead ( 0 ) : 100.3230s - [COUNTERS] CudaCpp MEs ( 2 ) : 1256.9032s for 90112 events => throughput is 7.17E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1465.7798s + [COUNTERS] Fortran Overhead ( 0 ) : 106.1520s + [COUNTERS] CudaCpp MEs ( 2 ) : 1359.6278s for 90112 events => throughput is 6.63E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -179,14 +179,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813953E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.518822e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.803412e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.469802e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.816908e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 109.0441s - [COUNTERS] Fortran Overhead ( 0 ) : 50.2199s - [COUNTERS] CudaCpp MEs ( 2 ) : 58.8242s for 8192 events => throughput is 1.39E+02 events/s + [COUNTERS] PROGRAM TOTAL : 111.7257s + [COUNTERS] Fortran Overhead ( 0 ) : 51.7792s + [COUNTERS] CudaCpp MEs ( 2 ) : 59.9465s for 8192 events => throughput is 1.37E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 705.3776s - [COUNTERS] Fortran Overhead ( 0 ) : 53.9344s - [COUNTERS] CudaCpp MEs ( 2 ) : 651.4432s for 90112 events => throughput is 1.38E+02 events/s + [COUNTERS] PROGRAM TOTAL : 715.6348s + [COUNTERS] Fortran Overhead ( 0 ) : 55.3978s + [COUNTERS] CudaCpp MEs ( 2 ) : 660.2371s for 90112 events => throughput is 1.36E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -255,14 +255,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.640675e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.594296e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.645178e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.596087e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 51.2174s - [COUNTERS] Fortran Overhead ( 0 ) : 23.3441s - [COUNTERS] CudaCpp MEs ( 2 ) : 27.8733s for 8192 events => throughput is 2.94E+02 events/s + [COUNTERS] PROGRAM TOTAL : 52.0589s + [COUNTERS] Fortran Overhead ( 0 ) : 23.9023s + [COUNTERS] CudaCpp MEs ( 2 ) : 28.1566s for 8192 events => throughput is 2.91E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 332.6731s - [COUNTERS] Fortran Overhead ( 0 ) : 26.9170s - [COUNTERS] CudaCpp MEs ( 2 ) : 305.7561s for 90112 events => throughput is 2.95E+02 events/s + [COUNTERS] PROGRAM TOTAL : 339.5675s + [COUNTERS] Fortran Overhead ( 0 ) : 27.9186s + [COUNTERS] CudaCpp MEs ( 2 ) : 311.6490s for 90112 events => throughput is 2.89E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -331,14 +331,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.549919e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.480989e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.597201e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.499686e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 45.3753s - [COUNTERS] Fortran Overhead ( 0 ) : 20.9676s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.4077s for 8192 events => throughput is 3.36E+02 events/s + [COUNTERS] PROGRAM TOTAL : 47.5811s + [COUNTERS] Fortran Overhead ( 0 ) : 21.8934s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.6877s for 8192 events => throughput is 3.19E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 294.5042s - [COUNTERS] Fortran Overhead ( 0 ) : 24.4920s - [COUNTERS] CudaCpp MEs ( 2 ) : 270.0121s for 90112 events => throughput is 3.34E+02 events/s + [COUNTERS] PROGRAM TOTAL : 305.9962s + [COUNTERS] Fortran Overhead ( 0 ) : 25.0145s + [COUNTERS] CudaCpp MEs ( 2 ) : 280.9817s for 90112 events => throughput is 3.21E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -407,14 +407,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.021260e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.924534e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.013283e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.941350e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 46.3863s - [COUNTERS] Fortran Overhead ( 0 ) : 22.5070s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.8792s for 8192 events => throughput is 3.43E+02 events/s + [COUNTERS] PROGRAM TOTAL : 47.6398s + [COUNTERS] Fortran Overhead ( 0 ) : 23.2794s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.3604s for 8192 events => throughput is 3.36E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 289.0610s - [COUNTERS] Fortran Overhead ( 0 ) : 26.2218s - [COUNTERS] CudaCpp MEs ( 2 ) : 262.8391s for 90112 events => throughput is 3.43E+02 events/s + [COUNTERS] PROGRAM TOTAL : 295.7345s + [COUNTERS] Fortran Overhead ( 0 ) : 26.8620s + [COUNTERS] CudaCpp MEs ( 2 ) : 268.8725s for 90112 events => throughput is 3.35E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -483,14 +483,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.740541e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.630819e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.745479e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.617459e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435838E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 4.2951s - [COUNTERS] Fortran Overhead ( 0 ) : 3.2087s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0864s for 8192 events => throughput is 7.54E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.2578s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1700s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0877s for 8192 events => throughput is 7.53E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 18.7118s - [COUNTERS] Fortran Overhead ( 0 ) : 6.7622s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.9496s for 90112 events => throughput is 7.54E+03 events/s + [COUNTERS] PROGRAM TOTAL : 18.7016s + [COUNTERS] Fortran Overhead ( 0 ) : 6.7987s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9029s for 90112 events => throughput is 7.57E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -559,43 +559,43 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007 OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.487117e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.502804e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.224789e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.272554e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.215613e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.173278e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.512340e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.536211e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.227523e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.207244e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.402144e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.442859e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.254965e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.218238e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.253935e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.222599e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 
c7c6154514..7ffa42b1e8 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -4,28 +4,28 @@ CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=512y + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' - -make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' - -make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
@@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-06-17_01:08:35 +DATE: 2023-07-19_01:15:18 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 95.9433s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4120s - [COUNTERS] Fortran MEs ( 1 ) : 95.5312s for 8192 events => throughput is 8.58E+01 events/s + [COUNTERS] PROGRAM TOTAL : 99.4700s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4182s + [COUNTERS] Fortran MEs ( 1 ) : 99.0518s for 8192 events => throughput is 8.27E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435808E-006] fbridge_mode=0 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 96.0818s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4708s - [COUNTERS] Fortran MEs ( 1 ) : 95.6110s for 8192 events => throughput is 8.57E+01 events/s + [COUNTERS] PROGRAM TOTAL : 99.3733s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4984s + [COUNTERS] Fortran MEs ( 1 ) : 98.8749s for 8192 events => throughput is 8.29E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 1056.4460s - [COUNTERS] Fortran Overhead ( 0 ) : 4.0785s - [COUNTERS] Fortran MEs ( 1 ) : 1052.3676s for 90112 events => throughput is 8.56E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1089.4213s + [COUNTERS] Fortran Overhead ( 0 ) : 4.0618s + [COUNTERS] Fortran MEs ( 1 ) : 1085.3595s for 90112 events => throughput is 8.30E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -131,15 +131,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694768395608941E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694768395202781E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 201.5191s - [COUNTERS] Fortran Overhead ( 0 ) : 92.3001s - [COUNTERS] CudaCpp MEs ( 2 ) : 109.2190s for 8192 events => throughput is 7.50E+01 events/s + [COUNTERS] PROGRAM TOTAL : 202.7920s + [COUNTERS] Fortran Overhead ( 0 ) : 93.7840s + [COUNTERS] CudaCpp MEs ( 2 ) : 109.0081s for 8192 events => throughput is 7.52E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435808E-006) and cpp (1.1694768395608941E-006) differ by less than 4E-4 (0.0001426011954326345) +OK! 
xsec from fortran (1.1693100945435808E-006) and cpp (1.1694768395202781E-006) differ by less than 4E-4 (0.00014260116069753082) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -164,29 +164,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361436148187123E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361436140448921E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 1305.0132s - [COUNTERS] Fortran Overhead ( 0 ) : 96.3805s - [COUNTERS] CudaCpp MEs ( 2 ) : 1208.6327s for 90112 events => throughput is 7.46E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1303.2103s + [COUNTERS] Fortran Overhead ( 0 ) : 97.8079s + [COUNTERS] CudaCpp MEs ( 2 ) : 1205.4025s for 90112 events => throughput is 7.48E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361436148187123E-007) differ by less than 4E-4 (0.00014045922420713453) +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361436140448921E-007) differ by less than 4E-4 (0.00014045886190539036) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.869030e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.764186e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.908799e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.757019e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694766634537254E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 48.8581s - [COUNTERS] Fortran Overhead ( 0 ) : 22.8819s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.9762s for 8192 events => throughput is 3.15E+02 events/s + [COUNTERS] PROGRAM TOTAL : 51.0027s + [COUNTERS] Fortran Overhead ( 0 ) : 24.0099s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.9929s for 8192 events => throughput is 3.03E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361435622518579E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 311.8489s - [COUNTERS] Fortran Overhead ( 0 ) : 26.4793s - [COUNTERS] CudaCpp MEs ( 2 ) : 285.3696s for 90112 events => throughput is 3.16E+02 events/s + [COUNTERS] PROGRAM TOTAL : 324.5174s + [COUNTERS] Fortran Overhead ( 
0 ) : 28.1889s + [COUNTERS] CudaCpp MEs ( 2 ) : 296.3285s for 90112 events => throughput is 3.04E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -255,14 +255,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361435622518579E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.654834e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.489004e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.636170e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.477635e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694765364749936E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 25.9178s - [COUNTERS] Fortran Overhead ( 0 ) : 12.1541s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.7637s for 8192 events => throughput is 5.95E+02 events/s + [COUNTERS] PROGRAM TOTAL : 26.5180s + [COUNTERS] Fortran Overhead ( 0 ) : 12.3665s + [COUNTERS] CudaCpp MEs ( 2 ) : 14.1515s for 8192 events => throughput is 5.79E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361435955979457E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 167.6069s - [COUNTERS] Fortran Overhead ( 0 ) : 15.7359s - [COUNTERS] CudaCpp MEs ( 2 ) : 151.8710s for 90112 events => throughput is 5.93E+02 events/s + [COUNTERS] PROGRAM TOTAL : 172.2093s + [COUNTERS] Fortran Overhead ( 0 ) : 15.9965s + [COUNTERS] CudaCpp MEs ( 2 ) : 156.2128s for 90112 events => throughput is 5.77E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -331,14 +331,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361435955979457E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.068292e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.926115e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.077191e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.916787e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694765364749936E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 23.2564s - [COUNTERS] Fortran Overhead ( 0 ) : 10.7772s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.4793s for 8192 events => throughput is 6.56E+02 events/s + [COUNTERS] PROGRAM TOTAL : 23.7524s + [COUNTERS] Fortran Overhead ( 0 ) : 11.0666s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.6858s for 8192 events => throughput is 6.46E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361435955979457E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 152.0499s - [COUNTERS] Fortran Overhead ( 0 ) : 14.4286s - [COUNTERS] CudaCpp MEs ( 2 ) : 137.6212s for 90112 events => throughput is 6.55E+02 events/s + [COUNTERS] PROGRAM TOTAL : 153.6582s + [COUNTERS] Fortran Overhead ( 0 ) : 14.6902s + [COUNTERS] CudaCpp MEs ( 2 ) : 138.9680s for 90112 events => throughput is 6.48E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -407,14 +407,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361435955979457E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.965515e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.812155e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.954223e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.766114e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694767893082863E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 23.5866s - [COUNTERS] Fortran Overhead ( 0 ) : 11.6220s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.9645s for 8192 events => throughput is 6.85E+02 events/s + [COUNTERS] PROGRAM TOTAL : 24.0249s + [COUNTERS] Fortran Overhead ( 0 ) : 11.7974s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.2275s for 8192 events => throughput is 6.70E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361441834174529E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 145.9255s - [COUNTERS] Fortran Overhead ( 0 ) : 15.0340s - [COUNTERS] CudaCpp MEs ( 2 ) : 130.8914s for 90112 events => throughput is 6.88E+02 events/s + [COUNTERS] PROGRAM TOTAL : 150.5771s + [COUNTERS] Fortran Overhead ( 0 ) : 15.4619s + [COUNTERS] CudaCpp MEs ( 2 ) : 135.1153s for 90112 events => throughput is 6.67E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -483,14 +483,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361441834174529E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.533944e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.259660e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.436083e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.254966e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694770708195000E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 2.5187s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0274s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4913s for 8192 events => throughput is 1.67E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.4902s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9980s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4922s for 8192 events => throughput is 1.66E+04 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361443477565659E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 11.0853s - [COUNTERS] Fortran Overhead ( 0 ) : 5.6057s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4795s for 90112 events => throughput is 1.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 11.1455s + [COUNTERS] Fortran Overhead ( 0 ) : 5.6649s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4806s for 90112 events => throughput is 1.64E+04 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -559,43 +559,43 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361443477565659E-007 OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.640525e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.624769e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.649395e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.628076e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.325104e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.322584e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.391765e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.352873e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.322841e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.316394e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.376800e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.365575e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.338034e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.324891e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.503445e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.457308e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 
40600c6dee..bb4557ce90 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -4,17 +4,18 @@ CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' - -make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' @@ -23,7 +24,6 @@ make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-06-17_02:15:04 +DATE: 2023-07-19_02:23:00 -On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 96.1609s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4132s - [COUNTERS] Fortran MEs ( 1 ) : 95.7477s for 8192 events => throughput is 8.56E+01 events/s + [COUNTERS] PROGRAM TOTAL : 99.4193s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4375s + [COUNTERS] Fortran MEs ( 1 ) : 98.9818s for 8192 events => throughput is 8.28E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435808E-006] fbridge_mode=0 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 96.6436s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4751s - [COUNTERS] Fortran MEs ( 1 ) : 96.1685s for 8192 events => throughput is 8.52E+01 events/s + [COUNTERS] PROGRAM TOTAL : 99.9753s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4785s + [COUNTERS] Fortran MEs ( 1 ) : 99.4967s for 8192 events => throughput is 8.23E+01 events/s *** (1) EXECUTE 
MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 1059.8674s - [COUNTERS] Fortran Overhead ( 0 ) : 4.0491s - [COUNTERS] Fortran MEs ( 1 ) : 1055.8184s for 90112 events => throughput is 8.53E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1093.3960s + [COUNTERS] Fortran Overhead ( 0 ) : 4.0669s + [COUNTERS] Fortran MEs ( 1 ) : 1089.3291s for 90112 events => throughput is 8.27E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101016896846E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 211.4841s - [COUNTERS] Fortran Overhead ( 0 ) : 97.1384s - [COUNTERS] CudaCpp MEs ( 2 ) : 114.3457s for 8192 events => throughput is 7.16E+01 events/s + [COUNTERS] PROGRAM TOTAL : 233.3234s + [COUNTERS] Fortran Overhead ( 0 ) : 107.7432s + [COUNTERS] CudaCpp MEs ( 2 ) : 125.5802s for 8192 events => throughput is 6.52E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436275882778E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 1372.8556s - [COUNTERS] Fortran Overhead ( 0 ) : 101.8141s - [COUNTERS] CudaCpp MEs ( 2 ) : 1271.0415s for 90112 events => throughput is 7.09E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1483.7189s + [COUNTERS] Fortran Overhead ( 0 ) : 110.6562s + [COUNTERS] CudaCpp MEs ( 2 ) : 1373.0626s for 90112 events => throughput is 6.56E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -179,14 +179,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436275882778E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.162213e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.683350e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.331097e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.685547e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101020910778E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 110.6811s - [COUNTERS] Fortran Overhead ( 0 ) : 51.3766s - [COUNTERS] CudaCpp MEs ( 2 ) : 59.3045s for 8192 events => throughput is 1.38E+02 events/s + [COUNTERS] PROGRAM TOTAL : 115.0299s + [COUNTERS] Fortran Overhead ( 0 ) : 52.5179s + [COUNTERS] CudaCpp MEs ( 2 ) : 62.5120s for 8192 events => throughput is 1.31E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436284111598E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 688.7864s - [COUNTERS] Fortran Overhead ( 0 ) : 54.7130s - [COUNTERS] CudaCpp MEs ( 2 ) : 634.0734s for 90112 events => throughput is 1.42E+02 events/s + [COUNTERS] PROGRAM TOTAL : 720.0316s + [COUNTERS] Fortran Overhead ( 0 ) : 56.0965s + [COUNTERS] CudaCpp MEs ( 2 ) : 663.9351s for 90112 events => throughput is 1.36E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -255,14 +255,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436284111598E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.616861e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.574498e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.616649e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.565670e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 49.6391s - [COUNTERS] Fortran Overhead ( 0 ) : 22.3277s - [COUNTERS] CudaCpp MEs ( 2 ) : 27.3114s for 8192 events => throughput is 3.00E+02 events/s + [COUNTERS] PROGRAM TOTAL : 50.7158s + [COUNTERS] Fortran Overhead ( 0 ) : 22.9716s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.7442s for 8192 events => throughput is 2.95E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 327.4154s - [COUNTERS] Fortran Overhead ( 0 ) : 25.8163s - [COUNTERS] CudaCpp MEs ( 2 ) : 301.5990s for 90112 events => throughput is 2.99E+02 events/s + [COUNTERS] PROGRAM TOTAL : 320.7675s + [COUNTERS] Fortran Overhead ( 0 ) : 26.3391s + [COUNTERS] CudaCpp MEs ( 2 ) : 294.4284s for 90112 events => throughput is 3.06E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -331,14 +331,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.748239e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.668610e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.733522e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.661090e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 43.4217s - [COUNTERS] Fortran Overhead ( 0 ) : 19.9092s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.5125s for 8192 events => throughput is 3.48E+02 events/s + [COUNTERS] PROGRAM TOTAL : 44.9958s + [COUNTERS] Fortran Overhead ( 0 ) : 20.3872s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.6086s for 8192 events => throughput is 3.33E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 282.1566s - [COUNTERS] Fortran Overhead ( 0 ) : 23.4368s - [COUNTERS] CudaCpp MEs ( 2 ) : 258.7198s for 90112 events => throughput is 3.48E+02 events/s + [COUNTERS] PROGRAM TOTAL : 291.0724s + [COUNTERS] Fortran Overhead ( 0 ) : 24.0309s + [COUNTERS] CudaCpp MEs ( 2 ) : 267.0414s for 90112 events => throughput is 3.37E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -407,14 +407,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.246539e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.101365e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.237720e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.159557e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 45.4934s - [COUNTERS] Fortran Overhead ( 0 ) : 21.9129s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.5805s for 8192 events => throughput is 3.47E+02 events/s + [COUNTERS] PROGRAM TOTAL : 47.0065s + [COUNTERS] Fortran Overhead ( 0 ) : 22.5577s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.4489s for 8192 events => throughput is 3.35E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 285.7173s - [COUNTERS] Fortran Overhead ( 0 ) : 25.4131s - [COUNTERS] CudaCpp MEs ( 2 ) : 260.3043s for 90112 events => throughput is 3.46E+02 events/s + [COUNTERS] PROGRAM TOTAL : 291.9821s + [COUNTERS] Fortran Overhead ( 0 ) : 26.0599s + [COUNTERS] CudaCpp MEs ( 2 ) : 265.9222s for 90112 events => throughput is 3.39E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -483,14 +483,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.849415e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.739816e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.819185e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.730410e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100942770687E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 3.6199s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7558s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8641s for 8192 events => throughput is 9.48E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.6269s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7646s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8623s for 8192 events => throughput is 9.50E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436157495368E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 15.8257s - [COUNTERS] Fortran Overhead ( 0 ) : 6.3170s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.5087s for 90112 events => throughput is 9.48E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.8537s + [COUNTERS] Fortran Overhead ( 0 ) : 6.3851s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4686s for 90112 events => throughput is 9.52E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -559,43 +559,43 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436157495368E-007 OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.405516e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.456219e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.081261e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.083869e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.109602e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.105733e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.160351e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.160235e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.107621e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.109947e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111526e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.104461e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.109298e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107698e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.655752e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.641814e+03 ) sec^-1 TEST COMPLETED From 65350c596b3204e73c0dfd42137b00127a5f98cb Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 19 Jul 2023 09:16:23 +0200 Subject: [PATCH 388/509] [jthip] go back to 
upstream/master tput/tmad logs for easier merging Revert "[jthip] rerun 15 tmad alltees (for cuda/c++) after including HIP, all looks ok" This reverts commit 18ffff2215507e7b30debf1c1ecffd3a47b7ba6d. Revert "[jthip] rerun 78 tput alltees (for CUDA/C++) after including HIP, all looks ok" This reverts commit 1050176e78c04fc35feea3c8963fed7739441feb. --- .../log_eemumu_mad_d_inl0_hrd0.txt | 176 ++++++------- .../log_eemumu_mad_f_inl0_hrd0.txt | 184 +++++++------- .../log_eemumu_mad_m_inl0_hrd0.txt | 180 +++++++------- .../log_ggtt_mad_d_inl0_hrd0.txt | 174 ++++++------- .../log_ggtt_mad_f_inl0_hrd0.txt | 182 +++++++------- .../log_ggtt_mad_m_inl0_hrd0.txt | 178 ++++++------- .../log_ggttg_mad_d_inl0_hrd0.txt | 224 ++++++++--------- .../log_ggttg_mad_f_inl0_hrd0.txt | 232 ++++++++--------- .../log_ggttg_mad_m_inl0_hrd0.txt | 234 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 42 ++-- .../log_ggttgg_mad_f_inl0_hrd0.txt | 42 ++-- .../log_ggttgg_mad_m_inl0_hrd0.txt | 44 ++-- .../log_ggttggg_mad_d_inl0_hrd0.txt | 180 +++++++------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 188 +++++++------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 174 ++++++------- .../log_eemumu_mad_d_inl0_hrd0.txt | 100 ++++---- .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 100 ++++---- .../log_eemumu_mad_d_inl0_hrd0_common.txt | 100 ++++---- .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 100 ++++---- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 100 ++++---- .../log_eemumu_mad_d_inl0_hrd1.txt | 100 ++++---- .../log_eemumu_mad_d_inl1_hrd0.txt | 100 ++++---- .../log_eemumu_mad_d_inl1_hrd1.txt | 100 ++++---- .../log_eemumu_mad_f_inl0_hrd0.txt | 104 ++++---- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 106 ++++---- .../log_eemumu_mad_f_inl0_hrd0_common.txt | 104 ++++---- .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 104 ++++---- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 106 ++++---- .../log_eemumu_mad_f_inl0_hrd1.txt | 104 ++++---- .../log_eemumu_mad_f_inl1_hrd0.txt | 104 ++++---- .../log_eemumu_mad_f_inl1_hrd1.txt | 104 ++++---- .../log_eemumu_mad_m_inl0_hrd0.txt | 100 ++++---- .../log_eemumu_mad_m_inl0_hrd1.txt | 100 ++++---- .../log_ggtt_mad_d_inl0_hrd0.txt | 102 ++++---- .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 102 ++++---- .../log_ggtt_mad_d_inl0_hrd0_common.txt | 102 ++++---- .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 102 ++++---- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 102 ++++---- .../log_ggtt_mad_d_inl0_hrd1.txt | 100 ++++---- .../log_ggtt_mad_d_inl1_hrd0.txt | 100 ++++---- .../log_ggtt_mad_d_inl1_hrd1.txt | 102 ++++---- .../log_ggtt_mad_f_inl0_hrd0.txt | 108 ++++---- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 110 ++++---- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 108 ++++---- .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 108 ++++---- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 110 ++++---- .../log_ggtt_mad_f_inl0_hrd1.txt | 104 ++++---- .../log_ggtt_mad_f_inl1_hrd0.txt | 104 ++++---- .../log_ggtt_mad_f_inl1_hrd1.txt | 106 ++++---- .../log_ggtt_mad_m_inl0_hrd0.txt | 102 ++++---- .../log_ggtt_mad_m_inl0_hrd1.txt | 102 ++++---- .../log_ggttg_mad_d_inl0_hrd0.txt | 118 ++++----- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 118 ++++----- .../log_ggttg_mad_d_inl0_hrd1.txt | 120 ++++----- .../log_ggttg_mad_f_inl0_hrd0.txt | 124 +++++----- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 126 +++++----- .../log_ggttg_mad_f_inl0_hrd1.txt | 124 +++++----- .../log_ggttg_mad_m_inl0_hrd0.txt | 124 +++++----- .../log_ggttg_mad_m_inl0_hrd1.txt | 122 ++++----- .../log_ggttgg_mad_d_inl0_hrd0.txt | 120 ++++----- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 120 
++++----- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 120 ++++----- .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 120 ++++----- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 120 ++++----- .../log_ggttgg_mad_d_inl0_hrd1.txt | 122 ++++----- .../log_ggttgg_mad_d_inl1_hrd0.txt | 118 ++++----- .../log_ggttgg_mad_d_inl1_hrd1.txt | 118 ++++----- .../log_ggttgg_mad_f_inl0_hrd0.txt | 124 +++++----- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 126 +++++----- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 124 +++++----- .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt | 124 +++++----- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 126 +++++----- .../log_ggttgg_mad_f_inl0_hrd1.txt | 124 +++++----- .../log_ggttgg_mad_f_inl1_hrd0.txt | 122 ++++----- .../log_ggttgg_mad_f_inl1_hrd1.txt | 122 ++++----- .../log_ggttgg_mad_m_inl0_hrd0.txt | 124 +++++----- .../log_ggttgg_mad_m_inl0_hrd1.txt | 126 +++++----- .../log_ggttggg_mad_d_inl0_hrd0.txt | 120 ++++----- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 120 ++++----- .../log_ggttggg_mad_d_inl0_hrd1.txt | 124 +++++----- .../log_ggttggg_mad_f_inl0_hrd0.txt | 128 +++++----- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 128 +++++----- .../log_ggttggg_mad_f_inl0_hrd1.txt | 134 +++++----- .../log_ggttggg_mad_m_inl0_hrd0.txt | 120 ++++----- .../log_ggttggg_mad_m_inl0_hrd1.txt | 124 +++++----- .../log_gqttq_mad_d_inl0_hrd0.txt | 118 ++++----- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 118 ++++----- .../log_gqttq_mad_d_inl0_hrd1.txt | 118 ++++----- .../log_gqttq_mad_f_inl0_hrd0.txt | 122 ++++----- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 124 +++++----- .../log_gqttq_mad_f_inl0_hrd1.txt | 124 +++++----- .../log_gqttq_mad_m_inl0_hrd0.txt | 120 ++++----- .../log_gqttq_mad_m_inl0_hrd1.txt | 118 ++++----- 93 files changed, 5640 insertions(+), 5640 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 3ba8c07f53..0ab7ae8748 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -4,26 +4,26 @@ CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y - -make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. @@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-18_23:37:40 +DATE: 2023-06-16_23:34:34 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.0338s - [COUNTERS] Fortran Overhead ( 0 ) : 0.0220s - [COUNTERS] Fortran MEs ( 1 ) : 0.0118s for 8192 events => throughput is 6.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.0320s + [COUNTERS] Fortran Overhead ( 0 ) : 0.0199s + [COUNTERS] Fortran MEs ( 1 ) : 0.0121s for 8192 events => throughput is 6.78E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,8 +83,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1766s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1650s + [COUNTERS] PROGRAM TOTAL : 0.1743s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1627s [COUNTERS] Fortran MEs ( 1 ) : 0.0116s for 8192 events => throughput is 7.07E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4526s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3272s - [COUNTERS] Fortran MEs ( 1 ) : 0.1254s for 90112 events => throughput is 7.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4602s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3321s + [COUNTERS] Fortran MEs ( 1 ) : 0.1281s for 90112 events => throughput is 7.03E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1852s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1792s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1857s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1798s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 8192 events => throughput is 1.39E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x 
[XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4048s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3392s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0656s for 90112 events => throughput is 1.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4020s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3381s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0639s for 90112 events => throughput is 1.41E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -179,14 +179,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813628E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.321006e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.336870e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.339172e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.374247e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1790s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1760s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.73E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1985s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1956s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.92E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3712s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3377s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0335s for 90112 events => throughput is 2.69E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3681s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3369s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0312s for 90112 events => throughput is 2.89E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -255,14 +255,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813628E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.636316e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.728154e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.734490e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.907973e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1853s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1836s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.82E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1716s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1699s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.87E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3724s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3536s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0187s for 90112 events => throughput is 4.81E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3546s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3373s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 90112 events => throughput is 5.21E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -331,14 +331,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813656E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.167177e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.040980e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.517494e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.910901e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1775s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1760s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0015s for 8192 events => throughput is 5.41E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1706s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1692s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0014s for 8192 events => throughput is 5.70E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3530s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3366s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0164s for 90112 events => throughput is 5.49E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3592s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3430s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0162s for 90112 events => throughput is 5.56E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -407,14 +407,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813656E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.516918e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.443652e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.847457e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.102617e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1782s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1766s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0016s for 8192 events => throughput is 5.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1741s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1723s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.43E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3585s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3398s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0187s for 90112 events => throughput is 4.83E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3577s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3393s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0184s for 90112 events => throughput is 4.89E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -483,14 +483,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813656E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.863366e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.745967e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.240767e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.387840e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.6019s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6014s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.67E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6235s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6230s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.65E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7631s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7582s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.85E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7970s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7922s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.87E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -559,43 +559,43 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813628E-002 OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.246491e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.108671e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.345799e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.375732e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.752050e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.977014e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.022855e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.036419e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.756155e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.981844e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.083354e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.078134e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.751711e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.944271e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.022807e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.976379e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt 
b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index f44c753bdf..6e3da73554 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -2,29 +2,29 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/e CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y - -make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' @@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-18_23:37:56 +DATE: 2023-06-16_23:34:51 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.0306s - [COUNTERS] Fortran Overhead ( 0 ) : 0.0189s - [COUNTERS] Fortran MEs ( 1 ) : 0.0117s for 8192 events => throughput is 6.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.0316s + [COUNTERS] Fortran Overhead ( 0 ) : 0.0200s + [COUNTERS] Fortran MEs ( 1 ) : 0.0115s for 8192 events => throughput is 7.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1813s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1694s - [COUNTERS] Fortran MEs ( 1 ) : 0.0119s for 8192 events => throughput is 6.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1738s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1623s + [COUNTERS] Fortran MEs ( 1 ) : 0.0115s for 8192 events => throughput is 7.12E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4607s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3351s - [COUNTERS] Fortran MEs ( 1 ) : 0.1256s for 90112 events => throughput is 7.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4892s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3640s + [COUNTERS] Fortran MEs ( 1 ) : 0.1253s for 90112 events => throughput is 7.19E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166140620297] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1867s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1809s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0058s for 8192 events => throughput is 1.41E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1765s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1709s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0056s for 8192 events => throughput is 1.45E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501907784661565E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 
1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4048s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3394s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0654s for 90112 events => throughput is 1.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4045s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3421s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0625s for 90112 events => throughput is 1.44E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -179,14 +179,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501907784661565E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.328859e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.366086e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.356177e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.411236e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165549479658] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1757s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1739s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.77E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1681s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1666s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0016s for 8192 events => throughput is 5.27E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905692857932E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3550s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3368s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0182s for 90112 events => throughput is 4.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3520s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3347s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0172s for 90112 events => throughput is 5.23E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -255,14 +255,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501905692857932E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.042708e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.178390e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.405752e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.517565e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165569099927] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1804s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1795s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.00E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1682s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1674s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.72E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905658047333E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3621s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3525s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0096s for 90112 events => throughput is 9.36E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3407s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3314s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 90112 events => throughput is 9.61E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -331,14 +331,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501905658047333E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.526379e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.065328e+07 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.122017e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.184488e+07 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165569099927] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1898s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1889s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1673s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1665s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.03E+07 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905658047333E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3555s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3463s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0093s for 90112 events => throughput is 9.73E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3433s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3345s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0087s for 90112 events => throughput is 1.03E+07 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -407,14 +407,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501905658047333E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.962410e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.074978e+07 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.882310e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.114652e+07 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166431914253] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1889s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1878s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0011s for 8192 events => throughput is 7.26E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1711s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1701s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 8.67E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501909358591468E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3574s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3468s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0106s for 90112 events => throughput is 8.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3454s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3353s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0102s for 90112 events => throughput is 8.87E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -483,14 +483,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501909358591468E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.208516e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.877305e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.080579e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.149185e+07 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166796068879] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5963s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5958s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.74E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6303s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6298s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.72E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501910316213061E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7633s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7588s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 90112 events => throughput is 1.99E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7969s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7923s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 90112 events => throughput is 1.98E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -559,43 +559,43 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501910316213061E-002 OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.595045e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.510070e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.237305e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.150527e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.038428e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.548906e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.513799e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.466611e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.050612e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.695373e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.771650e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.743025e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.393159e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.786521e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.502447e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.372126e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt 
b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 0bbeab2435..08471d1c00 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -2,23 +2,23 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/e CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y - -make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' @@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-18_23:38:12 +DATE: 2023-06-16_23:35:08 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.0308s - [COUNTERS] Fortran Overhead ( 0 ) : 0.0189s - [COUNTERS] Fortran MEs ( 1 ) : 0.0120s for 8192 events => throughput is 6.83E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.0313s + [COUNTERS] Fortran Overhead ( 0 ) : 0.0197s + [COUNTERS] Fortran MEs ( 1 ) : 0.0115s for 8192 events => throughput is 7.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1824s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1710s - [COUNTERS] Fortran MEs ( 1 ) : 0.0113s for 8192 events => throughput is 7.24E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1766s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1651s + [COUNTERS] Fortran MEs ( 1 ) : 0.0116s for 8192 events => throughput is 7.08E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' 
./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4718s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3410s - [COUNTERS] Fortran MEs ( 1 ) : 0.1308s for 90112 events => throughput is 6.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4702s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3402s + [COUNTERS] Fortran MEs ( 1 ) : 0.1300s for 90112 events => throughput is 6.93E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211728] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1923s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1857s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.25E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1796s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1734s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.33E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4100s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3412s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0688s for 90112 events => throughput is 1.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4089s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3434s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0655s for 90112 events => throughput is 1.38E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -179,14 +179,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919915927155E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.265020e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.316240e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.281271e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.353376e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211728] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1791s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1762s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.79E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1732s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1703s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.85E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3721s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3394s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0327s for 90112 events => throughput is 2.75E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3857s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3519s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0338s for 90112 events => throughput is 2.66E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -255,14 +255,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919915927155E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.694313e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.780240e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.848827e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.942864e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1760s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1744s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0016s for 8192 events => throughput is 5.15E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1715s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1699s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0016s for 8192 events => throughput is 5.22E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3516s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3346s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0170s for 90112 events => throughput is 5.30E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3543s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3369s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0174s for 90112 events => throughput is 5.17E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -331,14 +331,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919908700741E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.287208e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.231827e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.870563e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.881656e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1776s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1761s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0015s for 8192 events => throughput is 5.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1708s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1692s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0015s for 8192 events => throughput is 5.38E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3545s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3385s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0161s for 90112 events => throughput is 5.61E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3556s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3393s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0164s for 90112 events => throughput is 5.51E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -407,14 +407,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919908700741E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.660074e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.595835e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.277308e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.329466e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1786s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1769s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.95E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1706s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1690s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0016s for 8192 events => throughput is 5.02E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3740s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3541s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0200s for 90112 events => throughput is 4.51E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3594s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3416s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0178s for 90112 events => throughput is 5.06E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -483,14 +483,14 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919908700741E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.595633e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.773234e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.210377e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.345326e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169066587255] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5975s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5970s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.63E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6290s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6285s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.67E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919911173610E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7608s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7560s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.89E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7937s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7888s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.83E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -559,43 +559,43 @@ OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919911173610E-002 OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.178199e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.341832e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.475523e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.367767e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.774650e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.004324e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.034605e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.022759e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.742018e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.001795e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.080919e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.064728e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.757270e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.982012e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.990514e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.000317e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt 
b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index b8ffb59aa2..9da4005516 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -3,25 +3,25 @@ CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y - -make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-18_23:38:28 +DATE: 2023-06-16_23:35:24 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.2086s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1537s - [COUNTERS] Fortran MEs ( 1 ) : 0.0549s for 8192 events => throughput is 1.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2030s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1493s + [COUNTERS] Fortran MEs ( 1 ) : 0.0538s for 8192 events => throughput is 1.52E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3201s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2645s - [COUNTERS] Fortran MEs ( 1 ) : 0.0557s for 8192 events => throughput is 1.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3135s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2598s + [COUNTERS] Fortran MEs ( 1 ) : 0.0536s for 8192 events => throughput is 1.53E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' 
./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7911s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1844s - [COUNTERS] Fortran MEs ( 1 ) : 0.6067s for 90112 events => throughput is 1.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7753s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1856s + [COUNTERS] Fortran MEs ( 1 ) : 0.5897s for 90112 events => throughput is 1.53E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3551s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3121s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0430s for 8192 events => throughput is 1.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3502s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3091s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0411s for 8192 events => throughput is 1.99E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7071s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2348s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4724s for 90112 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6970s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2435s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4535s for 90112 events => throughput is 1.99E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -179,14 +179,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775379) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.927052e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.992097e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.914151e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.951938e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3154s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2918s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0236s for 8192 events => throughput is 3.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3095s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2872s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0223s for 8192 events => throughput is 3.68E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4706s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2169s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2537s for 90112 events => throughput is 3.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4568s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2139s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2429s for 90112 events => throughput is 3.71E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -255,14 +255,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775379) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.493125e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.737410e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.573105e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.630060e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2920s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2792s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.41E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2885s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2762s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0123s for 8192 events => throughput is 6.65E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3508s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2114s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1395s for 90112 events => throughput is 6.46E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3388s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2039s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1350s for 90112 events => throughput is 6.68E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -331,14 +331,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775393) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.192968e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.378851e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.297627e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.500232e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2894s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2780s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0114s for 8192 events => throughput is 7.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2876s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2765s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0111s for 8192 events => throughput is 7.39E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3333s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2059s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1274s for 90112 events => throughput is 7.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3666s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2422s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1244s for 90112 events => throughput is 7.24E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -407,14 +407,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775393) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.981238e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.995934e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.001926e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.995046e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3041s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2853s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0187s for 8192 events => throughput is 4.37E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3014s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2827s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0186s for 8192 events => throughput is 4.39E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4179s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2121s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2058s for 90112 events => throughput is 4.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4130s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2131s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1999s for 90112 events => throughput is 4.51E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -483,14 +483,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775393) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.159932e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.352063e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.111311e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.264252e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,8 +513,8 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.6947s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6941s + [COUNTERS] PROGRAM TOTAL : 0.7320s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7314s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.38E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6422s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6357s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.38E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.6728s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6663s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.39E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -559,43 +559,43 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775386) differ b OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.080408e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.883699e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.652930e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.569644e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.002488e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.587002e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.077157e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.074126e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.019063e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.570667e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.153703e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.153037e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.006798e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.562530e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.037386e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.066634e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 475170c8e7..01adf8925b 100644 --- 
a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -2,30 +2,30 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none - +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y -make USEBUILDDIR=1 AVX=512z +make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
@@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-18_23:38:53 +DATE: 2023-06-16_23:35:50 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.2033s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1483s - [COUNTERS] Fortran MEs ( 1 ) : 0.0550s for 8192 events => throughput is 1.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2007s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1470s + [COUNTERS] Fortran MEs ( 1 ) : 0.0537s for 8192 events => throughput is 1.52E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3181s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2631s - [COUNTERS] Fortran MEs ( 1 ) : 0.0550s for 8192 events => throughput is 1.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3500s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2965s + [COUNTERS] Fortran MEs ( 1 ) : 0.0535s for 8192 events => throughput is 1.53E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7940s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1856s - [COUNTERS] Fortran MEs ( 1 ) : 0.6085s for 90112 events => throughput is 1.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7703s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1795s + [COUNTERS] Fortran MEs ( 1 ) : 0.5908s for 90112 events => throughput is 1.53E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690706211693573] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3506s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3088s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0418s for 8192 events => throughput is 1.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3412s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3013s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0398s for 8192 events => throughput is 2.06E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782418787778] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6937s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2350s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4586s for 90112 
events => throughput is 1.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6722s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2335s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4387s for 90112 events => throughput is 2.05E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -179,14 +179,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782418787778) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.981514e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.020933e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.971620e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.051565e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690707641465352] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2964s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2807s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0157s for 8192 events => throughput is 5.23E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2975s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2808s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0167s for 8192 events => throughput is 4.90E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223786452345514] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3819s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2092s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1727s for 90112 events => throughput is 5.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3782s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2071s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1711s for 90112 events => throughput is 5.27E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -255,14 +255,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223786452345514) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.002821e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.159675e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.987254e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.216268e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690698819656767] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2792s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2723s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 8192 events => throughput is 1.18E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2780s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2713s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 8192 events => throughput is 1.23E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782736292961] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3015s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2231s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0784s for 90112 events => throughput is 1.15E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.2805s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2059s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0747s for 90112 events => throughput is 1.21E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -331,14 +331,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782736292961) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.149527e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.150882e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.153994e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.149098e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690698819656767] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2801s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2738s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 8192 events => throughput is 1.28E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2758s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2693s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 8192 events => throughput is 1.27E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782736292961] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.2660s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1954s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0705s for 90112 events => throughput is 1.28E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.2778s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2090s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0688s for 90112 events => throughput is 1.31E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -407,14 +407,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782736292961) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.250744e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.195362e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.249123e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.247695e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690703490151122] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2877s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2778s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0099s for 8192 events => throughput is 8.26E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2869s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2776s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.73E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223787021597481] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3074s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2030s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1044s for 90112 events => throughput is 8.63E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3127s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2100s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1027s for 90112 events => throughput is 8.77E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -483,14 +483,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223787021597481) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.010195e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.078289e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.128602e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.098096e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690703397697980] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.6976s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6971s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.49E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7221s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7216s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.46E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223786763175951] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6292s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6236s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 90112 events => throughput is 1.63E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.6678s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6624s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 90112 events => throughput is 1.65E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -559,43 +559,43 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223786763175951) differ b OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.208611e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.070407e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.988221e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.904132e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.833331e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.021955e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.760079e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.726468e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.803014e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.093398e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.849470e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.841474e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.374876e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.753742e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.461091e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.383547e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 3216bfef7c..bcc7cef90a 100644 --- 
a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -3,25 +3,25 @@ CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y - -make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-18_23:39:17 +DATE: 2023-06-16_23:36:15 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.2029s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1480s - [COUNTERS] Fortran MEs ( 1 ) : 0.0549s for 8192 events => throughput is 1.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2020s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1481s + [COUNTERS] Fortran MEs ( 1 ) : 0.0539s for 8192 events => throughput is 1.52E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3192s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2633s - [COUNTERS] Fortran MEs ( 1 ) : 0.0559s for 8192 events => throughput is 1.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3175s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2632s + [COUNTERS] Fortran MEs ( 1 ) : 0.0543s for 8192 events => throughput is 1.51E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava 
[XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7890s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1845s - [COUNTERS] Fortran MEs ( 1 ) : 0.6045s for 90112 events => throughput is 1.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7727s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1825s + [COUNTERS] Fortran MEs ( 1 ) : 0.5903s for 90112 events => throughput is 1.53E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3521s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3084s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0437s for 8192 events => throughput is 1.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3466s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3052s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0414s for 8192 events => throughput is 1.98E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7142s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2357s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4786s for 90112 events => throughput is 1.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6933s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2360s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4573s for 90112 events => throughput is 1.97E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -179,14 +179,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223783635280988) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.900675e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.952836e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.903091e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.944744e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3140s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2916s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0224s for 8192 events => throughput is 3.65E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3071s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2854s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0217s for 8192 events => throughput is 3.77E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4613s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2119s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2494s for 90112 events => throughput is 3.61E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4646s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2169s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2477s for 90112 events => throughput is 3.64E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -255,14 +255,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223783635280988) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.514697e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.726713e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.531616e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.678276e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2903s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2774s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0129s for 8192 events => throughput is 6.34E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2880s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2760s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0120s for 8192 events => throughput is 6.83E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3401s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2030s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1371s for 90112 events => throughput is 6.57E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3686s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2290s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1396s for 90112 events => throughput is 6.46E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -331,14 +331,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223783652032040) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.270392e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.474630e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.382141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.428506e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3051s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2930s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0120s for 8192 events => throughput is 6.81E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2848s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2738s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0110s for 8192 events => throughput is 7.42E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3268s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2029s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1238s for 90112 events => throughput is 7.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3245s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2028s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1217s for 90112 events => throughput is 7.40E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -407,14 +407,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223783652032040) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.042794e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.064546e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.134968e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.181463e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3025s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2836s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0190s for 8192 events => throughput is 4.32E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3000s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2818s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0182s for 8192 events => throughput is 4.50E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4160s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2108s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2053s for 90112 events => throughput is 4.39E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4141s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2163s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1978s for 90112 events => throughput is 4.55E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -483,14 +483,14 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223783652032040) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.195485e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.215527e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.105555e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.203869e+05 ) sec^-1

 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +513,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.69 [47.690708266690699] fbridge_mode=1
  [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.6959s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6953s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.41E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.7253s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.7248s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.40E+07 events/s

 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -546,9 +546,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 46.22 [46.223782303744791] fbridge_mode=1
  [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.6281s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6217s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6584s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6519s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.39E+07 events/s

 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -559,43 +559,43 @@ OK! xsec from fortran (46.223782291775365) and cpp (46.223782303744791) differ b
 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.072602e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.858157e+07 ) sec^-1

 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.628306e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.585975e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.002417e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.575302e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.056552e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.048555e+08 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.008120e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.576352e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.133647e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.132080e+08 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.014244e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.576975e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.974355e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.969589e+07 ) sec^-1

 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
index 8868aa1905..52294c86ed 100644
--- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
@@ -7,19 +7,18 @@ make USEBUILDDIR=1 AVX=none
 make USEBUILDDIR=1 AVX=sse4
 make USEBUILDDIR=1 AVX=avx2
-
 make USEBUILDDIR=1 AVX=512y
-make USEBUILDDIR=1 AVX=512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 AVX=512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
@@ -28,14 +27,15 @@ make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'

 OMP_NUM_THREADS=

-DATE: 2023-07-18_23:39:42
+DATE: 2023-06-16_23:36:41

-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg

 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.0972 [] fbridge_mode=0
- [COUNTERS] PROGRAM TOTAL : 0.5221s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1707s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3515s for 8192 events => throughput is 2.33E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5432s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1981s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3452s for 8192 events => throughput is 2.37E+04 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -81,11 +81,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196357922470805E-002] fbridge_mode=0
+ [XSECTION] Cross section = 0.0972 [9.7195719386171234E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 40 events (found 738 events)
- [COUNTERS] PROGRAM TOTAL : 0.5998s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2469s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3529s for 8192 events => throughput is 2.32E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5864s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2409s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3455s for 8192 events => throughput is 2.37E+04 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -106,11 +106,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0
+ [XSECTION] Cross section = 0.08131 [8.1310872844967921E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.2309s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3729s
- [COUNTERS] Fortran MEs ( 1 ) : 3.8580s for 90112 events => throughput is 2.34E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 5.1533s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.3710s
+ [COUNTERS] Fortran MEs ( 1 ) : 3.7823s for 90112 events => throughput is 2.38E+04 events/s

 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -131,15 +131,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196357922470791E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7195719386171234E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 40 events (found 738 events)
- [COUNTERS] PROGRAM TOTAL : 0.8963s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5674s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3288s for 8192 events => throughput is 2.49E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.8655s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5504s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3151s for 8192 events => throughput is 2.60E+04 events/s

 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196357922470791E-002) differ by less than 2E-14 (1.1102230246251565e-16)
+OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195719386171234E-002) differ by less than 2E-14 (0.0)

 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -164,29 +164,29 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310872844967963E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.3437s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.7009s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.6428s for 90112 events => throughput is 2.47E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 5.2253s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.7057s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 3.5196s for 90112 events => throughput is 2.56E+04 events/s

 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655597E-002) differ by less than 2E-14 (4.440892098500626e-16)
+OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310872844967963E-002) differ by less than 2E-14 (4.440892098500626e-16)

 *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.551954e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.634187e+04 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.532896e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.657523e+04 ) sec^-1

 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -207,15 +207,15 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7195719386171234E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 40 events (found 738 events)
- [COUNTERS] PROGRAM TOTAL : 0.5879s
+ [COUNTERS] PROGRAM TOTAL : 0.5827s
  [COUNTERS] Fortran Overhead ( 0 ) : 0.4172s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1707s for 8192 events => throughput is 4.80E+04 events/s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1654s for 8192 events => throughput is 4.95E+04 events/s

 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196357922470777E-002) differ by less than 2E-14 (3.3306690738754696e-16)
+OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195719386171234E-002) differ by less than 2E-14 (0.0)

 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -240,29 +240,29 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310872844967921E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 3.4725s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5669s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.9056s for 90112 events => throughput is 4.73E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 3.4871s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.5641s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.9229s for 90112 events => throughput is 4.69E+04 events/s

 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (0.0)
+OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310872844967921E-002) differ by less than 2E-14 (0.0)

 *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.885132e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.964351e+04 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.909068e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.061720e+04 ) sec^-1

 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -283,15 +283,15 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7195719386171206E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 40 events (found 738 events)
- [COUNTERS] PROGRAM TOTAL : 0.4230s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3342s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0888s for 8192 events => throughput is 9.23E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4029s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3246s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0783s for 8192 events => throughput is 1.05E+05 events/s

 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196357922470750E-002) differ by less than 2E-14 (5.551115123125783e-16)
+OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195719386171206E-002) differ by less than 2E-14 (3.3306690738754696e-16)

 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -316,29 +316,29 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310872844967907E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.3458s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4550s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.8908s for 90112 events => throughput is 1.01E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.3698s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4879s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.8819s for 90112 events => throughput is 1.02E+05 events/s

 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655541E-002) differ by less than 2E-14 (2.220446049250313e-16)
+OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310872844967907E-002) differ by less than 2E-14 (2.220446049250313e-16)

 *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.016436e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.047204e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.023538e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.053567e+05 ) sec^-1

 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -359,15 +359,15 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7195719386171206E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 40 events (found 738 events)
- [COUNTERS] PROGRAM TOTAL : 0.3872s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3152s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0720s for 8192 events => throughput is 1.14E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3886s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3177s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0709s for 8192 events => throughput is 1.15E+05 events/s

 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196357922470750E-002) differ by less than 2E-14 (5.551115123125783e-16)
+OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195719386171206E-002) differ by less than 2E-14 (3.3306690738754696e-16)

 *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -392,29 +392,29 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310872844967907E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.2612s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4623s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.7989s for 90112 events => throughput is 1.13E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.2440s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4672s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.7768s for 90112 events => throughput is 1.16E+05 events/s

 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655541E-002) differ by less than 2E-14 (2.220446049250313e-16)
+OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310872844967907E-002) differ by less than 2E-14 (2.220446049250313e-16)

 *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.171611e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.177490e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.155393e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.191999e+05 ) sec^-1

 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -435,15 +435,15 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7195719386171234E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 40 events (found 738 events)
- [COUNTERS] PROGRAM TOTAL : 0.4445s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3448s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0997s for 8192 events => throughput is 8.22E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4369s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3403s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0966s for 8192 events => throughput is 8.48E+04 events/s

 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196357922470750E-002) differ by less than 2E-14 (5.551115123125783e-16)
+OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195719386171234E-002) differ by less than 2E-14 (0.0)

 *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -468,29 +468,29 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310872844967907E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.5775s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4738s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.1036s for 90112 events => throughput is 8.17E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.5408s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4804s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.0604s for 90112 events => throughput is 8.50E+04 events/s

 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655541E-002) differ by less than 2E-14 (2.220446049250313e-16)
+OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310872844967907E-002) differ by less than 2E-14 (2.220446049250313e-16)

 *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.663037e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.369247e+04 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.136831e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.461216e+04 ) sec^-1

 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -511,15 +511,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7195719386171220E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 40 events (found 738 events)
- [COUNTERS] PROGRAM TOTAL : 0.6809s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6792s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.76E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.7104s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.7087s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.77E+06 events/s

 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196357922470764E-002) differ by less than 2E-14 (4.440892098500626e-16)
+OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195719386171220E-002) differ by less than 2E-14 (1.1102230246251565e-16)

 *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -544,58 +544,58 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310872844967977E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 1.8266s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.8074s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0191s for 90112 events => throughput is 4.71E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.8754s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.8562s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0193s for 90112 events => throughput is 4.68E+06 events/s

 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655597E-002) differ by less than 2E-14 (4.440892098500626e-16)
+OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310872844967977E-002) differ by less than 2E-14 (6.661338147750939e-16)

 *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.265641e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.108652e+06 ) sec^-1

 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.394415e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.056830e+06 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.627373e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.357483e+06 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.236143e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.238541e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.636530e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.098545e+06 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.245580e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.247043e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.631703e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.065031e+06 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.755728e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.797581e+06 ) sec^-1

 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
index 08af34efd8..ac98f0132b 100644
--- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
@@ -4,38 +4,38 @@ CUDACPP_BUILDDIR='.'
 make USEBUILDDIR=1 AVX=none
-make USEBUILDDIR=1 AVX=sse4
+make USEBUILDDIR=1 AVX=sse4
 make USEBUILDDIR=1 AVX=avx2
 make USEBUILDDIR=1 AVX=512y
-
-make USEBUILDDIR=1 AVX=512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 AVX=512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'

 OMP_NUM_THREADS=

-DATE: 2023-07-18_23:40:23
+DATE: 2023-06-16_23:37:23

-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg

 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.0972 [] fbridge_mode=0
- [COUNTERS] PROGRAM TOTAL : 0.5187s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1672s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3515s for 8192 events => throughput is 2.33E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5130s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1657s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3473s for 8192 events => throughput is 2.36E+04 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -81,11 +81,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196357922470805E-002] fbridge_mode=0
+ [XSECTION] Cross section = 0.0972 [9.7195719386171234E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 40 events (found 738 events)
- [COUNTERS] PROGRAM TOTAL : 0.5925s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2438s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3487s for 8192 events => throughput is 2.35E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.6055s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2536s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3519s for 8192 events => throughput is 2.33E+04 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -106,11 +106,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0
+ [XSECTION] Cross section = 0.08131 [8.1310872844967921E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.2118s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3751s
- [COUNTERS] Fortran MEs ( 1 ) : 3.8366s for 90112 events => throughput is 2.35E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 5.1398s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.3678s
+ [COUNTERS] Fortran MEs ( 1 ) : 3.7720s for 90112 events => throughput is 2.39E+04 events/s

 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -131,15 +131,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196349725192449E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7195711188152623E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 40 events (found 738 events)
- [COUNTERS] PROGRAM TOTAL : 0.8860s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5612s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3248s for 8192 events => throughput is 2.52E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.8594s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5459s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3135s for 8192 events => throughput is 2.61E+04 events/s

 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196349725192449E-002) differ by less than 4E-4 (8.433729958845504e-08)
+OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195711188152623E-002) differ by less than 4E-4 (8.434546971969326e-08)

 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -164,29 +164,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310860682799649E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310861450156910E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.3128s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.7147s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.5981s for 90112 events => throughput is 2.50E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 5.1084s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6927s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 3.4157s for 90112 events => throughput is 2.64E+04 events/s

 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310860682799649E-002) differ by less than 4E-4 (1.4013938864909647e-07)
+OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310861450156910E-002) differ by less than 4E-4 (1.401388352029187e-07)

 *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.584606e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.731642e+04 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.572586e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.740748e+04 ) sec^-1

 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -207,15 +207,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196325939550202E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7195687405490658E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 40 events (found 738 events)
- [COUNTERS] PROGRAM TOTAL : 0.4348s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3390s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0958s for 8192 events => throughput is 8.55E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4313s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3404s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0909s for 8192 events => throughput is 9.02E+04 events/s

 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196325939550202E-002) differ by less than 4E-4 (3.290547226919571e-07)
+OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195687405490658E-002) differ by less than 4E-4 (3.290338378425517e-07)

 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -240,29 +240,29 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310854076870026E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310854844234101E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.5193s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4678s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.0516s for 90112 events => throughput is 8.57E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.5514s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4872s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.0642s for 90112 events => throughput is 8.47E+04 events/s

 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310854076870026E-002) differ by less than 4E-4 (2.213822711816249e-07)
+OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310854844234101E-002) differ by less than 4E-4 (2.2138163313645265e-07)

 *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.660520e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.703250e+04 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.723830e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.982646e+04 ) sec^-1

 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -283,15 +283,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196353680794059E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7195715140566227E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 40 events (found 738 events)
- [COUNTERS] PROGRAM TOTAL : 0.3269s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2857s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3260s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2868s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0392s for 8192 events => throughput is 2.09E+05 events/s

 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196353680794059E-002) differ by less than 4E-4 (4.364028483028193e-08)
+OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195715140566227E-002) differ by less than 4E-4 (4.368098749374383e-08)

 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -316,29 +316,29 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310850468770415E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310851236127482E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 1.8622s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4138s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.4484s for 90112 events => throughput is 2.01E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.8684s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4327s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.4357s for 90112 events => throughput is 2.07E+05 events/s

 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310850468770415E-002) differ by less than 4E-4 (2.657564061037121e-07)
+OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310851236127482E-002) differ by less than 4E-4 (2.6575585387877965e-07)

 *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.036815e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.029319e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.040467e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.080772e+05 ) sec^-1

 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -359,15 +359,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196353680794059E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7195715140566227E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 40 events (found 738 events)
- [COUNTERS] PROGRAM TOTAL : 0.3180s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2816s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0365s for 8192 events => throughput is 2.25E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3161s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2805s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0356s for 8192 events => throughput is 2.30E+05 events/s

 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196353680794059E-002) differ by less than 4E-4 (4.364028483028193e-08)
+OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195715140566227E-002) differ by less than 4E-4 (4.368098749374383e-08)

 *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -392,29 +392,29 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310850468770415E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310851236127482E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 1.8119s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4094s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.4025s for 90112 events => throughput is 2.24E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.8280s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4335s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3946s for 90112 events => throughput is 2.28E+05 events/s

 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310850468770415E-002) differ by less than 4E-4 (2.657564061037121e-07)
+OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310851236127482E-002) differ by less than 4E-4 (2.6575585387877965e-07)

 *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.259149e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.320491e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.266261e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.339938e+05 ) sec^-1

 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -435,15 +435,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196366042348534E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7195727520443878E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 40 events (found 738 events)
- [COUNTERS] PROGRAM TOTAL : 0.3443s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2957s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0486s for 8192 events => throughput is 1.69E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3397s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2926s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0470s for 8192 events => throughput is 1.74E+05 events/s

 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196366042348534E-002) differ by less than 4E-4 (8.354096703300229e-08)
+OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195727520443878E-002) differ by less than 4E-4 (8.368961812443843e-08)

 *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -468,29 +468,29 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310861004511001E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310861771879989E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 1.9758s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4329s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.5428s for 90112 events => throughput is 1.66E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.9470s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4325s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.5145s for 90112 events => throughput is 1.75E+05 events/s

 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310861004511001E-002) differ by less than 4E-4 (1.36182828569531e-07)
+OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310861771879989E-002) differ by less than 4E-4 (1.3618213090538234e-07)

 *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.633621e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.739989e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.671653e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.725640e+05 ) sec^-1

 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -511,15 +511,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196349366365994E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7195710869056637E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 40 events (found 738 events)
- [COUNTERS] PROGRAM TOTAL : 0.6848s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6840s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.62E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.7105s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.7096s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.53E+06 events/s

 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196349366365994E-002) differ by less than 4E-4 (8.802906814597833e-08)
+OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195710869056637E-002) differ by less than 4E-4 (8.762849490473457e-08)

 *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -544,58 +544,58 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310864949473968E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310865716831132E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 1.8094s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.8000s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 90112 events => throughput is 9.54E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.8990s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.8894s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0097s for 90112 events => throughput is 9.33E+06 events/s

 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310864949473968E-002) differ by less than 4E-4 (8.766578696306482e-08)
+OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310865716831132E-002) differ by less than 4E-4 (8.766523518222158e-08)

 *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.312152e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.264826e+07 ) sec^-1

 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.855833e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.844127e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.626006e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.461458e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.302964e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.280416e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.660515e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.442329e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.514362e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.431551e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.487452e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.330311e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.622543e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.602408e+07 ) sec^-1

 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
index 8f3227653b..de3b503603 100644
--- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
@@ -1,41 +1,41 @@
 Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 CUDACPP_BUILDDIR='.'
-make USEBUILDDIR=1 AVX=none
+make USEBUILDDIR=1 AVX=none
-make USEBUILDDIR=1 AVX=avx2
 make USEBUILDDIR=1 AVX=sse4
+make USEBUILDDIR=1 AVX=avx2
 make USEBUILDDIR=1 AVX=512y
-
-make USEBUILDDIR=1 AVX=512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 AVX=512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
-CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'

 OMP_NUM_THREADS=

-DATE: 2023-07-18_23:41:00
+DATE: 2023-06-16_23:38:00

-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg

 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.0972 [] fbridge_mode=0
- [COUNTERS] PROGRAM TOTAL : 0.5157s
+ [COUNTERS] PROGRAM TOTAL : 0.5113s
  [COUNTERS] Fortran Overhead ( 0 ) : 0.1671s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3487s for 8192 events => throughput is 2.35E+04 events/s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3442s for 8192 events => throughput is 2.38E+04 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -81,11 +81,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196357922470805E-002] fbridge_mode=0
+ [XSECTION] Cross section = 0.0972 [9.7195719386171234E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 40 events (found 738 events)
- [COUNTERS] PROGRAM TOTAL : 0.6183s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2503s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3680s for 8192 events => throughput is 2.23E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.6040s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2440s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3600s for 8192 events => throughput is 2.28E+04 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -106,11 +106,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0
+ [XSECTION] Cross section = 0.08131 [8.1310872844967921E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.2542s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3824s
- [COUNTERS] Fortran MEs ( 1 ) : 3.8718s for 90112 events => throughput is 2.33E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 5.1624s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.3666s
+ [COUNTERS] Fortran MEs ( 1 ) : 3.7958s for 90112 events => throughput is 2.37E+04 events/s

 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -131,15 +131,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196358763382007E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7195720226233587E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 40 events (found 738 events)
- [COUNTERS] PROGRAM TOTAL : 0.9109s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5735s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3374s for 8192 events => throughput is 2.43E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.8805s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5588s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3217s for 8192 events => throughput is 2.55E+04 events/s

 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196358763382007E-002) differ by less than 2E-4 (8.651674043846924e-09)
+OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195720226233587E-002) differ by less than 2E-4 (8.642997428864874e-09)

 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -164,29 +164,29 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310872835011053E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310873602323142E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.4152s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.7023s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.7128s for 90112 events => throughput is 2.43E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 5.2705s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.7115s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 3.5590s for 90112 events => throughput is 2.53E+04 events/s

 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872835011053E-002) differ by less than 2E-4 (9.31432020401246e-09)
+OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310873602323142E-002) differ by less than 2E-4 (9.314316651298782e-09)

 *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.446575e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.610260e+04 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.509101e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.619292e+04 ) sec^-1

 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -207,15 +207,15 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196358804670396E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7195720267415450E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 40 events (found 738 events)
- [COUNTERS] PROGRAM TOTAL : 0.5806s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4114s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1692s for 8192 events => throughput is 4.84E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5849s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4204s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1645s for 8192 events => throughput is 4.98E+04 events/s

 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196358804670396E-002) differ by less than 2E-4 (9.076467577529002e-09)
+OK! xsec from fortran (9.7195719386171234E-002) and cpp (9.7195720267415450E-002) differ by less than 2E-4 (9.066697836956905e-09)

 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -240,29 +240,29 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310872836789727E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310873604102080E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 3.4625s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5672s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.8953s for 90112 events => throughput is 4.75E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 3.4135s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.5611s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.8524s for 90112 events => throughput is 4.86E+04 events/s

 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872836789727E-002) differ by less than 2E-4 (9.336195150311255e-09)
+OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310873604102080E-002) differ by less than 2E-4 (9.33619492826665e-09)

 *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.922036e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.764708e+04 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.953512e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.772730e+04 ) sec^-1

 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -283,15 +283,15 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196358586501358E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7195720049465126E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 40 events (found 738 events)
- [COUNTERS] PROGRAM TOTAL : 0.4053s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3251s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0802s for 8192 events => throughput is 1.02E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4005s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3219s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0786s for 8192 events => throughput is 1.04E+05 events/s

 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196358586501358E-002) differ by less than 2E-4 (6.831845977828266e-09)
+OK!
xsec from fortran (9.7195719386171234E-002) and cpp (9.7195720049465126E-002) differ by less than 2E-4 (6.824311782338555e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -316,29 +316,29 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872708918333E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310873476230255E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.3314s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4457s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8857s for 90112 events => throughput is 1.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3277s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4694s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8583s for 90112 events => throughput is 1.05E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872708918333E-002) differ by less than 2E-4 (7.763571563401683e-09) +OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310873476230255E-002) differ by less than 2E-4 (7.76356601228656e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.032189e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054740e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.032333e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.057335e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -359,15 +359,15 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196358586501358E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7195720049465126E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.3878s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3172s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0706s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3809s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3126s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0683s for 8192 events => throughput is 1.20E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196358586501358E-002) differ by less than 2E-4 (6.831845977828266e-09) +OK! 
xsec from fortran (9.7195719386171234E-002) and cpp (9.7195720049465126E-002) differ by less than 2E-4 (6.824311782338555e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -392,29 +392,29 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872708918333E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310873476230255E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.2108s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4354s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7754s for 90112 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2562s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4769s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7793s for 90112 events => throughput is 1.16E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872708918333E-002) differ by less than 2E-4 (7.763571563401683e-09) +OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310873476230255E-002) differ by less than 2E-4 (7.76356601228656e-09) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.188297e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.193684e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.181150e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.221094e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -435,15 +435,15 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196358757578441E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7195720220276491E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.4510s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3480s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1030s for 8192 events => throughput is 7.95E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4495s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3476s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1019s for 8192 events => throughput is 8.04E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196358757578441E-002) differ by less than 2E-4 (8.591964251181139e-09) +OK! 
xsec from fortran (9.7195719386171234E-002) and cpp (9.7195720220276491E-002) differ by less than 2E-4 (8.581707788835047e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -468,29 +468,29 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872803699391E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310873571012007E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.6100s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4743s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1357s for 90112 events => throughput is 7.93E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6068s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4960s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1108s for 90112 events => throughput is 8.11E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872803699391E-002) differ by less than 2E-4 (8.929234462939917e-09) +OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310873571012007E-002) differ by less than 2E-4 (8.92923734951978e-09) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.916056e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.078718e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.993339e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.044468e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -511,15 +511,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196358102981245E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7195719566775987E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.6779s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6762s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.71E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7145s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7128s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.63E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470805E-002) and cpp (9.7196358102981245E-002) differ by less than 2E-4 (1.8571728599425796e-09) +OK! 
xsec from fortran (9.7195719386171234E-002) and cpp (9.7195719566775987E-002) differ by less than 2E-4 (1.858155407319373e-09) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -544,58 +544,58 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872068634174E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872835946929E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8296s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8104s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 90112 events => throughput is 4.70E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.8693s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8501s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 90112 events => throughput is 4.69E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872068634174E-002) differ by less than 2E-4 (1.1094924978749532e-10) +OK! xsec from fortran (8.1310872844967921E-002) and cpp (8.1310872835946929E-002) differ by less than 2E-4 (1.1094447582848943e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.183642e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.091918e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.584600e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.629216e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.650819e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.133810e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.235546e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.233339e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.634710e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.048619e+06 ) 
sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.245773e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.238445e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.629817e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.073517e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.727220e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.781744e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index bd446b1ac3..56b03784ad 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -2,28 +2,28 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y -make USEBUILDDIR=1 AVX=512z +make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
@@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-18_23:41:41 +DATE: 2023-06-16_23:38:42 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 4.4496s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2184s - [COUNTERS] Fortran MEs ( 1 ) : 4.2312s for 8192 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3757s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2161s + [COUNTERS] Fortran MEs ( 1 ) : 4.1595s for 8192 events => throughput is 1.97E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 49 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 4.5168s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3053s - [COUNTERS] Fortran MEs ( 1 ) : 4.2114s for 8192 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4480s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2999s + [COUNTERS] Fortran MEs ( 1 ) : 4.1481s for 8192 events => throughput is 1.97E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -106,11 +106,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000158 [1.5803725748610604E-004] fbridge_mode=0 [UNWEIGHT] Wrote 204 events (found 1633 events) - [COUNTERS] PROGRAM TOTAL : 48.2267s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8750s - [COUNTERS] Fortran MEs ( 1 ) : 46.3517s for 90112 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.5995s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8973s + [COUNTERS] Fortran MEs ( 1 ) : 45.7021s for 90112 events => throughput is 1.97E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 49 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 8.8983s - [COUNTERS] Fortran Overhead ( 0 ) : 4.5345s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3638s for 8192 events => throughput is 1.88E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.5650s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3703s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.1947s for 8192 events => throughput is 1.95E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 7f03742433..fbd2ca3bdb 100644 --- 
a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -2,26 +2,26 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none - +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y - -make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-18_23:42:50 +DATE: 2023-06-16_23:39:50 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 4.4279s + [COUNTERS] PROGRAM TOTAL : 4.4077s [COUNTERS] Fortran Overhead ( 0 ) : 0.2143s - [COUNTERS] Fortran MEs ( 1 ) : 4.2136s for 8192 events => throughput is 1.94E+03 events/s + [COUNTERS] Fortran MEs ( 1 ) : 4.1934s for 8192 events => throughput is 1.95E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 49 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 4.5251s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3044s - [COUNTERS] Fortran MEs ( 1 ) : 4.2207s for 8192 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4393s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3039s + [COUNTERS] Fortran MEs ( 1 ) : 4.1354s for 8192 events => throughput is 1.98E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** 
-------------------- @@ -106,11 +106,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000158 [1.5803725748610604E-004] fbridge_mode=0 [UNWEIGHT] Wrote 204 events (found 1633 events) - [COUNTERS] PROGRAM TOTAL : 48.4173s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8860s - [COUNTERS] Fortran MEs ( 1 ) : 46.5313s for 90112 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.6726s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8981s + [COUNTERS] Fortran MEs ( 1 ) : 45.7745s for 90112 events => throughput is 1.97E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277396515517582E-004] fbridge_mode=1 [UNWEIGHT] Wrote 49 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 8.6591s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4291s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.2300s for 8192 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.4058s + [COUNTERS] Fortran Overhead ( 0 ) : 4.2739s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.1319s for 8192 events => throughput is 1.98E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 3e6a0c1d61..c64d8630ed 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -3,22 +3,23 @@ CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y - -make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 AVX=512y + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -27,15 +28,14 @@ make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2023-07-18_23:43:58 +DATE: 2023-06-16_23:40:57 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 4.4648s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2148s - [COUNTERS] Fortran MEs ( 1 ) : 4.2501s for 8192 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4170s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2154s + [COUNTERS] Fortran MEs ( 1 ) : 4.2016s for 8192 events => throughput is 1.95E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 49 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 4.5404s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3038s - [COUNTERS] Fortran MEs ( 1 ) : 4.2366s for 8192 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.6794s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3042s + [COUNTERS] Fortran MEs ( 1 ) : 4.3752s for 8192 events => throughput is 1.87E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -106,11 +106,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000158 [1.5803725748610604E-004] fbridge_mode=0 [UNWEIGHT] Wrote 204 events (found 1633 events) - [COUNTERS] PROGRAM TOTAL : 48.3786s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8864s - [COUNTERS] Fortran MEs ( 1 ) : 46.4922s for 90112 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.7432s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9035s + [COUNTERS] Fortran MEs ( 1 ) : 45.8397s for 90112 events => throughput is 1.97E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277432965013E-004] fbridge_mode=1 [UNWEIGHT] Wrote 49 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 9.0570s - [COUNTERS] Fortran Overhead ( 0 ) : 4.6177s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.4393s for 8192 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.6986s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4272s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2714s for 8192 events => throughput is 1.92E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to 
MADEVENT_FORTRAN xsec *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index c2298517a8..39ca5692cf 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -4,28 +4,28 @@ CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y - -make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
@@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-18_23:45:07 +DATE: 2023-06-16_23:42:06 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 99.3861s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4217s - [COUNTERS] Fortran MEs ( 1 ) : 98.9644s for 8192 events => throughput is 8.28E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.2551s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4152s + [COUNTERS] Fortran MEs ( 1 ) : 95.8399s for 8192 events => throughput is 8.55E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435808E-006] fbridge_mode=0 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 99.4917s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4846s - [COUNTERS] Fortran MEs ( 1 ) : 99.0072s for 8192 events => throughput is 8.27E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.4720s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4731s + [COUNTERS] Fortran MEs ( 1 ) : 95.9989s for 8192 events => throughput is 8.53E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 1092.0393s - [COUNTERS] Fortran Overhead ( 0 ) : 4.0525s - [COUNTERS] Fortran MEs ( 1 ) : 1087.9868s for 90112 events => throughput is 8.28E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1059.0573s + [COUNTERS] Fortran Overhead ( 0 ) : 4.0327s + [COUNTERS] Fortran MEs ( 1 ) : 1055.0245s for 90112 events => throughput is 8.54E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 222.5426s - [COUNTERS] Fortran Overhead ( 0 ) : 102.9425s - [COUNTERS] CudaCpp MEs ( 2 ) : 119.6001s for 8192 events => throughput is 6.85E+01 events/s + [COUNTERS] PROGRAM TOTAL : 215.3874s + [COUNTERS] Fortran Overhead ( 0 ) : 96.8318s + [COUNTERS] CudaCpp MEs ( 2 ) : 118.5557s for 8192 events => throughput is 6.91E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813953E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 1465.7798s - [COUNTERS] Fortran 
Overhead ( 0 ) : 106.1520s - [COUNTERS] CudaCpp MEs ( 2 ) : 1359.6278s for 90112 events => throughput is 6.63E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1357.2262s + [COUNTERS] Fortran Overhead ( 0 ) : 100.3230s + [COUNTERS] CudaCpp MEs ( 2 ) : 1256.9032s for 90112 events => throughput is 7.17E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -179,14 +179,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813953E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.803412e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.518822e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.816908e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.469802e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 111.7257s - [COUNTERS] Fortran Overhead ( 0 ) : 51.7792s - [COUNTERS] CudaCpp MEs ( 2 ) : 59.9465s for 8192 events => throughput is 1.37E+02 events/s + [COUNTERS] PROGRAM TOTAL : 109.0441s + [COUNTERS] Fortran Overhead ( 0 ) : 50.2199s + [COUNTERS] CudaCpp MEs ( 2 ) : 58.8242s for 8192 events => throughput is 1.39E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 715.6348s - [COUNTERS] Fortran Overhead ( 0 ) : 55.3978s - [COUNTERS] CudaCpp MEs ( 2 ) : 660.2371s for 90112 events => throughput is 1.36E+02 events/s + [COUNTERS] PROGRAM TOTAL : 705.3776s + [COUNTERS] Fortran Overhead ( 0 ) : 53.9344s + [COUNTERS] CudaCpp MEs ( 2 ) : 651.4432s for 90112 events => throughput is 1.38E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -255,14 +255,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.594296e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.640675e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.596087e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.645178e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 52.0589s - [COUNTERS] Fortran Overhead ( 0 ) : 23.9023s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.1566s for 8192 events => throughput is 2.91E+02 events/s + [COUNTERS] PROGRAM TOTAL : 51.2174s + [COUNTERS] Fortran Overhead ( 0 ) : 23.3441s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.8733s for 8192 events => throughput is 2.94E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 339.5675s - [COUNTERS] Fortran Overhead ( 0 ) : 27.9186s - [COUNTERS] CudaCpp MEs ( 2 ) : 311.6490s for 90112 events => throughput is 2.89E+02 events/s + [COUNTERS] PROGRAM TOTAL : 332.6731s + [COUNTERS] Fortran Overhead ( 0 ) : 26.9170s + [COUNTERS] CudaCpp MEs ( 2 ) : 305.7561s for 90112 events => throughput is 2.95E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -331,14 +331,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.480989e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.549919e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.499686e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.597201e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 47.5811s - [COUNTERS] Fortran Overhead ( 0 ) : 21.8934s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.6877s for 8192 events => throughput is 3.19E+02 events/s + [COUNTERS] PROGRAM TOTAL : 45.3753s + [COUNTERS] Fortran Overhead ( 0 ) : 20.9676s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.4077s for 8192 events => throughput is 3.36E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 305.9962s - [COUNTERS] Fortran Overhead ( 0 ) : 25.0145s - [COUNTERS] CudaCpp MEs ( 2 ) : 280.9817s for 90112 events => throughput is 3.21E+02 events/s + [COUNTERS] PROGRAM TOTAL : 294.5042s + [COUNTERS] Fortran Overhead ( 0 ) : 24.4920s + [COUNTERS] CudaCpp MEs ( 2 ) : 270.0121s for 90112 events => throughput is 3.34E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -407,14 +407,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.924534e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.021260e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.941350e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.013283e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 47.6398s - [COUNTERS] Fortran Overhead ( 0 ) : 23.2794s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.3604s for 8192 events => throughput is 3.36E+02 events/s + [COUNTERS] PROGRAM TOTAL : 46.3863s + [COUNTERS] Fortran Overhead ( 0 ) : 22.5070s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.8792s for 8192 events => throughput is 3.43E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 295.7345s - [COUNTERS] Fortran Overhead ( 0 ) : 26.8620s - [COUNTERS] CudaCpp MEs ( 2 ) : 268.8725s for 90112 events => throughput is 3.35E+02 events/s + [COUNTERS] PROGRAM TOTAL : 289.0610s + [COUNTERS] Fortran Overhead ( 0 ) : 26.2218s + [COUNTERS] CudaCpp MEs ( 2 ) : 262.8391s for 90112 events => throughput is 3.43E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -483,14 +483,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.630819e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.740541e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.617459e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.745479e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435838E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 4.2578s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1700s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0877s for 8192 events => throughput is 7.53E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.2951s + [COUNTERS] Fortran Overhead ( 0 ) : 3.2087s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0864s for 8192 events => throughput is 7.54E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 18.7016s - [COUNTERS] Fortran Overhead ( 0 ) : 6.7987s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.9029s for 90112 events => throughput is 7.57E+03 events/s + [COUNTERS] PROGRAM TOTAL : 18.7118s + [COUNTERS] Fortran Overhead ( 0 ) : 6.7622s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9496s for 90112 events => throughput is 7.54E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -559,43 +559,43 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007 OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.502804e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.487117e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.272554e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.224789e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.173278e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.215613e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.536211e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.512340e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.207244e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.227523e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.442859e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.402144e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.218238e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.254965e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.222599e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.253935e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 
7ffa42b1e8..c7c6154514 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -4,28 +4,28 @@ CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y - -make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
@@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-19_01:15:18 +DATE: 2023-06-17_01:08:35 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 99.4700s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4182s - [COUNTERS] Fortran MEs ( 1 ) : 99.0518s for 8192 events => throughput is 8.27E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.9433s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4120s + [COUNTERS] Fortran MEs ( 1 ) : 95.5312s for 8192 events => throughput is 8.58E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435808E-006] fbridge_mode=0 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 99.3733s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4984s - [COUNTERS] Fortran MEs ( 1 ) : 98.8749s for 8192 events => throughput is 8.29E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.0818s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4708s + [COUNTERS] Fortran MEs ( 1 ) : 95.6110s for 8192 events => throughput is 8.57E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 1089.4213s - [COUNTERS] Fortran Overhead ( 0 ) : 4.0618s - [COUNTERS] Fortran MEs ( 1 ) : 1085.3595s for 90112 events => throughput is 8.30E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1056.4460s + [COUNTERS] Fortran Overhead ( 0 ) : 4.0785s + [COUNTERS] Fortran MEs ( 1 ) : 1052.3676s for 90112 events => throughput is 8.56E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -131,15 +131,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694768395202781E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694768395608941E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 202.7920s - [COUNTERS] Fortran Overhead ( 0 ) : 93.7840s - [COUNTERS] CudaCpp MEs ( 2 ) : 109.0081s for 8192 events => throughput is 7.52E+01 events/s + [COUNTERS] PROGRAM TOTAL : 201.5191s + [COUNTERS] Fortran Overhead ( 0 ) : 92.3001s + [COUNTERS] CudaCpp MEs ( 2 ) : 109.2190s for 8192 events => throughput is 7.50E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435808E-006) and cpp (1.1694768395202781E-006) differ by less than 4E-4 (0.00014260116069753082) +OK! 
xsec from fortran (1.1693100945435808E-006) and cpp (1.1694768395608941E-006) differ by less than 4E-4 (0.0001426011954326345) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -164,29 +164,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361436140448921E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361436148187123E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 1303.2103s - [COUNTERS] Fortran Overhead ( 0 ) : 97.8079s - [COUNTERS] CudaCpp MEs ( 2 ) : 1205.4025s for 90112 events => throughput is 7.48E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1305.0132s + [COUNTERS] Fortran Overhead ( 0 ) : 96.3805s + [COUNTERS] CudaCpp MEs ( 2 ) : 1208.6327s for 90112 events => throughput is 7.46E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361436140448921E-007) differ by less than 4E-4 (0.00014045886190539036) +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361436148187123E-007) differ by less than 4E-4 (0.00014045922420713453) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.764186e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.869030e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.757019e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.908799e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694766634537254E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 51.0027s - [COUNTERS] Fortran Overhead ( 0 ) : 24.0099s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.9929s for 8192 events => throughput is 3.03E+02 events/s + [COUNTERS] PROGRAM TOTAL : 48.8581s + [COUNTERS] Fortran Overhead ( 0 ) : 22.8819s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.9762s for 8192 events => throughput is 3.15E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361435622518579E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 324.5174s - [COUNTERS] Fortran Overhead ( 0 ) : 28.1889s - [COUNTERS] CudaCpp MEs ( 2 ) : 296.3285s for 90112 events => throughput is 3.04E+02 events/s + [COUNTERS] PROGRAM TOTAL : 311.8489s + [COUNTERS] Fortran Overhead ( 
0 ) : 26.4793s + [COUNTERS] CudaCpp MEs ( 2 ) : 285.3696s for 90112 events => throughput is 3.16E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -255,14 +255,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361435622518579E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.489004e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.654834e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.477635e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.636170e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694765364749936E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 26.5180s - [COUNTERS] Fortran Overhead ( 0 ) : 12.3665s - [COUNTERS] CudaCpp MEs ( 2 ) : 14.1515s for 8192 events => throughput is 5.79E+02 events/s + [COUNTERS] PROGRAM TOTAL : 25.9178s + [COUNTERS] Fortran Overhead ( 0 ) : 12.1541s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.7637s for 8192 events => throughput is 5.95E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361435955979457E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 172.2093s - [COUNTERS] Fortran Overhead ( 0 ) : 15.9965s - [COUNTERS] CudaCpp MEs ( 2 ) : 156.2128s for 90112 events => throughput is 5.77E+02 events/s + [COUNTERS] PROGRAM TOTAL : 167.6069s + [COUNTERS] Fortran Overhead ( 0 ) : 15.7359s + [COUNTERS] CudaCpp MEs ( 2 ) : 151.8710s for 90112 events => throughput is 5.93E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -331,14 +331,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361435955979457E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.926115e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.068292e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.916787e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.077191e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694765364749936E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 23.7524s - [COUNTERS] Fortran Overhead ( 0 ) : 11.0666s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.6858s for 8192 events => throughput is 6.46E+02 events/s + [COUNTERS] PROGRAM TOTAL : 23.2564s + [COUNTERS] Fortran Overhead ( 0 ) : 10.7772s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.4793s for 8192 events => throughput is 6.56E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361435955979457E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 153.6582s - [COUNTERS] Fortran Overhead ( 0 ) : 14.6902s - [COUNTERS] CudaCpp MEs ( 2 ) : 138.9680s for 90112 events => throughput is 6.48E+02 events/s + [COUNTERS] PROGRAM TOTAL : 152.0499s + [COUNTERS] Fortran Overhead ( 0 ) : 14.4286s + [COUNTERS] CudaCpp MEs ( 2 ) : 137.6212s for 90112 events => throughput is 6.55E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -407,14 +407,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361435955979457E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.812155e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.965515e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.766114e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.954223e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694767893082863E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 24.0249s - [COUNTERS] Fortran Overhead ( 0 ) : 11.7974s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.2275s for 8192 events => throughput is 6.70E+02 events/s + [COUNTERS] PROGRAM TOTAL : 23.5866s + [COUNTERS] Fortran Overhead ( 0 ) : 11.6220s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9645s for 8192 events => throughput is 6.85E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361441834174529E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 150.5771s - [COUNTERS] Fortran Overhead ( 0 ) : 15.4619s - [COUNTERS] CudaCpp MEs ( 2 ) : 135.1153s for 90112 events => throughput is 6.67E+02 events/s + [COUNTERS] PROGRAM TOTAL : 145.9255s + [COUNTERS] Fortran Overhead ( 0 ) : 15.0340s + [COUNTERS] CudaCpp MEs ( 2 ) : 130.8914s for 90112 events => throughput is 6.88E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -483,14 +483,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361441834174529E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.259660e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.533944e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.254966e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.436083e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694770708195000E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 2.4902s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9980s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4922s for 8192 events => throughput is 1.66E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.5187s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0274s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4913s for 8192 events => throughput is 1.67E+04 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361443477565659E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 11.1455s - [COUNTERS] Fortran Overhead ( 0 ) : 5.6649s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4806s for 90112 events => throughput is 1.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 11.0853s + [COUNTERS] Fortran Overhead ( 0 ) : 5.6057s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4795s for 90112 events => throughput is 1.64E+04 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -559,43 +559,43 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361443477565659E-007 OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.624769e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.640525e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.628076e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.649395e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.322584e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.325104e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.352873e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.391765e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.316394e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.322841e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.365575e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.376800e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.324891e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.338034e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.457308e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.503445e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 
bb4557ce90..40600c6dee 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -4,18 +4,17 @@ CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y -make USEBUILDDIR=1 AVX=512z +make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' @@ -24,6 +23,7 @@ make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,9 +33,9 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-19_02:23:00 +DATE: 2023-06-17_02:15:04 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 99.4193s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4375s - [COUNTERS] Fortran MEs ( 1 ) : 98.9818s for 8192 events => throughput is 8.28E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.1609s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4132s + [COUNTERS] Fortran MEs ( 1 ) : 95.7477s for 8192 events => throughput is 8.56E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435808E-006] fbridge_mode=0 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 99.9753s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4785s - [COUNTERS] Fortran MEs ( 1 ) : 99.4967s for 8192 events => throughput is 8.23E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.6436s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4751s + [COUNTERS] Fortran MEs ( 1 ) : 96.1685s for 8192 events => throughput is 8.52E+01 events/s *** 
(1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 1093.3960s - [COUNTERS] Fortran Overhead ( 0 ) : 4.0669s - [COUNTERS] Fortran MEs ( 1 ) : 1089.3291s for 90112 events => throughput is 8.27E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1059.8674s + [COUNTERS] Fortran Overhead ( 0 ) : 4.0491s + [COUNTERS] Fortran MEs ( 1 ) : 1055.8184s for 90112 events => throughput is 8.53E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101016896846E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 233.3234s - [COUNTERS] Fortran Overhead ( 0 ) : 107.7432s - [COUNTERS] CudaCpp MEs ( 2 ) : 125.5802s for 8192 events => throughput is 6.52E+01 events/s + [COUNTERS] PROGRAM TOTAL : 211.4841s + [COUNTERS] Fortran Overhead ( 0 ) : 97.1384s + [COUNTERS] CudaCpp MEs ( 2 ) : 114.3457s for 8192 events => throughput is 7.16E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436275882778E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 1483.7189s - [COUNTERS] Fortran Overhead ( 0 ) : 110.6562s - [COUNTERS] CudaCpp MEs ( 2 ) : 1373.0626s for 90112 events => throughput is 6.56E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1372.8556s + [COUNTERS] Fortran Overhead ( 0 ) : 101.8141s + [COUNTERS] CudaCpp MEs ( 2 ) : 1271.0415s for 90112 events => throughput is 7.09E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -179,14 +179,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436275882778E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.683350e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.162213e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.685547e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.331097e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101020910778E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 115.0299s - [COUNTERS] Fortran Overhead ( 0 ) : 52.5179s - [COUNTERS] CudaCpp MEs ( 2 ) : 62.5120s for 8192 events => throughput is 1.31E+02 events/s + [COUNTERS] PROGRAM TOTAL : 110.6811s + [COUNTERS] Fortran Overhead ( 0 ) : 51.3766s + [COUNTERS] CudaCpp MEs ( 2 ) : 59.3045s for 8192 events => throughput is 1.38E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436284111598E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 720.0316s - [COUNTERS] Fortran Overhead ( 0 ) : 56.0965s - [COUNTERS] CudaCpp MEs ( 2 ) : 663.9351s for 90112 events => throughput is 1.36E+02 events/s + [COUNTERS] PROGRAM TOTAL : 688.7864s + [COUNTERS] Fortran Overhead ( 0 ) : 54.7130s + [COUNTERS] CudaCpp MEs ( 2 ) : 634.0734s for 90112 events => throughput is 1.42E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -255,14 +255,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436284111598E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.574498e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.616861e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.565670e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.616649e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 50.7158s - [COUNTERS] Fortran Overhead ( 0 ) : 22.9716s - [COUNTERS] CudaCpp MEs ( 2 ) : 27.7442s for 8192 events => throughput is 2.95E+02 events/s + [COUNTERS] PROGRAM TOTAL : 49.6391s + [COUNTERS] Fortran Overhead ( 0 ) : 22.3277s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.3114s for 8192 events => throughput is 3.00E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 320.7675s - [COUNTERS] Fortran Overhead ( 0 ) : 26.3391s - [COUNTERS] CudaCpp MEs ( 2 ) : 294.4284s for 90112 events => throughput is 3.06E+02 events/s + [COUNTERS] PROGRAM TOTAL : 327.4154s + [COUNTERS] Fortran Overhead ( 0 ) : 25.8163s + [COUNTERS] CudaCpp MEs ( 2 ) : 301.5990s for 90112 events => throughput is 2.99E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -331,14 +331,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.668610e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.748239e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.661090e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.733522e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 44.9958s - [COUNTERS] Fortran Overhead ( 0 ) : 20.3872s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.6086s for 8192 events => throughput is 3.33E+02 events/s + [COUNTERS] PROGRAM TOTAL : 43.4217s + [COUNTERS] Fortran Overhead ( 0 ) : 19.9092s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.5125s for 8192 events => throughput is 3.48E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 291.0724s - [COUNTERS] Fortran Overhead ( 0 ) : 24.0309s - [COUNTERS] CudaCpp MEs ( 2 ) : 267.0414s for 90112 events => throughput is 3.37E+02 events/s + [COUNTERS] PROGRAM TOTAL : 282.1566s + [COUNTERS] Fortran Overhead ( 0 ) : 23.4368s + [COUNTERS] CudaCpp MEs ( 2 ) : 258.7198s for 90112 events => throughput is 3.48E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -407,14 +407,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.101365e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.246539e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.159557e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.237720e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 47.0065s - [COUNTERS] Fortran Overhead ( 0 ) : 22.5577s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.4489s for 8192 events => throughput is 3.35E+02 events/s + [COUNTERS] PROGRAM TOTAL : 45.4934s + [COUNTERS] Fortran Overhead ( 0 ) : 21.9129s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.5805s for 8192 events => throughput is 3.47E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 291.9821s - [COUNTERS] Fortran Overhead ( 0 ) : 26.0599s - [COUNTERS] CudaCpp MEs ( 2 ) : 265.9222s for 90112 events => throughput is 3.39E+02 events/s + [COUNTERS] PROGRAM TOTAL : 285.7173s + [COUNTERS] Fortran Overhead ( 0 ) : 25.4131s + [COUNTERS] CudaCpp MEs ( 2 ) : 260.3043s for 90112 events => throughput is 3.46E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -483,14 +483,14 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.739816e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.849415e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.730410e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.819185e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100942770687E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 3.6269s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7646s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8623s for 8192 events => throughput is 9.50E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.6199s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7558s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8641s for 8192 events => throughput is 9.48E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436157495368E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 15.8537s - [COUNTERS] Fortran Overhead ( 0 ) : 6.3851s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.4686s for 90112 events => throughput is 9.52E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.8257s + [COUNTERS] Fortran Overhead ( 0 ) : 6.3170s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.5087s for 90112 events => throughput is 9.48E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -559,43 +559,43 @@ OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436157495368E-007 OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.456219e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.405516e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.083869e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.081261e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.105733e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.109602e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.160235e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.160351e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.109947e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107621e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.104461e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111526e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.107698e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.109298e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.641814e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.655752e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 
97fdf2746a..2f9c234dd5 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-18_22:37:20 +DATE: 2023-06-16_22:48:21 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.475387e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.820978e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.778893e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.277441e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.165288e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.736817e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.855531 sec - 2,833,192,783 cycles # 2.924 GHz - 4,325,389,575 instructions # 1.53 insn per cycle - 1.187306501 seconds time elapsed +TOTAL : 0.697989 sec + 2,734,672,671 cycles # 2.891 GHz + 3,867,376,774 instructions # 1.41 insn per cycle + 1.006665473 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.195619e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.419858e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.419858e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.201835e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.490501e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.490501e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.668235 sec - 17,201,910,416 cycles # 3.033 GHz - 40,424,836,344 instructions # 2.35 insn per cycle - 5.676367615 seconds time elapsed +TOTAL : 5.636481 sec + 17,424,894,515 cycles # 3.090 GHz + 41,067,496,174 instructions # 2.36 insn per cycle + 5.642861047 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) 
(avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.110803e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.016058e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.016058e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.047757e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.152885e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.152885e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.385940 sec - 10,308,474,989 cycles # 3.040 GHz - 24,683,556,153 instructions # 2.39 insn per cycle - 3.396019895 seconds time elapsed +TOTAL : 3.473381 sec + 10,707,811,255 cycles # 3.080 GHz + 25,328,572,543 instructions # 2.37 insn per cycle + 3.485749163 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1283) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.235785e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.797359e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.797359e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.975289e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.963650e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.963650e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.343102 sec - 6,894,343,443 cycles # 2.936 GHz - 13,677,655,108 instructions # 1.98 insn per cycle - 2.351884504 seconds time elapsed +TOTAL : 2.514299 sec + 7,529,377,193 cycles # 2.995 GHz + 14,324,765,141 instructions # 1.90 insn per cycle + 2.525874425 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.391389e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.266576e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.266576e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.078510e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.389306e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.389306e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.252146 sec - 6,641,349,095 cycles # 2.944 GHz - 13,370,765,818 instructions # 2.01 insn per cycle - 2.260544049 seconds time elapsed +TOTAL : 2.437364 sec + 7,311,719,824 cycles # 2.995 GHz + 14,031,232,859 instructions # 1.92 insn per cycle + 2.448705799 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.179961e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.588825e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.588825e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.939663e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.739935e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.739935e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.381395 sec - 5,905,623,572 cycles # 2.474 GHz - 10,160,843,578 instructions # 1.72 insn per cycle - 2.390440152 seconds time elapsed +TOTAL : 2.537461 sec + 6,538,056,350 cycles # 2.572 GHz + 10,814,168,036 instructions # 1.65 insn per cycle + 2.543993268 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 289) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt 
b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 328f839ecf..413524a714 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-18_23:22:26 +DATE: 2023-06-16_23:18:49 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.629780e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.551524e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.551524e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.138466e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.761625e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.761625e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.253250 sec - 7,452,417,366 cycles # 2.981 GHz - 13,168,688,030 instructions # 1.77 insn per cycle - 2.555215575 seconds time elapsed +TOTAL : 2.435025 sec + 8,057,144,700 cycles # 2.996 GHz + 13,634,238,127 instructions # 1.69 insn per cycle + 2.748108130 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,19 +72,19 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.150979e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.358888e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.358888e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.146185e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.405096e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.405096e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.071511 sec - 18,441,155,303 cycles # 3.036 GHz - 40,655,259,371 instructions # 2.20 insn per cycle - 6.078353672 seconds time elapsed +TOTAL : 6.106864 sec + 18,682,647,016 cycles # 3.057 GHz + 41,378,608,262 instructions # 2.21 insn per cycle + 6.114253847 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -99,19 +99,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.945018e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.698668e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.698668e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.923353e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.849109e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.849109e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.846123 sec - 11,536,385,327 cycles # 2.995 GHz - 25,528,557,805 instructions # 2.21 insn per cycle - 3.852649513 seconds time elapsed +TOTAL : 3.885345 sec + 11,937,212,794 cycles # 3.069 GHz + 26,176,863,335 instructions # 2.19 insn per cycle + 3.901575111 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1283) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -126,19 +126,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.922595e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.906457e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.906457e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.738552e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.036567e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.036567e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.761542 sec - 8,147,443,517 cycles # 2.944 GHz - 15,038,071,882 instructions # 1.85 insn per cycle - 2.768392811 seconds time elapsed +TOTAL : 2.911902 sec + 8,807,623,027 cycles # 3.018 GHz + 15,689,801,673 instructions # 1.78 insn per cycle + 2.919670444 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -153,19 +153,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.016753e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.161698e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.161698e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.823116e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.286837e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.286837e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.698772 sec - 7,885,285,527 cycles # 2.915 GHz - 14,731,068,956 instructions # 1.87 insn per cycle - 2.705638196 seconds time elapsed +TOTAL : 2.843015 sec + 8,509,599,499 cycles # 2.988 GHz + 15,397,182,071 instructions # 1.81 insn per cycle + 2.850349634 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -180,19 +180,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.767252e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.506464e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.506464e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.546825e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.539933e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.539933e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.899290 sec - 7,240,703,756 cycles # 2.492 GHz - 11,306,846,019 instructions # 1.56 insn per cycle - 2.906107924 seconds time elapsed +TOTAL : 3.102839 sec + 7,964,585,861 cycles # 2.565 GHz + 11,966,072,331 instructions # 1.50 insn per cycle + 3.117079564 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 289) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index 8fcaa402a4..f307a0f66d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-18_23:34:26 +DATE: 2023-06-16_23:31:11 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.662259e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.218708e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.733644e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.936041e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.298657e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.700721e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.334692 sec - 4,639,793,652 cycles # 2.967 GHz - 7,129,897,248 instructions # 1.54 insn per cycle - 1.621833847 seconds time elapsed +TOTAL : 1.341897 sec + 4,726,189,877 cycles # 2.972 GHz + 6,969,421,973 instructions # 1.47 insn per cycle + 1.646460673 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.191263e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.421752e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.421752e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.210740e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.496812e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.496812e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.042662 sec - 18,302,456,063 cycles # 3.028 GHz - 40,527,734,047 instructions # 2.21 insn per cycle - 6.047705174 seconds time elapsed +TOTAL : 5.955030 sec + 18,512,766,741 cycles # 3.107 GHz + 41,194,403,666 instructions # 2.23 insn per cycle + 5.961394817 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -89,19 
+89,19 @@ Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.100324e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.996340e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.996340e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.047034e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.131905e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.131905e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.755609 sec - 11,392,550,668 cycles # 3.030 GHz - 24,688,239,473 instructions # 2.17 insn per cycle - 3.760743822 seconds time elapsed +TOTAL : 3.837203 sec + 11,777,572,402 cycles # 3.067 GHz + 25,355,656,397 instructions # 2.15 insn per cycle + 3.848753687 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1283) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.232926e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.836224e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.836224e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.018909e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.963202e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.963202e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.705588 sec - 7,975,874,619 cycles # 2.944 GHz - 13,582,310,666 instructions # 1.70 insn per cycle - 2.710967931 seconds time elapsed +TOTAL : 2.834606 sec + 8,590,828,376 cycles # 3.027 GHz + 14,249,676,242 instructions # 1.66 insn per cycle + 2.846836909 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.347476e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.167763e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.167763e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.112124e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.390077e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.390077e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.646547 sec - 7,739,527,075 cycles # 2.920 GHz - 13,072,506,268 instructions # 1.69 insn per cycle - 2.652061733 seconds time elapsed +TOTAL : 2.779947 sec + 8,413,366,977 cycles # 3.021 GHz + 13,755,530,918 instructions # 1.63 insn per cycle + 2.792283352 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.143547e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.489365e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.489365e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.941539e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.752941e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.752941e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.766665 sec - 7,042,639,216 cycles # 2.542 GHz - 9,862,282,848 instructions # 1.40 insn per cycle - 2.771823029 seconds time elapsed +TOTAL : 2.901859 sec + 7,689,553,685 cycles # 2.645 GHz + 10,538,700,131 instructions # 1.37 insn per cycle + 2.908219333 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 289) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index f6754ccae5..c1df1cffcd 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -35,22 +35,22 @@ 
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-18_23:31:33 +DATE: 2023-06-16_23:28:12 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.662856e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.245166e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.781222e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.961494e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.330926e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.727946e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.978310 sec - 3,590,915,972 cycles # 2.969 GHz - 7,100,949,878 instructions # 1.98 insn per cycle - 1.265914090 seconds time elapsed +TOTAL : 0.991651 sec + 3,630,538,921 cycles # 2.949 GHz + 6,739,620,599 instructions # 1.86 insn per cycle + 1.288650450 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.197592e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.423169e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.423169e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.207504e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.492388e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.492388e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.658752 sec - 17,157,501,803 cycles # 3.031 GHz - 40,423,247,048 instructions # 2.36 insn per cycle - 5.663798783 seconds time elapsed +TOTAL : 5.609999 sec + 17,377,959,437 cycles # 3.096 GHz + 41,067,120,156 instructions # 2.36 insn per cycle + 5.616601968 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.094129e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.984471e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.984471e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.061159e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.155823e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.155823e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.411637 sec - 10,290,438,965 cycles # 3.012 GHz - 24,682,963,306 instructions # 2.40 insn per cycle - 3.416785888 seconds time elapsed +TOTAL : 3.452926 sec + 10,661,248,447 cycles # 3.085 GHz + 25,328,629,608 instructions # 2.38 insn per cycle + 3.465344366 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1283) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.128020e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.666773e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.666773e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.992039e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.943088e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.943088e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.417697 sec - 6,930,904,304 cycles # 2.863 GHz - 13,677,888,049 instructions # 1.97 insn per cycle - 2.422886649 seconds time elapsed +TOTAL : 2.500278 sec + 7,490,696,881 cycles # 2.992 GHz + 14,324,115,086 instructions # 1.91 insn per cycle + 2.512588472 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.0277089312025782e-08 OK (relative 
difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.358143e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.166519e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.166519e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.099847e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.437862e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.437862e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.270607 sec - 6,657,046,197 cycles # 2.926 GHz - 13,381,817,915 instructions # 2.01 insn per cycle - 2.276064035 seconds time elapsed +TOTAL : 2.420313 sec + 7,280,482,659 cycles # 3.003 GHz + 14,031,142,533 instructions # 1.93 insn per cycle + 2.426602182 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.165935e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.563823e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.563823e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.913808e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.700903e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.700903e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.391095 sec - 5,900,007,770 cycles # 2.464 GHz - 10,160,901,024 instructions # 1.72 insn per cycle - 2.396646480 seconds time elapsed +TOTAL : 2.559114 sec + 6,555,384,580 cycles # 2.558 GHz + 10,814,650,468 instructions # 1.65 insn per cycle + 2.565400113 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 289) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 2691433432..6e1b117ddd 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ 
b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -35,23 +35,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-18_23:28:38 +DATE: 2023-06-16_23:25:12 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.016534e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.173006e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.612395e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.189825e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.289841e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.647085e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.898785 sec - 6,257,438,480 cycles # 2.941 GHz - 11,386,458,619 instructions # 1.82 insn per cycle - 2.187295422 seconds time elapsed +TOTAL : 2.074835 sec + 6,914,545,522 cycles # 2.988 GHz + 11,919,156,975 instructions # 1.72 insn per cycle + 2.372897157 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 @@ -65,19 +65,19 @@ Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.194169e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.419402e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.419402e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.202902e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.488452e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.488452e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.674598 sec - 17,158,940,169 cycles # 3.023 GHz - 40,423,706,090 instructions # 2.36 insn per cycle - 5.679802244 seconds time elapsed +TOTAL : 5.632592 sec + 17,383,031,697 cycles # 3.085 GHz + 41,067,827,351 instructions # 2.36 insn per cycle + 5.638706459 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -91,19 +91,19 @@ Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.101127e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.993080e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.993080e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.068552e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.168923e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.168923e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.399462 sec - 10,311,897,646 cycles # 3.031 GHz - 24,682,892,190 instructions # 2.39 insn per cycle - 3.404932887 seconds time elapsed +TOTAL : 3.443621 sec + 10,653,431,550 cycles # 3.090 GHz + 25,328,207,820 instructions # 2.38 insn per cycle + 3.459263800 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1283) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -117,19 +117,19 @@ Relative difference = 1.0277102699700292e-08 OK (relative difference <= 
5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.251403e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.859944e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.859944e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.993581e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.951966e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.951966e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.331865 sec - 6,864,953,180 cycles # 2.939 GHz - 13,677,506,771 instructions # 1.99 insn per cycle - 2.336973372 seconds time elapsed +TOTAL : 2.497810 sec + 7,472,614,101 cycles # 2.988 GHz + 14,326,081,552 instructions # 1.92 insn per cycle + 2.513276024 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -143,19 +143,19 @@ Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.394665e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.273811e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.273811e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.123387e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.389805e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.389805e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.250085 sec - 6,619,521,580 cycles # 2.937 GHz - 13,370,614,673 instructions # 2.02 insn per cycle - 2.255242606 seconds time elapsed +TOTAL : 2.406381 sec + 7,220,335,602 cycles # 2.995 GHz + 14,031,141,989 instructions # 1.94 insn per cycle + 2.412823702 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -169,19 +169,19 @@ Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe 
-p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.140322e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.487494e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.487494e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.914294e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.704937e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.704937e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.405074 sec - 5,909,857,512 cycles # 2.453 GHz - 10,161,683,132 instructions # 1.72 insn per cycle - 2.410495582 seconds time elapsed +TOTAL : 2.563138 sec + 6,557,308,012 cycles # 2.558 GHz + 10,815,146,087 instructions # 1.65 insn per cycle + 2.575443013 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 289) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 4d453aecee..bd06bd6ba5 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-18_22:37:49 +DATE: 2023-06-16_22:48:51 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.573344e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.403737e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.093659e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.643941e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.489939e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.087323e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.706305 sec - 2,778,015,010 cycles # 2.947 GHz - 4,218,331,761 instructions # 1.52 insn per cycle - 1.008727837 seconds time elapsed +TOTAL : 0.690482 sec + 2,734,951,372 cycles # 2.921 GHz + 3,869,487,763 instructions # 1.41 insn per cycle + 1.001478108 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 118 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 1.027708011645137e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.200514e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.426217e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.426217e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.201121e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.486695e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.486695e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.643058 sec - 17,154,898,222 cycles # 3.038 GHz - 40,369,035,860 instructions # 2.35 insn per cycle - 5.649770482 seconds time elapsed +TOTAL : 5.640974 sec + 17,377,915,107 cycles # 3.080 GHz + 41,019,735,572 instructions # 2.36 insn per cycle + 5.647089750 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 362) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 
1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.091169e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.980266e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.980266e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.055579e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.158330e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.158330e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.416079 sec - 10,310,582,032 cycles # 3.014 GHz - 24,644,581,740 instructions # 2.39 insn per cycle - 3.422889711 seconds time elapsed +TOTAL : 3.461985 sec + 10,655,685,663 cycles # 3.075 GHz + 25,289,974,301 instructions # 2.37 insn per cycle + 3.474152122 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1270) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.143094e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.628947e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.628947e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.953920e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.887932e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.887932e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.411210 sec - 6,890,682,089 cycles # 2.851 GHz - 13,652,338,335 instructions # 1.98 insn per cycle - 2.418474908 seconds time elapsed +TOTAL : 2.529478 sec + 7,488,504,684 cycles # 2.955 GHz + 14,297,973,959 instructions # 1.91 insn per cycle + 2.535527552 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1043) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.0277088906338675e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.360833e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.207381e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.207381e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.063577e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.335894e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.335894e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.268507 sec - 6,671,924,337 cycles # 2.934 GHz - 13,357,349,040 instructions # 2.00 insn per cycle - 2.275507029 seconds time elapsed +TOTAL : 2.447618 sec + 7,308,425,300 cycles # 2.980 GHz + 14,017,785,626 instructions # 1.92 insn per cycle + 2.453901778 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1004) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 1.0277088906338675e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.271910e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.888847e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.888847e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.005001e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.053828e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.053828e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.325808 sec - 5,802,426,408 cycles # 2.490 GHz - 10,040,622,291 instructions # 1.73 insn per cycle - 2.332346097 seconds time elapsed +TOTAL : 2.492808 sec + 6,433,917,466 cycles # 2.577 GHz + 10,696,732,836 instructions # 1.66 insn per cycle + 2.504690032 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 267) (512y: 0) (512z: 663) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 565e7311af..4005df7354 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: 
Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-07-18_23:12:58
+DATE: 2023-06-16_23:08:58
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.619901e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.240858e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.771999e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.934597e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.349196e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.751072e+08 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.686369 sec
- 2,716,938,611 cycles # 2.937 GHz
- 4,035,248,213 instructions # 1.49 insn per cycle
- 0.984077819 seconds time elapsed
+TOTAL : 0.703650 sec
+ 2,783,628,482 cycles # 2.937 GHz
+ 3,817,522,930 instructions # 1.37 insn per cycle
+ 1.006043909 seconds time elapsed
runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1
==PROF== Profiling "sigmaKin": launch__registers_per_thread 150
==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -63,19 +63,19 @@ Relative difference = 1.0277080522138477e-08
OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.794717e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.454295e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.454295e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.627042e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.555043e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.555043e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 2.659170 sec
- 7,973,174,973 cycles # 2.994 GHz
- 16,667,372,696 instructions # 2.09 insn per cycle
- 2.664809953 seconds time elapsed
+TOTAL : 2.809092 sec
+ 8,529,730,783 cycles # 3.037 GHz
+ 17,314,493,447 instructions # 2.03 insn per cycle
+ 2.815992048 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 206) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe
@@ -89,19 +89,19 @@ Relative difference = 1.0277102699700292e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.727236e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.881600e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.881600e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.394827e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.052489e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.052489e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 2.088301 sec
- 6,353,709,417 cycles # 3.036 GHz
- 12,770,191,691 instructions # 2.01 insn per cycle
- 2.093883449 seconds time elapsed
+TOTAL : 2.257313 sec
+ 6,891,555,509 cycles # 3.048 GHz
+ 13,420,618,567 instructions # 1.95 insn per cycle
+ 2.263346481 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 809) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe
@@ -115,19 +115,19 @@ Relative difference = 1.0277102699700292e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.497330e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.177920e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.177920e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.953793e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.186747e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.186747e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 1.795633 sec
- 5,363,007,772 cycles # 2.979 GHz
- 9,799,409,716 instructions # 1.83 insn per cycle
- 1.801392342 seconds time elapsed
+TOTAL : 1.994156 sec
+ 5,944,600,029 cycles # 2.973 GHz
+ 10,446,540,517 instructions # 1.76 insn per cycle
+ 2.006757717 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 460) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe
@@ -141,19 +141,19 @@ Relative difference = 1.0277089312025782e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.631669e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.299199e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.299199e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.165108e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.323250e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.323250e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 1.756078 sec
- 5,218,021,073 cycles # 2.964 GHz
- 9,661,166,589 instructions # 1.85 insn per cycle
- 1.761517181 seconds time elapsed
+TOTAL : 1.906328 sec
+ 5,747,679,195 cycles # 3.008 GHz
+ 10,324,227,604 instructions # 1.80 insn per cycle
+ 1.912748843 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 435) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe
@@ -167,19 +167,19 @@ Relative difference = 1.0277089312025782e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.226821e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.881135e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.881135e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.709510e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.834439e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.834439e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 1.889106 sec
- 4,991,885,143 cycles # 2.638 GHz
- 8,694,348,991 instructions # 1.74 insn per cycle
- 1.894628100 seconds time elapsed
+TOTAL : 2.101712 sec
+ 5,645,784,770 cycles # 2.680 GHz
+ 9,348,170,561 instructions # 1.66 insn per cycle
+ 2.114432099 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 221) (512y: 0) (512z: 276)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
index 09c8b4ac31..bc4e48c353 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
@@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-07-18_23:13:20
+DATE: 2023-06-16_23:09:22
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.724285e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.937249e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.086399e+09 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.022137e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.034901e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.081788e+09 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.674219 sec
- 2,690,679,134 cycles # 2.960 GHz
- 4,101,520,560 instructions # 1.52 insn per cycle
- 0.968706738 seconds time elapsed
+TOTAL : 0.701380 sec
+ 2,726,534,804 cycles # 2.886 GHz
+ 3,786,125,853 instructions # 1.39 insn per cycle
+ 1.004213223 seconds time elapsed
runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1
==PROF== Profiling "sigmaKin": launch__registers_per_thread 118
==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -63,19 +63,19 @@ Relative difference = 1.027708011645137e-08
OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.820226e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.697359e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.697359e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.510910e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.966131e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.966131e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 2.052669 sec
- 6,223,903,279 cycles # 3.025 GHz
- 12,927,789,626 instructions # 2.08 insn per cycle
- 2.058229844 seconds time elapsed
+TOTAL : 2.194238 sec
+ 6,733,882,413 cycles # 3.062 GHz
+ 13,573,888,806 instructions # 2.02 insn per cycle
+ 2.200295951 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 176) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe
@@ -89,19 +89,19 @@ Relative difference = 1.0277102294013186e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.347316e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.157252e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.157252e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.963639e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.196983e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.196983e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 1.858729 sec
- 5,557,210,569 cycles # 2.981 GHz
- 10,775,548,998 instructions # 1.94 insn per cycle
- 1.864742495 seconds time elapsed
+TOTAL : 1.987100 sec
+ 6,108,038,272 cycles # 3.066 GHz
+ 11,421,394,512 instructions # 1.87 insn per cycle
+ 1.998910641 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 609) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe
@@ -115,19 +115,19 @@ Relative difference = 1.0277102294013186e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.783880e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.401463e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.401463e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.184988e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.401255e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.401255e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 1.713600 sec
- 5,108,676,399 cycles # 2.973 GHz
- 9,109,861,348 instructions # 1.78 insn per cycle
- 1.719071566 seconds time elapsed
+TOTAL : 1.904581 sec
+ 5,674,452,757 cycles # 2.972 GHz
+ 9,756,418,512 instructions # 1.72 insn per cycle
+ 1.916141098 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 365) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe
@@ -141,19 +141,19 @@ Relative difference = 1.0277088906338675e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.121890e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.674799e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.674799e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.421355e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.712475e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.712475e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 1.623052 sec
- 4,855,321,150 cycles # 2.984 GHz
- 9,083,995,305 instructions # 1.87 insn per cycle
- 1.628469471 seconds time elapsed
+TOTAL : 1.818113 sec
+ 5,526,907,542 cycles # 3.034 GHz
+ 9,745,448,132 instructions # 1.76 insn per cycle
+ 1.824604908 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 356) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe
@@ -167,19 +167,19 @@ Relative difference = 1.0277088906338675e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.385628e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.089039e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.089039e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.924922e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.119550e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.119550e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 1.835801 sec
- 4,894,193,139 cycles # 2.659 GHz
- 8,406,937,984 instructions # 1.72 insn per cycle
- 1.841151608 seconds time elapsed
+TOTAL : 2.000527 sec
+ 5,531,314,603 cycles # 2.758 GHz
+ 9,060,806,562 instructions # 1.64 insn per cycle
+ 2.012991860 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 189) (512y: 0) (512z: 227)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
index a5ef853643..998d7298b5 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
@@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-07-18_22:38:16
+DATE: 2023-06-16_22:49:20
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=2, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.098638e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.184721e+09 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.780918e+09 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.622816e+08 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.472400e+09 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.786994e+09 ) sec^-1
MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0
-TOTAL : 0.589536 sec
- 2,402,233,552 cycles # 2.943 GHz
- 3,672,855,124 instructions # 1.53 insn per cycle
- 0.878253042 seconds time elapsed
+TOTAL : 0.583321 sec
+ 2,430,104,692 cycles # 2.932 GHz
+ 3,363,264,752 instructions # 1.38 insn per cycle
+ 0.886216701 seconds time elapsed
runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1
==PROF== Profiling "sigmaKin": launch__registers_per_thread 96
==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -58,24 +58,24 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 1.282802e-02
-Avg ME (F77/CUDA) = 1.2828112108763889E-002
-Relative difference = 7.180279099086847e-06
+Avg ME (F77/CUDA) = 1.2828112026909366E-002
+Relative difference = 7.173898182689807e-06
OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=6, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.216373e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.458660e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.458660e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.264259e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.523188e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.523188e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0
-TOTAL : 5.536943 sec
- 16,849,697,016 cycles # 3.042 GHz
- 40,090,354,034 instructions # 2.38 insn per cycle
- 5.543711092 seconds time elapsed
+TOTAL : 5.337690 sec
+ 16,489,915,021 cycles # 3.088 GHz
+ 40,104,655,673 instructions # 2.43 insn per cycle
+ 5.343808523 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 368) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
@@ -89,19 +89,19 @@ Relative difference = 1.500049293219082e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=6, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.140810e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.847975e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.847975e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.262321e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.200988e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.200988e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0
-TOTAL : 2.367678 sec
- 7,192,962,805 cycles # 3.034 GHz
- 16,727,490,778 instructions # 2.33 insn per cycle
- 2.374359353 seconds time elapsed
+TOTAL : 2.286472 sec
+ 7,103,303,250 cycles # 3.101 GHz
+ 16,746,623,366 instructions # 2.36 insn per cycle
+ 2.292241843 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 1363) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe
@@ -115,19 +115,19 @@ Relative difference = 3.8113554068418534e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=4, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.589522e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.217101e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.217101e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.581033e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.234851e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.234851e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0
-TOTAL : 1.730753 sec
- 5,165,936,826 cycles # 2.979 GHz
- 10,630,035,742 instructions # 2.06 insn per cycle
- 1.737116155 seconds time elapsed
+TOTAL : 1.728168 sec
+ 5,223,225,548 cycles # 3.015 GHz
+ 10,646,468,952 instructions # 2.04 insn per cycle
+ 1.739591338 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1140) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe
@@ -141,19 +141,19 @@ Relative difference = 2.5306003563303186e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=4, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.709413e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.299300e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.299300e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.727128e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.320782e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.320782e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0
-TOTAL : 1.696753 sec
- 5,075,364,220 cycles # 2.984 GHz
- 10,482,055,984 instructions # 2.07 insn per cycle
- 1.703463309 seconds time elapsed
+TOTAL : 1.685516 sec
+ 5,127,500,119 cycles # 3.035 GHz
+ 10,500,102,407 instructions # 2.05 insn per cycle
+ 1.698440130 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1092) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe
@@ -167,19 +167,19 @@ Relative difference = 2.5306003563303186e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=4, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.507095e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.144049e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.144049e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.545044e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.174805e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.174805e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0
-TOTAL : 1.759221 sec
- 4,700,239,186 cycles # 2.667 GHz
- 8,928,497,530 instructions # 1.90 insn per cycle
- 1.766112379 seconds time elapsed
+TOTAL : 1.747234 sec
+ 4,737,364,431 cycles # 2.712 GHz
+ 8,948,917,615 instructions # 1.89 insn per cycle
+ 1.753348775 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 408) (512y: 0) (512z: 710)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
index 91a6918e52..eea602cb6a 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
@@ -35,9 +35,9 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-07-18_23:22:57
+DATE: 2023-06-16_23:19:22
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP=
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
@@ -51,17 +51,17 @@ WARNING! flagging abnormal ME for ievt=66427
WARNING! flagging abnormal ME for ievt=465318
WARNING! flagging abnormal ME for ievt=458848
WARNING! flagging abnormal ME for ievt=247522
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=7, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.181850e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.493425e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.493425e+07 ) sec^-1
-MeanMatrixElemValue = ( 1.371709e-02 +- 3.270385e-06 ) GeV^0
-TOTAL : 1.690119 sec
- 5,683,014,469 cycles # 2.962 GHz
- 10,259,149,546 instructions # 1.81 insn per cycle
- 1.975831258 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 7.151373e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.051120e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.051120e+07 ) sec^-1
+MeanMatrixElemValue = ( 1.371709e-02 +- 3.270386e-06 ) GeV^0
+TOTAL : 1.716284 sec
+ 5,788,689,598 cycles # 2.965 GHz
+ 10,066,532,076 instructions # 1.74 insn per cycle
+ 2.010969778 seconds time elapsed
runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
==PROF== Profiling "sigmaKin": launch__registers_per_thread 96
@@ -73,8 +73,8 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 1.282802e-02
-Avg ME (F77/CUDA) = 1.2828112108763889E-002
-Relative difference = 7.180279099086847e-06
+Avg ME (F77/CUDA) = 1.2828112026909366E-002
+Relative difference = 7.173898182689807e-06
OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP=
@@ -85,19 +85,19 @@ WARNING! flagging abnormal ME for ievt=152898
WARNING! flagging abnormal ME for ievt=66427
WARNING! flagging abnormal ME for ievt=164749
WARNING! flagging abnormal ME for ievt=247522
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=6, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.184677e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.415360e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.415360e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.231634e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.480435e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.480435e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0
-TOTAL : 5.784130 sec
- 17,482,262,914 cycles # 3.020 GHz
- 40,239,314,516 instructions # 2.30 insn per cycle
- 5.790365098 seconds time elapsed
+TOTAL : 5.578840 sec
+ 17,225,374,898 cycles # 3.086 GHz
+ 40,276,553,142 instructions # 2.34 insn per cycle
+ 5.585821624 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 368) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
@@ -118,19 +118,19 @@ WARNING! flagging abnormal ME for ievt=152898
WARNING! flagging abnormal ME for ievt=66427
WARNING! flagging abnormal ME for ievt=164749
WARNING! flagging abnormal ME for ievt=247522
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=6, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.934835e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.216064e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.216064e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.937007e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.345782e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.345782e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0
-TOTAL : 2.629008 sec
- 7,923,607,448 cycles # 3.009 GHz
- 18,063,565,877 instructions # 2.28 insn per cycle
- 2.635314179 seconds time elapsed
+TOTAL : 2.639278 sec
+ 7,955,090,673 cycles # 3.008 GHz
+ 18,082,112,251 instructions # 2.27 insn per cycle
+ 2.646477973 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 1363) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe
@@ -149,19 +149,19 @@ WARNING! flagging abnormal ME for ievt=53874
WARNING! flagging abnormal ME for ievt=66427
WARNING! flagging abnormal ME for ievt=164749
WARNING! flagging abnormal ME for ievt=247522
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=4, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.187074e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.897049e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.897049e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.189753e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.014027e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.014027e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0
-TOTAL : 1.984011 sec
- 5,882,038,038 cycles # 2.956 GHz
- 11,750,113,647 instructions # 2.00 insn per cycle
- 1.990516269 seconds time elapsed
+TOTAL : 1.979778 sec
+ 5,995,131,291 cycles # 3.020 GHz
+ 11,763,127,657 instructions # 1.96 insn per cycle
+ 1.992584680 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1140) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe
@@ -180,19 +180,19 @@ WARNING! flagging abnormal ME for ievt=53874
WARNING! flagging abnormal ME for ievt=66427
WARNING! flagging abnormal ME for ievt=164749
WARNING! flagging abnormal ME for ievt=247522
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=4, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.292699e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.044668e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.044668e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.319958e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.080076e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.080076e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0
-TOTAL : 1.947459 sec
- 5,799,609,939 cycles # 2.970 GHz
- 11,601,900,460 instructions # 2.00 insn per cycle
- 1.953545312 seconds time elapsed
+TOTAL : 1.934129 sec
+ 5,894,246,896 cycles # 3.038 GHz
+ 11,620,616,390 instructions # 1.97 insn per cycle
+ 1.941396703 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1092) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe
@@ -211,19 +211,19 @@ WARNING! flagging abnormal ME for ievt=53874
WARNING! flagging abnormal ME for ievt=66427
WARNING! flagging abnormal ME for ievt=164749
WARNING! flagging abnormal ME for ievt=247522
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=4, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.075696e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.092017e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.092017e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.141992e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.375055e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.375055e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0
-TOTAL : 2.023993 sec
- 5,468,734,972 cycles # 2.695 GHz
- 10,135,139,968 instructions # 1.85 insn per cycle
- 2.029970771 seconds time elapsed
+TOTAL : 1.999503 sec
+ 5,512,078,856 cycles # 2.749 GHz
+ 10,155,647,564 instructions # 1.84 insn per cycle
+ 2.016503539 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 408) (512y: 0) (512z: 710)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
index bbb093b510..800a4b8c86 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
@@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-07-18_23:34:56
+DATE: 2023-06-16_23:31:42
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.336297e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.304968e+09 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.746760e+09 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.394086e+08 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.330155e+09 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.707146e+09 ) sec^-1
MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0
-TOTAL : 1.172878 sec
- 4,104,924,338 cycles # 2.941 GHz
- 6,526,677,484 instructions # 1.59 insn per cycle
- 1.452215524 seconds time elapsed
+TOTAL : 1.184633 sec
+ 4,183,254,985 cycles # 2.960 GHz
+ 6,293,699,635 instructions # 1.50 insn per cycle
+ 1.472409896 seconds time elapsed
runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common
==PROF== Profiling "sigmaKin": launch__registers_per_thread 96
==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -58,24 +58,24 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 1.282802e-02
-Avg ME (F77/CUDA) = 1.2828112108763889E-002
-Relative difference = 7.180279099086847e-06
+Avg ME (F77/CUDA) = 1.2828112026909366E-002
+Relative difference = 7.173898182689807e-06
OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=4, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.207988e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.449886e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.449886e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.262596e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.522021e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.522021e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371887e-02 +- 3.270265e-06 ) GeV^0
-TOTAL : 5.909710 sec
- 17,837,688,263 cycles # 3.016 GHz
- 40,273,572,845 instructions # 2.26 insn per cycle
- 5.915116440 seconds time elapsed
+TOTAL : 5.659502 sec
+ 17,477,401,058 cycles # 3.088 GHz
+ 40,266,459,883 instructions # 2.30 insn per cycle
+ 5.665157466 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 368) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
@@ -89,19 +89,19 @@ Relative difference = 1.500049293219082e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=4, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.046412e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.668762e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.668762e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.236448e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.142806e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.142806e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371887e-02 +- 3.270265e-06 ) GeV^0
-TOTAL : 2.778407 sec
- 8,186,545,146 cycles # 2.942 GHz
- 16,814,006,667 instructions # 2.05 insn per cycle
- 2.783352640 seconds time elapsed
+TOTAL : 2.618721 sec
+ 8,103,544,055 cycles # 3.089 GHz
+ 16,832,855,817 instructions # 2.08 insn per cycle
+ 2.624569274 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 1363) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe
@@ -115,19 +115,19 @@ Relative difference = 3.8113554068418534e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=4, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.494077e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.185801e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.185801e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.539437e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.230738e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.230738e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371885e-02 +- 3.270110e-06 ) GeV^0
-TOTAL : 2.084718 sec
- 6,169,848,657 cycles # 2.955 GHz
- 10,542,439,223 instructions # 1.71 insn per cycle
- 2.090056514 seconds time elapsed
+TOTAL : 2.059577 sec
+ 6,252,474,158 cycles # 3.029 GHz
+ 10,562,743,863 instructions # 1.69 insn per cycle
+ 2.071952790 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1140) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe
@@ -141,19 +141,19 @@ Relative difference = 2.5306003563303186e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=4, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.635871e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.276211e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.276211e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.617687e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.289170e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.289170e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371885e-02 +- 3.270110e-06 ) GeV^0
-TOTAL : 2.048669 sec
- 6,121,871,406 cycles # 2.982 GHz
- 10,187,558,774 instructions # 1.66 insn per cycle
- 2.053957847 seconds time elapsed
+TOTAL : 2.052194 sec
+ 6,160,094,141 cycles # 2.996 GHz
+ 10,211,164,443 instructions # 1.66 insn per cycle
+ 2.069184232 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1092) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe
@@ -167,19 +167,19 @@ Relative difference = 2.5306003563303186e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=4, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.477590e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.132710e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.132710e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.509722e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.158916e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.158916e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371884e-02 +- 3.270111e-06 ) GeV^0
-TOTAL : 2.101013 sec
- 5,715,581,929 cycles # 2.715 GHz
- 8,638,754,232 instructions # 1.51 insn per cycle
- 2.105949730 seconds time elapsed
+TOTAL : 2.079386 sec
+ 5,756,219,942 cycles # 2.763 GHz
+ 8,663,173,729 instructions # 1.51 insn per cycle
+ 2.085435176 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 408) (512y: 0) (512z: 710)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt
index 8ef12c26ca..440dcefee1 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt
@@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-07-18_23:32:01
+DATE: 2023-06-16_23:28:42
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=2, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.334789e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.313989e+09 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.795467e+09 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.402219e+08 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.355346e+09 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.784726e+09 ) sec^-1
MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0
-TOTAL : 0.849666 sec
- 3,175,772,104 cycles # 2.963 GHz
- 6,432,890,363 instructions # 2.03 insn per cycle
- 1.129537023 seconds time elapsed
+TOTAL : 0.867299 sec
+ 3,217,543,414 cycles # 2.926 GHz
+ 6,137,693,119 instructions # 1.91 insn per cycle
+ 1.157361320 seconds time elapsed
runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst
==PROF== Profiling "sigmaKin": launch__registers_per_thread 96
==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -58,24 +58,24 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 1.282802e-02
-Avg ME (F77/CUDA) = 1.2828112108763889E-002
-Relative difference = 7.180279099086847e-06
+Avg ME (F77/CUDA) = 1.2828112026909366E-002
+Relative difference = 7.173898182689807e-06
OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=6, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.193842e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.431726e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.431726e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.257010e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.513861e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.513861e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0
-TOTAL : 5.641756 sec
- 16,854,623,313 cycles # 2.986 GHz
- 40,092,655,634 instructions # 2.38 insn per cycle
- 5.647077748 seconds time elapsed
+TOTAL : 5.368758 sec
+ 16,502,939,001 cycles # 3.072 GHz
+ 40,105,024,525 instructions # 2.43 insn per cycle
+ 5.374937445 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 368) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
@@ -89,19 +89,19 @@ Relative difference = 1.500049293219082e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=6, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.125387e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.853813e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.853813e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.242270e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.146410e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.146410e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0
-TOTAL : 2.379891 sec
- 7,204,257,236 cycles # 3.021 GHz
- 16,730,694,588 instructions # 2.32 insn per cycle
- 2.385210316 seconds time elapsed
+TOTAL : 2.303063 sec
+ 7,094,904,869 cycles # 3.075 GHz
+ 16,746,800,277 instructions # 2.36 insn per cycle
+ 2.314822596 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 1363) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe
@@ -115,19 +115,19 @@ Relative difference = 3.8113554068418534e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=4, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.519803e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.207170e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.207170e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.586979e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.231913e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.231913e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0
-TOTAL : 1.750842 sec
- 5,196,697,084 cycles # 2.961 GHz
- 10,630,011,228 instructions # 2.05 insn per cycle
- 1.755747802 seconds time elapsed
+TOTAL : 1.807105 sec
+ 5,428,964,238 cycles # 2.996 GHz
+ 10,692,005,584 instructions # 1.97 insn per cycle
+ 1.822790793 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1140) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe
@@ -141,19 +141,19 @@ Relative difference = 2.5306003563303186e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=4, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.650477e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.286008e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.286008e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.726711e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.329947e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.329947e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0
-TOTAL : 1.711767 sec
- 5,070,226,849 cycles # 2.955 GHz
- 10,472,873,926 instructions # 2.07 insn per cycle
- 1.716818170 seconds time elapsed
+TOTAL : 1.687490 sec
+ 5,097,137,843 cycles # 3.011 GHz
+ 10,500,104,402 instructions # 2.06 insn per cycle
+ 1.704174033 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1092) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe
@@ -167,19 +167,19 @@ Relative difference = 2.5306003563303186e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=4, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.489878e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.142603e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.142603e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.531072e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.167602e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.167602e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0
-TOTAL : 1.761077 sec
- 4,702,427,717 cycles # 2.664 GHz
- 8,928,587,643 instructions # 1.90 insn per cycle
- 1.766311452 seconds time elapsed
+TOTAL : 1.748105 sec
+ 4,711,439,440 cycles # 2.688 GHz
+ 8,948,315,074 instructions # 1.90 insn per cycle
+ 1.760555512 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 408) (512y: 0) (512z: 710)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt
index aac2c3c1e7..6541a30c4f 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt
@@ -35,23 +35,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-07-18_23:29:07
+DATE: 2023-06-16_23:25:42
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP=
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=7, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.089458e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.263743e+09 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.592799e+09 ) sec^-1
-MeanMatrixElemValue = ( 1.371709e-02 +- 3.270385e-06 ) GeV^0
-TOTAL : 1.472978 sec
- 5,051,901,601 cycles # 2.981 GHz
- 9,151,755,294 instructions # 1.81 insn per cycle
- 1.751826648 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 9.106073e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.321114e+09 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.631808e+09 ) sec^-1
+MeanMatrixElemValue = ( 1.371709e-02 +- 3.270386e-06 ) GeV^0
+TOTAL : 1.491290 sec
+ 5,169,582,957 cycles # 2.984 GHz
+ 9,050,754,175 instructions # 1.75 insn per cycle
+ 1.789532913 seconds time elapsed
runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
==PROF== Profiling "sigmaKin": launch__registers_per_thread 96
@@ -60,24 +60,24 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 1.282802e-02
-Avg ME (F77/CUDA) = 1.2828112108763889E-002
-Relative difference = 7.180279099086847e-06
+Avg ME (F77/CUDA) = 1.2828112026909366E-002
+Relative difference = 7.173898182689807e-06
OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=6, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.205288e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.445391e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.445391e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.266280e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.526041e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.526041e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0
-TOTAL : 5.588347 sec
- 16,838,368,532 cycles # 3.013 GHz
- 40,090,617,591 instructions # 2.38 insn per cycle
- 5.593133320 seconds time elapsed
+TOTAL : 5.330040 sec
+ 16,494,480,958 cycles # 3.092 GHz
+ 40,104,668,257 instructions # 2.43 insn per cycle
+ 5.336402582 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 368) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
@@ -91,19 +91,19 @@ Relative difference = 1.500049293219082e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP=
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=6, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.133712e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.821935e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.821935e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.242421e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.129823e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.129823e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0
-TOTAL : 2.373647 sec
- 7,180,844,252 cycles # 3.020 GHz
- 16,730,848,835 instructions # 2.33 insn per cycle
- 2.378841839 seconds time elapsed
+TOTAL : 2.301776 sec
+ 7,083,962,972 cycles # 3.073 GHz
+ 16,746,671,271 instructions # 2.36 insn per cycle
+ 2.307495453 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 1363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -117,19 +117,19 @@ Relative difference = 3.8113554068418534e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.550598e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.196746e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.196746e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.610071e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.237721e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.237721e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.739609 sec - 5,166,190,440 cycles # 2.964 GHz - 10,630,180,346 instructions # 2.06 insn per cycle - 1.744861725 seconds time elapsed +TOTAL : 1.717055 sec + 5,214,546,292 cycles # 3.028 GHz + 10,646,475,199 instructions # 2.04 insn per cycle + 1.729853327 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1140) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -143,19 +143,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.673596e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.294703e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.294703e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.669054e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.286554e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.286554e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.707901 sec - 5,075,943,253 cycles # 2.966 GHz - 10,482,206,390 instructions # 2.07 insn per cycle - 1.713064431 seconds time elapsed +TOTAL : 1.706696 sec + 5,116,265,885 cycles # 2.990 GHz + 10,500,169,607 instructions # 2.05 insn per cycle + 1.718833211 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1092) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -169,19 +169,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.492142e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.134522e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.134522e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.528217e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.171325e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.171325e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.762628 sec - 4,694,313,362 cycles # 2.657 GHz - 8,928,042,352 instructions # 1.90 insn per cycle - 1.767978454 seconds time elapsed +TOTAL : 1.745381 sec + 4,755,213,854 cycles # 2.718 GHz + 8,948,305,241 instructions # 1.88 insn per cycle + 1.757628494 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 408) (512y: 0) (512z: 710) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index 9790783d8d..278cecd3e4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-18_22:38:41 +DATE: 2023-06-16_22:49:45 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=2, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.100412e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.209842e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.918242e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.626450e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.505770e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.909932e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0 -TOTAL : 0.587563 sec - 2,397,239,442 cycles # 2.943 GHz - 3,693,800,031 instructions # 1.54 insn per cycle - 0.877655792 seconds time elapsed +TOTAL : 0.584101 sec + 2,373,467,870 cycles # 2.893 GHz + 3,334,548,453 instructions # 1.40 insn per cycle + 0.877449920 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 80 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,24 +58,24 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112108763889E-002 -Relative difference = 7.180279099086847e-06 +Avg ME (F77/CUDA) = 1.2828112026909366E-002 +Relative difference = 7.173898182689807e-06 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.202926e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.442967e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.442967e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.259578e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.517908e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.517908e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 5.597963 sec - 16,833,526,961 cycles # 3.004 GHz - 
40,039,819,397 instructions # 2.38 insn per cycle - 5.604856115 seconds time elapsed +TOTAL : 5.356459 sec + 16,505,080,474 cycles # 3.080 GHz + 40,054,237,440 instructions # 2.43 insn per cycle + 5.362931581 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 351) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 1.500049293219082e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.149831e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.849943e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.849943e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.233010e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.140192e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.140192e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.363324 sec - 7,193,211,431 cycles # 3.038 GHz - 16,654,791,239 instructions # 2.32 insn per cycle - 2.369899698 seconds time elapsed +TOTAL : 2.305928 sec + 7,093,977,561 cycles # 3.071 GHz + 16,670,395,724 instructions # 2.35 insn per cycle + 2.317849748 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1338) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.8113554068418534e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.535444e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.206818e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.206818e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.492331e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.196200e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.196200e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.745111 sec - 5,193,548,855 cycles # 2.969 GHz - 10,616,753,512 instructions # 2.04 insn per cycle - 1.751486110 seconds time elapsed +TOTAL : 1.759548 sec + 5,233,152,516 cycles # 2.967 GHz + 10,634,018,962 instructions # 2.03 insn per cycle + 1.771464615 seconds time elapsed 
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1110) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.705086e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.300457e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.300457e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.707001e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.313255e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.313255e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.696618 sec - 5,074,104,058 cycles # 2.982 GHz - 10,475,028,207 instructions # 2.06 insn per cycle - 1.703324719 seconds time elapsed +TOTAL : 1.696242 sec + 5,111,285,292 cycles # 3.005 GHz + 10,493,325,522 instructions # 2.05 insn per cycle + 1.711751099 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1062) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.539230e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.200229e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.200229e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.642010e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.241108e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.241108e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.753059 sec - 4,625,386,034 cycles # 2.633 GHz - 8,857,420,373 instructions # 1.91 insn per cycle - 1.759570802 seconds time elapsed +TOTAL : 1.712084 sec + 4,653,252,953 cycles # 2.711 GHz + 8,877,573,860 instructions # 1.91 insn per cycle + 1.718230554 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 377) (512y: 0) (512z: 678) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 5092bfd385..212456d513 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-18_23:13:41 +DATE: 2023-06-16_23:09:44 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=2, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.320916e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.314255e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.785092e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.391928e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.360941e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.806903e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0 -TOTAL : 0.575146 sec - 2,379,081,623 cycles # 2.948 GHz - 3,617,069,651 instructions # 1.52 insn per cycle - 0.863850288 seconds time elapsed +TOTAL : 0.595523 sec + 2,413,044,519 cycles # 2.909 GHz + 3,371,780,308 instructions # 1.40 insn per cycle + 0.887688074 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 96 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,24 +58,24 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112108763889E-002 -Relative difference = 7.180279099086847e-06 +Avg ME (F77/CUDA) = 1.2828112026909366E-002 +Relative difference = 7.173898182689807e-06 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, 
zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.990930e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.047464e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.047464e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.031021e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.106428e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.106428e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.470211 sec - 7,431,999,027 cycles # 3.003 GHz - 16,617,479,109 instructions # 2.24 insn per cycle - 2.475635291 seconds time elapsed +TOTAL : 2.437957 sec + 7,408,174,050 cycles # 3.033 GHz + 16,633,646,919 instructions # 2.25 insn per cycle + 2.444238664 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 226) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 1.4858695011109669e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.220042e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.202891e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.202891e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.503700e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.305405e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.305405e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 1.872741 sec - 5,421,090,473 cycles # 2.889 GHz - 11,166,873,964 instructions # 2.06 insn per cycle - 1.878025127 seconds time elapsed +TOTAL : 1.757246 sec + 5,418,067,590 cycles # 3.076 GHz + 11,183,088,134 instructions # 2.06 insn per cycle + 1.763156753 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 532) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.8113554068418534e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.696033e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.535675e+07 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.535675e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.726065e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.682697e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.682697e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.461547 sec - 4,388,837,969 cycles # 2.993 GHz - 8,671,935,595 instructions # 1.98 insn per cycle - 1.467055991 seconds time elapsed +TOTAL : 1.451397 sec + 4,426,469,159 cycles # 3.040 GHz + 8,688,961,662 instructions # 1.96 insn per cycle + 1.457509989 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 530) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.767451e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.731200e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.731200e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.791450e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.780450e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.780450e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.453990 sec - 4,366,268,857 cycles # 2.994 GHz - 8,617,347,584 instructions # 1.97 insn per cycle - 1.459292578 seconds time elapsed +TOTAL : 1.446231 sec + 4,397,730,279 cycles # 3.031 GHz + 8,635,389,369 instructions # 1.96 insn per cycle + 1.458743810 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 502) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.480160e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.113430e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.113430e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.422680e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.137968e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.137968e+07 ) sec^-1 MeanMatrixElemValue 
= ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.506867 sec - 4,219,327,935 cycles # 2.791 GHz - 8,194,521,601 instructions # 1.94 insn per cycle - 1.512223325 seconds time elapsed +TOTAL : 1.522551 sec + 4,240,504,189 cycles # 2.777 GHz + 8,218,517,462 instructions # 1.94 insn per cycle + 1.543890893 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 345) (512y: 0) (512z: 301) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index 673b2dc9eb..ac5f47f7f2 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-18_23:14:01 +DATE: 2023-06-16_23:10:05 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=2, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.327036e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.329373e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.853831e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.391214e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.376725e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.873588e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0 -TOTAL : 0.574675 sec - 2,378,492,152 cycles # 2.944 GHz - 3,593,429,174 instructions # 1.51 insn per cycle - 0.866047495 seconds time elapsed +TOTAL : 0.594001 sec + 2,406,505,129 cycles # 2.898 GHz + 3,366,385,454 instructions # 1.40 insn per cycle + 0.891567175 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 80 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,24 +58,24 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112108763889E-002 -Relative difference = 7.180279099086847e-06 +Avg ME (F77/CUDA) = 1.2828112026909366E-002 +Relative difference = 7.173898182689807e-06 OK (relative difference <= 5E-3) 
========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.964570e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.800048e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.800048e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.071244e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.178096e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.178096e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 1.958996 sec - 5,876,137,740 cycles # 2.993 GHz - 12,902,833,170 instructions # 2.20 insn per cycle - 1.964145548 seconds time elapsed +TOTAL : 1.909389 sec + 5,830,536,600 cycles # 3.046 GHz + 12,919,583,380 instructions # 2.22 insn per cycle + 1.916035180 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 196) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 1.3015322037054697e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.963534e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.843882e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.843882e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.045068e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.916048e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.916048e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 1.635684 sec - 4,951,868,064 cycles # 3.019 GHz - 9,966,972,442 instructions # 2.01 insn per cycle - 1.641293343 seconds time elapsed +TOTAL : 1.609251 sec + 4,943,298,493 cycles # 3.063 GHz + 9,983,440,182 instructions # 2.02 insn per cycle + 1.615247999 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 391) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.8113554068418534e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= -Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.019861e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.403402e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.403402e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.077418e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.611229e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.611229e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.401927 sec - 4,211,127,106 cycles # 2.994 GHz - 8,315,678,328 instructions # 1.97 insn per cycle - 1.407507913 seconds time elapsed +TOTAL : 1.386065 sec + 4,257,966,284 cycles # 3.061 GHz + 8,332,432,046 instructions # 1.96 insn per cycle + 1.401917759 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 418) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.094647e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.677968e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.677968e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.185758e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.872361e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.872361e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.391847 sec - 4,178,481,727 cycles # 2.992 GHz - 8,319,910,388 instructions # 1.99 insn per cycle - 1.397372095 seconds time elapsed +TOTAL : 1.368255 sec + 4,217,890,128 cycles # 3.071 GHz + 8,344,168,177 instructions # 1.98 insn per cycle + 1.374282023 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 404) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT 
(NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.660417e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.435550e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.435550e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.831994e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.619622e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.619622e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.473902 sec - 4,166,139,184 cycles # 2.818 GHz - 8,033,518,917 instructions # 1.93 insn per cycle - 1.479050546 seconds time elapsed +TOTAL : 1.433154 sec + 4,169,968,523 cycles # 2.901 GHz + 8,053,769,834 instructions # 1.93 insn per cycle + 1.439092794 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 297) (512y: 0) (512z: 234) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 842f26a638..6dfac9c1ed 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-18_22:39:06 +DATE: 2023-06-16_22:50:10 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.506055e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.821545e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.796974e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.376494e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.953518e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.671712e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.705545 sec - 2,790,742,182 cycles # 2.893 GHz - 4,222,733,657 instructions # 1.51 insn per cycle - 1.029186888 seconds time elapsed +TOTAL : 0.688402 sec + 2,700,826,195 cycles # 2.905 GHz + 3,806,980,667 instructions # 1.41 insn per cycle + 0.989702055 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 7.671454200650844e-09 OK (relative difference <= 5E-3) 
========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.177223e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.395160e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.395160e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.180803e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.453359e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.453359e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.748578 sec - 17,433,563,503 cycles # 3.031 GHz - 40,599,667,455 instructions # 2.33 insn per cycle - 5.755064250 seconds time elapsed +TOTAL : 5.728686 sec + 17,708,432,158 cycles # 3.090 GHz + 41,244,089,604 instructions # 2.33 insn per cycle + 5.734765056 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 377) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.109821e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.048932e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.048932e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.063645e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.188199e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.188199e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.390395 sec - 10,287,311,971 cycles # 3.030 GHz - 24,844,772,489 instructions # 2.42 insn per cycle - 3.397350143 seconds time elapsed +TOTAL : 3.450990 sec + 10,583,383,925 cycles # 3.064 GHz + 25,489,452,356 instructions # 2.41 insn per cycle + 3.462562119 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1316) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.271948e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.939610e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.939610e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.037721e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.147402e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.147402e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.324350 sec - 6,838,266,308 cycles # 2.937 GHz - 13,636,393,663 instructions # 1.99 insn per cycle - 2.331441003 seconds time elapsed +TOTAL : 2.460588 sec + 7,441,510,421 cycles # 3.018 GHz + 14,282,092,763 instructions # 1.92 insn per cycle + 2.472351528 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1222) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.432176e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.420412e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.420412e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.158847e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.607374e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.607374e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.234107 sec - 6,563,629,435 cycles # 2.935 GHz - 13,328,217,942 instructions # 2.03 insn per cycle - 2.240707772 seconds time elapsed +TOTAL : 2.383448 sec + 7,184,403,404 cycles # 3.008 GHz + 13,977,543,468 instructions # 1.95 insn per cycle + 2.395596096 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1170) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = 
MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.157816e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.531671e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.531671e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.933204e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.709618e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.709618e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.398333 sec - 5,903,707,649 cycles # 2.458 GHz - 10,213,997,707 instructions # 1.73 insn per cycle - 2.405561211 seconds time elapsed +TOTAL : 2.541646 sec + 6,535,344,396 cycles # 2.567 GHz + 10,866,787,933 instructions # 1.66 insn per cycle + 2.548121091 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 473) (512y: 0) (512z: 707) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 27856cc8ed..a6191f8a49 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-18_22:39:33 +DATE: 2023-06-16_22:50:39 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.571073e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.374061e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.082215e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.486758e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.407739e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.073604e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.700216 sec - 2,772,657,778 cycles # 2.959 GHz - 4,163,185,243 instructions # 1.50 insn per cycle - 1.001266216 seconds time elapsed +TOTAL : 0.687151 sec + 2,695,987,556 cycles # 2.909 GHz + 3,846,627,469 instructions # 1.43 insn per cycle + 0.990030497 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 118 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 7.671454200650844e-09 OK (relative difference <= 5E-3) 
========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.173359e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.391323e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.391323e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.184791e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.461890e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.461890e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.767106 sec - 17,412,569,782 cycles # 3.017 GHz - 40,548,741,589 instructions # 2.33 insn per cycle - 5.773604309 seconds time elapsed +TOTAL : 5.709105 sec + 17,648,992,290 cycles # 3.090 GHz + 41,192,633,916 instructions # 2.33 insn per cycle + 5.715345529 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 364) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.115602e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.022710e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.022710e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.065279e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.171891e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.171891e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.378561 sec - 10,228,415,037 cycles # 3.023 GHz - 24,804,793,058 instructions # 2.43 insn per cycle - 3.385098578 seconds time elapsed +TOTAL : 3.445809 sec + 10,635,101,807 cycles # 3.083 GHz + 25,450,128,846 instructions # 2.39 insn per cycle + 3.457774379 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1303) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.283997e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.986448e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.986448e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.055789e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.156027e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.156027e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.313046 sec - 6,817,504,422 cycles # 2.941 GHz - 13,609,894,657 instructions # 2.00 insn per cycle - 2.319689673 seconds time elapsed +TOTAL : 2.453672 sec + 7,396,722,782 cycles # 3.009 GHz + 14,256,099,046 instructions # 1.93 insn per cycle + 2.460024595 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1202) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.428716e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.446641e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.446641e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.105741e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.538001e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.538001e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.229820 sec - 6,574,314,217 cycles # 2.942 GHz - 13,301,660,522 instructions # 2.02 insn per cycle - 2.236816103 seconds time elapsed +TOTAL : 2.423016 sec + 7,227,506,496 cycles # 2.978 GHz + 13,966,530,659 instructions # 1.93 insn per cycle + 2.435636437 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1150) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = 
MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.251675e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.824241e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.824241e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.961514e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.946738e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.946738e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.338011 sec - 5,826,014,320 cycles # 2.489 GHz - 10,093,757,750 instructions # 1.73 insn per cycle - 2.344971034 seconds time elapsed +TOTAL : 2.525122 sec + 6,474,425,117 cycles # 2.561 GHz + 10,746,498,563 instructions # 1.66 insn per cycle + 2.536989466 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 453) (512y: 0) (512z: 688) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 23f8555033..238b115334 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-18_22:40:01 +DATE: 2023-06-16_22:51:08 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.041058e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155921e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270597e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.924818e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.131524e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.265700e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.537791 sec - 2,267,470,311 cycles # 2.936 GHz - 3,162,069,972 instructions # 1.39 insn per cycle - 0.841128427 seconds time elapsed +TOTAL : 0.548095 sec + 2,264,164,972 cycles # 2.867 GHz + 2,877,964,725 instructions # 1.27 insn per cycle + 0.846931183 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.865897e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.913508e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.913508e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.970787e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.034500e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.034500e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.727029 sec - 17,122,088,671 cycles # 2.988 GHz - 45,405,896,149 instructions # 2.65 insn per cycle - 5.734069298 seconds time elapsed +TOTAL : 5.428827 sec + 16,813,146,395 cycles # 3.096 GHz + 45,522,826,465 instructions # 2.71 insn per cycle + 5.435350903 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.429859e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.611560e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.611560e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.561847e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.792373e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.792373e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.167563 sec - 9,543,841,338 cycles # 3.010 GHz - 26,463,320,160 instructions # 2.77 insn per cycle - 3.174476374 seconds time elapsed +TOTAL : 3.052689 sec + 9,449,457,732 cycles # 3.092 GHz + 26,574,621,752 instructions # 2.81 insn per cycle + 3.065025864 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2475) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] 
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.094977e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.655959e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.655959e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.062194e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.740435e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.740435e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.829248 sec - 5,245,339,310 cycles # 2.861 GHz - 11,203,451,715 instructions # 2.14 insn per cycle - 1.836368230 seconds time elapsed +TOTAL : 1.837966 sec + 5,299,704,595 cycles # 2.875 GHz + 11,318,276,197 instructions # 2.14 insn per cycle + 1.850204362 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2317) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.746194e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.426314e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.426314e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.705723e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.548710e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.548710e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.662736 sec - 4,783,397,891 cycles # 2.870 GHz - 10,623,484,397 instructions # 2.22 insn per cycle - 1.669838048 seconds time elapsed +TOTAL : 1.671188 sec + 4,885,067,241 cycles # 2.915 GHz + 10,738,150,017 instructions # 2.20 insn per cycle + 1.677721158 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -167,20 +167,20 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.165186e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 4.415979e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.415979e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.237343e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.564057e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.564057e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.627050 sec - 5,157,567,367 cycles # 1.961 GHz - 6,977,826,191 instructions # 1.35 insn per cycle - 2.634198680 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1093) (512y: 96) (512z: 1628) +TOTAL : 2.582849 sec + 5,275,527,208 cycles # 2.039 GHz + 7,074,506,056 instructions # 1.34 insn per cycle + 2.594504687 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1084) (512y: 95) (512z: 1629) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 3b1248b3b8..639aca4e98 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-18_23:23:24 +DATE: 2023-06-16_23:19:50 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.517155e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.850093e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.850093e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.044204e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.996138e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.996138e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.811994 sec - 3,122,813,862 cycles # 2.957 GHz - 4,726,707,499 instructions # 1.51 insn per cycle - 1.114267763 seconds time elapsed +TOTAL : 0.864809 sec + 3,286,181,196 cycles # 2.941 GHz + 4,692,253,429 instructions # 1.43 insn per cycle + 1.175702347 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,19 +72,19 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.891761e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.940341e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.940341e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.945517e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.007133e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.007133e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.725155 sec - 17,443,155,891 cycles # 3.044 GHz - 45,463,067,794 instructions # 2.61 insn per cycle - 5.731889775 seconds time elapsed +TOTAL : 5.579973 sec + 17,197,411,796 cycles # 3.079 GHz + 45,599,315,728 instructions # 2.65 insn per cycle + 5.587453353 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -99,19 +99,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.424146e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.600690e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.600690e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.516201e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.735806e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.735806e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.245399 sec - 9,877,145,077 cycles # 3.039 GHz - 26,644,958,913 instructions # 2.70 insn per cycle - 3.251786063 seconds time elapsed +TOTAL : 3.173720 sec + 9,799,980,850 cycles # 3.084 GHz + 26,760,156,296 instructions # 2.73 insn per cycle + 3.191189055 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2475) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -126,19 +126,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.989584e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.525267e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.525267e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.004187e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.680852e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.680852e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.935210 sec - 5,584,834,974 cycles # 2.877 GHz - 11,490,126,989 instructions # 2.06 insn per cycle - 1.942181580 seconds time elapsed +TOTAL : 1.934049 sec + 5,690,588,652 cycles # 2.933 GHz + 11,606,672,585 instructions # 2.04 insn per cycle + 1.948751225 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2317) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -153,19 +153,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.585484e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.233673e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.233673e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.578597e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.386711e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.386711e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.776662 sec - 5,115,306,828 cycles # 2.870 GHz - 10,908,248,078 instructions # 2.13 insn per cycle - 1.783566470 seconds time elapsed +TOTAL : 1.781882 sec + 5,244,605,674 cycles # 2.934 GHz + 11,027,240,287 instructions # 2.10 insn per cycle + 1.797644688 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.052284e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.287533e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.287533e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.159645e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.469350e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.469350e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.773144 sec - 5,511,544,850 cycles # 1.984 GHz - 7,221,836,653 instructions # 1.31 insn per cycle - 2.779849469 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1093) (512y: 96) (512z: 1628) +TOTAL : 2.712347 sec + 5,634,887,558 cycles # 2.074 GHz + 7,322,991,974 instructions # 1.30 insn per cycle + 2.729029702 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1084) (512y: 95) (512z: 1629) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 3ee287cbb2..654e369bcd 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-18_23:35:23 +DATE: 2023-06-16_23:32:09 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.594075e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.159099e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.273479e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.720407e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158709e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.266813e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.619590 sec - 2,505,610,814 cycles # 2.955 GHz - 3,592,105,745 instructions # 1.43 insn per cycle - 0.906983599 seconds time elapsed +TOTAL : 0.643304 sec + 2,557,243,803 cycles # 2.902 GHz + 3,323,241,435 instructions # 1.30 insn per cycle + 0.940843234 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.888766e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.937039e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.937039e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.976660e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.040382e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.040382e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 5.717439 sec - 17,289,665,830 cycles # 3.023 GHz - 45,422,171,459 instructions # 2.63 insn per cycle - 5.722653885 seconds time elapsed +TOTAL : 5.473136 sec + 17,003,987,037 cycles # 3.104 GHz + 45,540,260,291 instructions # 2.68 insn per cycle + 5.479696030 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 3.258803992249869e-07 OK 
(relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.455670e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.635570e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.635570e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.560915e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.788844e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.788844e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.200432 sec - 9,723,038,655 cycles # 3.036 GHz - 26,462,151,211 instructions # 2.72 insn per cycle - 3.205930504 seconds time elapsed +TOTAL : 3.112416 sec + 9,646,257,782 cycles # 3.095 GHz + 26,577,811,209 instructions # 2.76 insn per cycle + 3.124182275 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2475) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.081846e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.628914e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.628914e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.139210e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.855962e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.855962e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.889454 sec - 5,425,081,251 cycles # 2.866 GHz - 11,186,282,050 instructions # 2.06 insn per cycle - 1.894560758 seconds time elapsed +TOTAL : 1.874805 sec + 5,513,978,865 cycles # 2.934 GHz + 11,304,407,461 instructions # 2.05 insn per cycle + 1.886432659 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2317) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= 
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.753281e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.434153e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.434153e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.726123e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.568760e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.568760e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.719503 sec - 4,944,778,621 cycles # 2.869 GHz - 10,572,680,419 instructions # 2.14 insn per cycle - 1.725111487 seconds time elapsed +TOTAL : 1.727887 sec + 5,070,617,194 cycles # 2.927 GHz + 10,691,249,015 instructions # 2.11 insn per cycle + 1.739511158 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -167,20 +167,20 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.150572e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.398100e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.398100e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.172539e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.488358e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.488358e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.693591 sec - 5,346,956,955 cycles # 1.983 GHz - 6,927,109,897 instructions # 1.30 insn per cycle - 2.698900902 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1093) (512y: 96) (512z: 1628) +TOTAL : 2.683562 sec + 5,459,517,935 cycles # 2.032 GHz + 7,027,784,941 instructions # 1.29 insn per cycle + 2.699812290 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1084) (512y: 95) (512z: 1629) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index c07d5465aa..a8675ddd60 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-18_23:32:26 +DATE: 2023-06-16_23:29:07 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.590823e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.156069e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.272880e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.723347e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.157467e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.265955e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.561090 sec - 2,337,153,978 cycles # 2.954 GHz - 3,567,976,480 instructions # 1.53 insn per cycle - 0.848347727 seconds time elapsed +TOTAL : 0.586317 sec + 2,370,587,754 cycles # 2.873 GHz + 3,311,584,988 instructions # 1.40 insn per cycle + 0.883202740 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.901838e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.950756e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.950756e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.965968e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.028993e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.028993e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.619703 sec - 17,098,072,758 cycles # 3.041 GHz - 45,403,233,892 instructions # 2.66 insn per cycle - 5.625298678 seconds time elapsed +TOTAL : 5.442844 sec + 16,814,668,007 cycles # 3.088 GHz + 45,523,651,040 instructions # 2.71 insn per cycle + 5.449380623 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 3.258803992249869e-07 OK 
(relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.465062e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.643542e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.643542e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.557389e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.784830e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.784830e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.133350 sec - 9,530,990,715 cycles # 3.038 GHz - 26,461,775,640 instructions # 2.78 insn per cycle - 3.138486769 seconds time elapsed +TOTAL : 3.056098 sec + 9,441,724,564 cycles # 3.085 GHz + 26,574,286,961 instructions # 2.81 insn per cycle + 3.067987765 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2475) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.113272e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.680038e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.680038e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.028635e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.705882e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.705882e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.822016 sec - 5,239,798,170 cycles # 2.869 GHz - 11,203,273,002 instructions # 2.14 insn per cycle - 1.827279491 seconds time elapsed +TOTAL : 1.847560 sec + 5,305,013,811 cycles # 2.865 GHz + 11,318,072,423 instructions # 2.13 insn per cycle + 1.862970960 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2317) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= 
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.752624e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.436775e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.436775e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.705810e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.546514e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.546514e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.656970 sec - 4,770,141,804 cycles # 2.871 GHz - 10,617,376,861 instructions # 2.23 insn per cycle - 1.662328882 seconds time elapsed +TOTAL : 1.670710 sec + 4,873,672,906 cycles # 2.909 GHz + 10,738,237,712 instructions # 2.20 insn per cycle + 1.682858352 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -167,20 +167,20 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.111293e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.354169e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.354169e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.212444e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.528464e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.528464e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.656883 sec - 5,149,232,654 cycles # 1.935 GHz - 6,977,685,886 instructions # 1.36 insn per cycle - 2.662117722 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1093) (512y: 96) (512z: 1628) +TOTAL : 2.596550 sec + 5,275,280,525 cycles # 2.028 GHz + 7,074,607,503 instructions # 1.34 insn per cycle + 2.603068795 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1084) (512y: 95) (512z: 1629) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index f3fc0520ec..359002bbc2 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -35,23 +35,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-18_23:29:33 +DATE: 2023-06-16_23:26:08 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.823745e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.160361e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.275150e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.030270e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.154608e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.262941e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.711538 sec - 2,800,525,082 cycles # 2.960 GHz - 4,320,932,668 instructions # 1.54 insn per cycle - 1.004387612 seconds time elapsed +TOTAL : 0.762017 sec + 2,931,874,448 cycles # 2.934 GHz + 4,162,111,608 instructions # 1.42 insn per cycle + 1.058050965 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -65,19 +65,19 @@ Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.893916e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.943371e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.943371e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.969731e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.034215e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.034215e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.643047 sec - 17,107,721,214 cycles # 3.029 GHz - 45,404,077,046 instructions # 2.65 insn per cycle - 5.648523883 seconds time elapsed +TOTAL : 5.431365 sec + 16,808,611,250 cycles # 3.092 GHz + 45,520,552,491 instructions # 2.71 insn per cycle + 5.438033144 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -91,19 +91,19 @@ Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.464238e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.642365e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.642365e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.568471e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.797188e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.797188e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.133051 sec - 9,535,469,676 cycles # 3.040 GHz - 26,461,903,214 instructions # 2.78 insn per cycle - 3.138570872 seconds time elapsed +TOTAL : 3.046374 sec + 9,446,582,684 cycles # 3.097 GHz + 26,574,434,074 instructions # 2.81 insn per cycle + 3.058800051 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2475) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -117,19 +117,19 @@ Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.023884e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.568023e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.568023e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.022985e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.704154e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.704154e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.845680 sec - 5,244,293,032 cycles # 2.834 GHz - 11,203,499,976 instructions # 2.14 insn per cycle - 1.851246836 seconds time elapsed +TOTAL : 1.849517 sec + 5,358,628,341 cycles # 2.890 GHz + 11,318,219,306 instructions # 2.11 insn per cycle + 1.862335681 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2317) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -143,19 +143,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.768737e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.456321e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.456321e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.716217e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.550859e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.550859e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.654802 sec - 4,755,871,808 cycles # 2.866 GHz - 10,620,982,700 instructions # 2.23 insn per cycle - 1.660273899 seconds time elapsed +TOTAL : 1.668825 sec + 4,888,461,793 cycles # 2.922 GHz + 10,738,236,069 instructions # 2.20 insn per cycle + 1.680870974 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -169,20 +169,20 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP 
[gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.161738e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.413225e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.413225e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.203955e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.527248e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.527248e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.626109 sec - 5,153,641,042 cycles # 1.960 GHz - 6,978,206,846 instructions # 1.35 insn per cycle - 2.631697516 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1093) (512y: 96) (512z: 1628) +TOTAL : 2.602140 sec + 5,278,828,196 cycles # 2.025 GHz + 7,074,793,601 instructions # 1.34 insn per cycle + 2.608554211 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1084) (512y: 95) (512z: 1629) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 93b17b3385..eac3f7700f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-18_22:40:28 +DATE: 2023-06-16_22:51:35 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.097464e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.160557e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.275726e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.913985e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.133710e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.264879e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.529241 sec - 2,249,151,280 cycles # 2.939 GHz - 3,126,073,041 instructions # 1.39 insn per cycle - 0.833374909 seconds time elapsed +TOTAL : 0.547036 sec + 2,265,211,371 cycles # 2.879 GHz + 2,883,998,397 instructions # 1.27 insn per cycle + 0.846249580 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.947164e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.998957e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.998957e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.025487e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.092839e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.092839e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.491250 sec - 16,727,580,632 cycles # 3.044 GHz - 44,378,708,972 instructions # 2.65 insn per cycle - 5.497794723 seconds time elapsed +TOTAL : 5.293594 sec + 16,435,613,118 cycles # 3.105 GHz + 44,496,848,749 instructions # 2.71 insn per cycle + 5.300078017 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 576) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.657519e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.857574e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.857574e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.743778e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.997210e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.997210e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.975412 sec - 9,062,080,469 cycles # 3.042 GHz - 25,287,862,621 instructions # 2.79 insn per cycle - 2.982186168 seconds time elapsed +TOTAL : 2.909580 sec + 8,996,897,610 cycles # 3.087 GHz + 25,400,434,000 instructions # 2.82 insn per cycle + 2.923168620 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2305) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.559920e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.020470e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.020470e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.683039e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.268975e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.268975e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.994543 sec - 5,687,301,019 cycles # 2.847 GHz - 12,306,543,864 instructions # 2.16 insn per cycle - 2.001637740 seconds time elapsed +TOTAL : 1.953170 sec + 5,745,024,701 cycles # 2.934 GHz + 12,420,701,802 instructions # 2.16 insn per cycle + 1.968877475 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2408) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] 
[hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.854570e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.362779e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.362779e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.895401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.531291e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.531291e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.898550 sec - 5,405,706,794 cycles # 2.842 GHz - 11,884,289,492 instructions # 2.20 insn per cycle - 1.905488215 seconds time elapsed +TOTAL : 1.886801 sec + 5,519,025,390 cycles # 2.918 GHz + 12,000,810,261 instructions # 2.17 insn per cycle + 1.892947629 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2127) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.007795e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.242707e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.242707e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.060444e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.357914e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.357914e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.726437 sec - 5,336,054,647 cycles # 1.955 GHz - 8,411,434,439 instructions # 1.58 insn per cycle - 2.733346003 seconds time elapsed +TOTAL : 2.690681 sec + 5,453,449,759 cycles # 2.024 GHz + 8,526,907,153 instructions # 1.56 insn per cycle + 2.696635390 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1067) (512y: 204) (512z: 1715) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index e5969ef6ae..e46f7db696 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-18_23:14:20 +DATE: 2023-06-16_23:10:25 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.592910e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.162940e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277756e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.728254e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.165396e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275486e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.525339 sec - 2,240,703,448 cycles # 2.932 GHz - 3,098,476,503 instructions # 1.38 insn per cycle - 0.823729716 seconds time elapsed +TOTAL : 0.552941 sec + 2,310,610,153 cycles # 2.859 GHz + 2,915,241,692 instructions # 1.26 insn per cycle + 0.867252595 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.523001e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.610147e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.610147e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.605414e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.718612e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.718612e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.262946 sec - 12,917,991,309 cycles # 3.027 GHz - 34,356,800,274 instructions # 2.66 insn per cycle - 4.268384369 seconds time elapsed +TOTAL : 4.132902 sec + 12,781,577,555 cycles # 3.090 GHz + 34,468,010,618 instructions # 2.70 insn per cycle + 4.139327277 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.097210e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.240059e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.240059e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.146546e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.322919e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.322919e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.491661 sec - 10,619,998,715 cycles # 3.037 GHz - 22,832,779,188 instructions # 2.15 insn per cycle - 3.497483266 seconds time elapsed +TOTAL : 3.441664 sec + 10,637,428,563 cycles # 3.088 GHz + 22,946,767,172 instructions # 2.16 insn per cycle + 3.453622149 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2554) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.431271e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.883655e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.883655e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.616486e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.203845e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.203845e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.039012 sec - 5,716,156,076 cycles # 2.802 GHz - 10,652,722,821 instructions # 1.86 insn per cycle - 2.044492539 seconds time elapsed +TOTAL : 1.975506 sec + 5,785,528,635 cycles # 2.924 GHz + 10,765,799,966 instructions # 1.86 insn per cycle + 1.987834132 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] 
[hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.705619e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.188017e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.188017e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.684700e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.274843e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.274843e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.944139 sec - 5,573,008,606 cycles # 2.860 GHz - 9,877,800,784 instructions # 1.77 insn per cycle - 1.950007637 seconds time elapsed +TOTAL : 1.951945 sec + 5,696,598,180 cycles # 2.912 GHz + 9,993,327,583 instructions # 1.75 insn per cycle + 1.964406649 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2323) (512y: 159) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.361963e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.636685e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.636685e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.413791e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.764783e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.764783e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.510468 sec - 4,946,319,713 cycles # 1.966 GHz - 7,490,938,080 instructions # 1.51 insn per cycle - 2.516345616 seconds time elapsed +TOTAL : 2.484051 sec + 5,055,557,870 cycles # 2.031 GHz + 7,607,210,183 instructions # 1.50 insn per cycle + 2.496248175 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1617) (512y: 257) (512z: 1663) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 0b9a5fa56c..336c6b2c5b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-18_23:14:46 +DATE: 2023-06-16_23:10:52 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.590276e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.161665e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.276091e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.704482e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.154812e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.264069e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.523953 sec - 2,257,235,761 cycles # 2.939 GHz - 3,115,853,120 instructions # 1.38 insn per cycle - 0.825012907 seconds time elapsed +TOTAL : 0.548991 sec + 2,291,931,243 cycles # 2.897 GHz + 2,895,980,863 instructions # 1.26 insn per cycle + 0.849095215 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,20 +63,20 @@ Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.638662e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.736999e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.736999e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.696861e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.817621e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.817621e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.081958 sec - 12,320,219,319 cycles # 3.014 GHz - 35,009,881,921 instructions # 2.84 insn per cycle - 4.088093070 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 457) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.995865 sec + 12,393,440,631 cycles # 3.099 GHz + 35,121,655,729 instructions # 2.83 insn per cycle + 4.002434492 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 458) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -89,19 +89,19 @@ Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.150755e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.298430e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.298430e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.203775e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.393294e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.393294e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.435005 sec - 10,404,447,581 cycles # 3.026 GHz - 21,993,103,626 instructions # 2.11 insn per cycle - 3.440569541 seconds time elapsed +TOTAL : 3.383474 sec + 10,433,445,794 cycles # 3.079 GHz + 22,106,457,751 instructions # 2.12 insn per cycle + 3.399221962 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2351) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.736703e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.246663e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.246663e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.959707e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.624477e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.624477e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.937670 sec - 5,358,288,000 cycles # 2.758 GHz - 10,256,578,853 instructions # 1.91 insn per cycle - 1.943859017 seconds time elapsed +TOTAL : 1.874404 sec + 5,449,857,591 cycles # 2.908 GHz + 10,370,988,020 instructions # 1.90 insn per cycle + 1.886802544 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2170) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.908842e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.427368e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.427368e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.940537e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.590919e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.590919e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.882902 sec - 5,396,388,166 cycles # 2.859 GHz - 9,460,392,900 instructions # 1.75 insn per cycle - 1.888502703 seconds time elapsed +TOTAL : 1.873958 sec + 5,488,219,503 cycles # 2.922 GHz + 9,577,123,391 instructions # 1.75 insn per cycle + 1.890504897 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1857) (512y: 115) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.549884e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.851953e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.851953e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.678748e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.078914e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.078914e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.411770 sec - 4,707,067,837 cycles # 1.950 GHz - 7,285,308,864 instructions # 1.55 insn per cycle - 2.417369345 seconds time elapsed +TOTAL : 2.351438 sec + 4,817,365,178 cycles # 2.046 GHz + 7,401,125,646 instructions # 1.54 insn per cycle + 2.365073669 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1298) (512y: 193) (512z: 1369) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 2c306e7eab..51b62d4486 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-18_22:40:55 +DATE: 2023-06-16_22:52:02 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.217276e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.572809e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.948314e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.086859e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.694248e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.954158e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.485442 sec - 2,080,435,790 cycles # 2.923 GHz - 2,866,762,926 instructions # 1.38 insn per cycle - 0.791800006 seconds time elapsed +TOTAL : 0.501803 sec + 2,114,448,212 cycles # 2.892 GHz + 2,650,449,805 instructions # 1.25 insn per cycle + 0.791582379 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,25 +58,25 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/CUDA) = 2.0288499668240547 +Relative difference = 1.920672948568199e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.956154e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.009707e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.009707e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.045424e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.103487e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.103487e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.446517 sec - 16,514,086,283 cycles # 3.029 GHz - 45,242,334,961 instructions # 2.74 insn per cycle - 5.453327424 seconds time 
elapsed -=Symbols in CPPProcess.o= (~sse4: 628) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.214918 sec + 16,086,691,839 cycles # 3.084 GHz + 45,264,306,297 instructions # 2.81 insn per cycle + 5.221298921 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -89,19 +89,19 @@ Relative difference = 6.443528218283898e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.892673e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.273909e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.273909e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.087937e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.510673e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.510673e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 2.230204 sec - 6,747,558,443 cycles # 3.021 GHz - 16,682,885,775 instructions # 2.47 insn per cycle - 2.237199250 seconds time elapsed +TOTAL : 2.147379 sec + 6,633,404,493 cycles # 3.083 GHz + 16,691,710,310 instructions # 2.52 insn per cycle + 2.153586806 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 8.24528544926829e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.039386e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.217213e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.217213e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.082531e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.272863e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.272863e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.098393 sec - 3,098,459,353 cycles # 2.813 GHz - 7,014,812,146 instructions # 2.26 insn per cycle - 1.105173507 seconds time elapsed +TOTAL : 1.053599 sec + 3,105,351,321 cycles # 2.935 GHz + 7,028,445,226 instructions # 2.26 insn per cycle + 1.065166686 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 
2735) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.133289e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.341271e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.341271e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.144769e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.363322e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.363322e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.012727 sec - 2,932,731,503 cycles # 2.887 GHz - 6,731,148,444 instructions # 2.30 insn per cycle - 1.019615763 seconds time elapsed +TOTAL : 1.001609 sec + 2,959,023,588 cycles # 2.940 GHz + 6,742,632,212 instructions # 2.28 insn per cycle + 1.013857674 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2593) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -167,20 +167,20 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.994135e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.967821e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.967821e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.162662e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.186029e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.186029e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.398110 sec - 2,867,836,390 cycles # 2.047 GHz - 4,844,016,196 instructions # 1.69 insn per cycle - 1.404798548 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1731) (512y: 22) (512z: 1849) +TOTAL : 1.370260 sec + 2,871,098,848 cycles # 2.089 GHz + 4,848,815,662 instructions # 1.69 insn per cycle + 1.382844305 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1722) (512y: 22) (512z: 1849) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 240f0eb579..500d268665 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-18_23:23:52 +DATE: 2023-06-16_23:20:18 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.077735e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.457332e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.457332e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.668812 sec - 2,707,938,072 cycles # 2.970 GHz - 4,089,951,980 instructions # 1.51 insn per cycle - 0.968803408 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.110307e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.876714e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.876714e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086808e+00 +- 3.414090e-03 ) GeV^0 +TOTAL : 0.696990 sec + 2,704,320,756 cycles # 2.897 GHz + 3,812,649,511 instructions # 1.41 insn per cycle + 0.992471982 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -66,26 +66,26 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/CUDA) = 2.0288499668240547 +Relative difference = 1.920672948568199e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.927286e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.979745e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.979745e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.034027e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.092499e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.092499e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.569040 sec - 16,686,762,464 cycles # 2.995 GHz - 45,285,461,035 instructions # 2.71 insn per cycle - 5.575706950 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 628) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.291084 sec + 16,288,379,429 cycles # 3.078 GHz + 45,312,040,286 instructions # 2.78 insn per cycle + 5.298079548 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -99,19 +99,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.892165e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.271278e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.271278e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.061059e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.467740e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.467740e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 2.272550 sec - 6,943,591,915 cycles # 3.049 GHz - 16,959,252,147 instructions # 2.44 insn per cycle - 2.278715637 seconds time elapsed +TOTAL : 2.203734 sec + 6,829,055,399 cycles # 3.092 GHz + 16,972,646,719 instructions # 2.49 insn per cycle + 2.216688904 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -126,19 +126,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.040490e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.216209e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.216209e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.061515e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.242839e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.242839e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.138766 sec - 3,304,662,247 cycles # 2.887 GHz - 7,255,626,586 instructions # 2.20 insn per cycle - 1.145475247 seconds time elapsed +TOTAL : 1.119383 sec + 3,302,419,161 cycles # 2.935 GHz + 7,266,282,160 instructions # 2.20 insn per cycle + 1.131841634 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2735) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -153,19 +153,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.115706e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.317047e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.317047e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.096474e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.293763e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.293763e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.070181 sec - 3,123,669,032 cycles # 2.906 GHz - 6,969,124,690 instructions # 2.23 insn per cycle - 1.076178013 seconds time elapsed +TOTAL : 1.088730 sec + 3,150,094,724 cycles # 2.877 GHz + 6,980,573,007 instructions # 2.22 insn per cycle + 1.104462726 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2593) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.898530e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.849790e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.849790e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.080793e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.067489e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.067489e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.459194 sec - 3,072,125,058 cycles # 2.097 GHz - 5,101,749,018 instructions # 1.66 insn per cycle - 1.465704045 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1731) (512y: 22) (512z: 1849) +TOTAL : 1.430225 sec + 3,084,429,817 cycles # 2.147 GHz + 5,103,946,062 instructions # 1.65 insn per cycle + 1.443126402 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1722) (512y: 22) (512z: 1849) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index 95a4fe5b8e..a6492e3922 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-18_23:35:50 +DATE: 2023-06-16_23:32:37 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.466782e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.650096e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.971128e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.808067e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.652444e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.960520e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.564456 sec - 2,318,311,977 cycles # 2.948 GHz - 3,311,952,807 instructions # 1.43 insn per cycle - 0.843584481 seconds time elapsed +TOTAL : 0.588246 sec + 2,402,908,432 cycles # 2.901 GHz + 3,110,614,158 instructions # 1.29 insn per cycle + 0.886196949 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,25 +58,25 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/CUDA) = 2.0288499668240547 +Relative difference = 1.920672948568199e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.962432e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.016728e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.016728e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.033763e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.091227e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.091227e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 5.483598 sec - 16,671,675,276 cycles # 3.039 GHz - 45,271,419,303 instructions # 2.72 insn per cycle - 
5.488608674 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 628) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.296990 sec + 16,269,407,762 cycles # 3.070 GHz + 45,296,730,260 instructions # 2.78 insn per cycle + 5.303386291 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -89,19 +89,19 @@ Relative difference = 6.443528218283898e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.869022e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.249889e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.249889e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.111672e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.523499e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.523499e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079574e+00 +- 3.404724e-03 ) GeV^0 -TOTAL : 2.292634 sec - 6,902,222,192 cycles # 3.006 GHz - 16,695,390,275 instructions # 2.42 insn per cycle - 2.297548144 seconds time elapsed +TOTAL : 2.191606 sec + 6,792,051,649 cycles # 3.094 GHz + 16,705,051,298 instructions # 2.46 insn per cycle + 2.206896271 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 8.24528544926829e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.053420e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.233732e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.233732e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.084154e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.276390e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276390e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079552e+00 +- 3.404217e-03 ) GeV^0 -TOTAL : 1.134406 sec - 3,272,747,000 cycles # 2.874 GHz - 7,002,338,013 instructions # 2.14 insn per cycle - 1.139504150 seconds time elapsed +TOTAL : 1.106630 sec + 3,283,893,034 cycles # 2.956 GHz + 7,013,364,350 instructions # 2.14 insn per cycle + 1.121623240 seconds time elapsed 
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2735) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.105641e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.306620e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.306620e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.140251e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.355699e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.355699e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079552e+00 +- 3.404217e-03 ) GeV^0 -TOTAL : 1.090369 sec - 3,098,852,591 cycles # 2.831 GHz - 6,681,759,155 instructions # 2.16 insn per cycle - 1.095947246 seconds time elapsed +TOTAL : 1.061058 sec + 3,126,583,899 cycles # 2.934 GHz + 6,694,705,597 instructions # 2.14 insn per cycle + 1.076779612 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2593) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -167,20 +167,20 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.633216e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.532699e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.532699e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.180445e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.199369e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.199369e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079552e+00 +- 3.404217e-03 ) GeV^0 -TOTAL : 1.516441 sec - 3,031,711,893 cycles # 1.994 GHz - 4,798,217,271 instructions # 1.58 insn per cycle - 1.521855756 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1731) (512y: 22) (512z: 1849) +TOTAL : 1.423127 sec + 3,043,448,384 cycles # 2.132 GHz + 4,800,221,178 instructions # 1.58 insn per cycle + 1.434967626 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1722) (512y: 22) (512z: 1849) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 24f2d03ca1..5b694fb236 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-18_23:32:52 +DATE: 2023-06-16_23:29:34 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.484059e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.651757e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.971705e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.836019e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.651430e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.966312e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.510522 sec - 2,157,248,955 cycles # 2.948 GHz - 3,306,751,827 instructions # 1.53 insn per cycle - 0.789217813 seconds time elapsed +TOTAL : 0.535629 sec + 2,203,011,013 cycles # 2.886 GHz + 3,060,363,719 instructions # 1.39 insn per cycle + 0.822111510 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,25 +58,25 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/CUDA) = 2.0288499668240547 +Relative difference = 1.920672948568199e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.941861e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.997041e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.997041e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.025315e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.083260e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.083260e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.487408 sec - 16,511,648,399 cycles # 3.007 GHz - 45,244,112,398 instructions # 2.74 insn per cycle - 5.492462207 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 628) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.275218 sec + 16,091,710,481 cycles # 3.052 GHz + 45,265,017,659 instructions # 2.81 insn per cycle + 5.281528258 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -89,19 +89,19 @@ Relative difference = 6.443528218283898e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.895899e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.279157e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.279157e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.134361e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.551581e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.551581e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 2.227339 sec - 6,742,004,236 cycles # 3.021 GHz - 16,683,347,281 instructions # 2.47 insn per cycle - 2.232732049 seconds time elapsed +TOTAL : 2.128639 sec + 6,626,366,458 cycles # 3.106 GHz + 16,691,617,090 instructions # 2.52 insn per cycle + 2.144077506 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 8.24528544926829e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.054815e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.234111e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.234111e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.089985e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.279320e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.279320e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.078882 sec - 3,105,837,405 cycles # 2.867 GHz - 7,018,311,290 instructions # 2.26 insn per cycle - 1.084486905 seconds time elapsed +TOTAL : 1.046794 sec + 3,097,881,265 cycles # 2.946 GHz + 7,028,413,867 instructions # 2.27 insn per cycle + 1.062080365 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2735) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.063461e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.252592e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.252592e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.151751e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.366544e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.366544e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.074162 sec - 2,944,652,129 cycles # 2.730 GHz - 6,731,582,934 instructions # 2.29 insn per cycle - 1.079931534 seconds time elapsed +TOTAL : 0.995620 sec + 2,939,121,938 cycles # 2.938 GHz + 6,742,732,632 instructions # 2.29 insn per cycle + 1.001882555 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2593) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -167,20 +167,20 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.961017e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.924748e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.924748e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.156539e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.166833e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.166833e+05 ) sec^-1 MeanMatrixElemValue = ( 
2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.402742 sec - 2,862,663,665 cycles # 2.034 GHz - 4,847,235,521 instructions # 1.69 insn per cycle - 1.408043991 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1731) (512y: 22) (512z: 1849) +TOTAL : 1.371454 sec + 2,896,780,603 cycles # 2.105 GHz + 4,848,892,795 instructions # 1.67 insn per cycle + 1.377614632 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1722) (512y: 22) (512z: 1849) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index 04fe0fb90b..f142e268bd 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -35,23 +35,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-18_23:29:59 +DATE: 2023-06-16_23:26:35 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.694538e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.624973e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.937992e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.621749 sec - 2,436,447,955 cycles # 2.874 GHz - 3,730,153,997 instructions # 1.53 insn per cycle - 0.906983386 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.844906e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.667637e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.968555e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086808e+00 +- 3.414090e-03 ) GeV^0 +TOTAL : 0.637578 sec + 2,532,069,078 cycles # 2.919 GHz + 3,546,459,768 instructions # 1.40 insn per cycle + 0.925210240 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -60,25 +60,25 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/CUDA) = 2.0288499668240547 +Relative difference = 1.920672948568199e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.929722e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.983904e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.983904e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.040228e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.098157e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.098157e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.522797 sec - 16,514,064,227 cycles # 2.989 GHz - 45,244,888,942 instructions # 2.74 insn per cycle - 5.527861458 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 628) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.228615 sec + 16,104,862,870 cycles # 3.079 GHz + 45,267,189,999 instructions # 2.81 insn per cycle + 5.234886365 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -91,19 +91,19 @@ Relative difference = 6.443528218283898e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.882531e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.256446e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.256446e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.110524e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.524586e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.524586e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 2.233627 sec - 6,749,698,693 cycles # 3.016 GHz - 16,683,905,940 instructions # 2.47 insn per cycle - 2.239003688 seconds time elapsed +TOTAL : 2.139950 sec + 6,622,300,033 cycles # 3.088 GHz + 16,691,797,560 instructions # 2.52 insn per cycle + 2.152617384 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -117,19 +117,19 @@ Relative difference = 8.24528544926829e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.066310e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.248063e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.248063e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.073753e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.265534e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.265534e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.068064 sec - 3,096,676,145 cycles # 2.887 GHz - 7,018,044,016 instructions # 2.27 insn per cycle - 1.073453269 seconds time elapsed +TOTAL : 1.063032 sec + 3,116,892,646 cycles # 2.921 GHz + 7,028,647,548 instructions # 2.26 insn per cycle + 1.074679609 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2735) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -143,19 +143,19 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.131277e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.337030e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.337030e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.147777e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.361400e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.361400e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.011866 sec - 2,935,754,900 cycles # 2.890 GHz - 6,731,265,559 instructions # 2.29 insn per cycle - 1.017019453 seconds time elapsed +TOTAL : 0.998684 sec + 2,942,336,224 cycles # 2.933 GHz + 6,742,794,858 instructions # 2.29 insn per cycle + 1.014229588 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2593) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -169,20 +169,20 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.038564e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.027544e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.027544e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.156146e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.170100e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.170100e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.390802 sec - 2,864,121,739 cycles # 2.054 GHz - 4,847,614,894 instructions # 1.69 insn per cycle - 1.395918141 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1731) (512y: 22) (512z: 1849) +TOTAL : 1.371673 sec + 2,870,181,896 cycles # 2.085 GHz + 4,848,797,007 instructions # 1.69 insn per cycle + 1.377996079 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1722) (512y: 22) (512z: 1849) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index c12727faea..908359aae9 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-18_22:41:17 +DATE: 2023-06-16_22:52:25 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.274020e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.620265e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.010701e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.091832e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.744916e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.014144e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.485160 sec - 2,099,851,451 cycles # 2.951 GHz - 2,930,083,354 instructions # 1.40 insn per cycle - 0.775769836 seconds time elapsed +TOTAL : 0.502212 sec + 2,095,360,596 cycles # 2.861 GHz + 2,633,922,104 instructions # 1.26 insn per cycle + 0.791606560 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,24 +58,24 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/CUDA) = 2.0288499668240547 +Relative difference = 1.920672948568199e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.999828e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.056311e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.056311e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.080122e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.140133e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.140133e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.329602 sec - 16,261,566,148 cycles # 3.049 GHz - 44,485,747,751 instructions # 2.74 insn per cycle - 5.336424594 seconds time elapsed +TOTAL : 5.126985 sec + 15,885,461,434 cycles # 3.097 GHz + 44,491,325,292 instructions # 2.80 insn per cycle + 5.132808718 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 580) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 6.443528218283898e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.927613e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.489171e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.489171e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.233465e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.861527e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.861527e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 1.855943 sec - 5,655,311,427 cycles # 3.042 GHz - 15,825,122,163 instructions # 2.80 insn per cycle - 1.862537652 seconds time elapsed +TOTAL : 1.768519 sec + 5,514,620,841 cycles # 3.109 GHz + 15,833,995,859 instructions # 2.87 insn per cycle + 1.780397431 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2852) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 8.24528544926829e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.442084e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.286668e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.286668e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.563962e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.456105e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.456105e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 
1.494877 sec - 4,308,010,076 cycles # 2.875 GHz - 8,699,047,100 instructions # 2.02 insn per cycle - 1.501395488 seconds time elapsed +TOTAL : 1.472161 sec + 4,310,727,148 cycles # 2.921 GHz + 8,709,473,097 instructions # 2.02 insn per cycle + 1.483481957 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3300) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.653978e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.555644e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.555644e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.777637e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.715040e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.715040e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.456247 sec - 4,198,587,715 cycles # 2.876 GHz - 8,418,940,783 instructions # 2.01 insn per cycle - 1.462871031 seconds time elapsed +TOTAL : 1.433573 sec + 4,206,371,569 cycles # 2.925 GHz + 8,430,013,752 instructions # 2.00 insn per cycle + 1.439953997 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3203) (512y: 5) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 1.2313965798858044e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.777520e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.265352e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.265352e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.942233e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.458072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.458072e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.901803 sec - 3,793,058,233 cycles # 1.992 GHz - 6,732,056,557 instructions # 1.77 insn per cycle - 1.908711601 seconds time elapsed +TOTAL : 1.858685 sec + 3,800,386,936 cycles # 2.045 GHz + 6,743,751,136 instructions # 1.77 insn per cycle + 1.870417357 seconds time elapsed 
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2337) (512y: 12) (512z: 2190) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index a0fa7cdf34..03f579fc70 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-18_23:15:11 +DATE: 2023-06-16_23:11:17 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.422354e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.645287e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.966949e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.788614e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.673918e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.981454e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.481868 sec - 2,073,830,742 cycles # 2.935 GHz - 2,863,733,688 instructions # 1.38 insn per cycle - 0.764204199 seconds time elapsed +TOTAL : 0.503723 sec + 2,153,847,666 cycles # 2.865 GHz + 2,679,220,576 instructions # 1.24 insn per cycle + 0.809587763 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,24 +58,24 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/CUDA) = 2.0288499668240547 +Relative difference = 1.920672948568199e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.572125e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.666571e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.666571e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.654314e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.753826e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.753826e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.163180 sec - 12,652,638,378 cycles # 3.038 GHz - 34,713,568,176 instructions # 2.74 insn per cycle - 4.168589758 seconds time elapsed +TOTAL : 4.037765 sec + 12,382,941,525 cycles # 3.063 GHz + 34,720,031,809 instructions # 2.80 insn per cycle + 4.044257307 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 710) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 4.463890496342449e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.684803e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.208747e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.208747e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.839094e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.391075e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.391075e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 1.930654 sec - 5,894,451,853 cycles # 3.047 GHz - 13,730,800,746 instructions # 2.33 insn per cycle - 1.936122727 seconds time elapsed +TOTAL : 1.882962 sec + 5,789,032,021 cycles # 3.067 GHz + 13,741,089,000 instructions # 2.37 insn per cycle + 1.901362450 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3019) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.8327016574625664e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.993820e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.025845e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.025845e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.215187e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.053508e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.053508e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.250517 sec - 3,619,439,498 cycles # 2.883 GHz - 7,561,578,701 instructions # 2.09 insn per cycle - 1.255914931 seconds time elapsed +TOTAL : 1.223202 sec + 3,599,020,391 cycles # 2.929 GHz + 7,571,835,022 instructions # 2.10 insn per cycle + 1.236020404 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3640) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.1252420410236244e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.078382e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.036724e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.036724e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.176325e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.050726e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.050726e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.240246 sec - 3,576,341,649 cycles # 2.872 GHz - 7,127,496,668 instructions # 1.99 insn per cycle - 1.245884945 seconds time elapsed +TOTAL : 1.228546 sec + 3,600,171,901 cycles # 2.920 GHz + 7,138,528,819 instructions # 1.98 insn per cycle + 1.240059403 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3407) (512y: 1) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 1.1252420410236244e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.921947e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.641857e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.641857e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.044103e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.781585e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.781585e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 
3.414242e-03 ) GeV^0 -TOTAL : 1.600928 sec - 3,260,409,094 cycles # 2.031 GHz - 6,088,341,124 instructions # 1.87 insn per cycle - 1.606537939 seconds time elapsed +TOTAL : 1.574940 sec + 3,264,671,911 cycles # 2.066 GHz + 6,099,502,522 instructions # 1.87 insn per cycle + 1.581189951 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3531) (512y: 0) (512z: 2032) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index dab893021c..fc059049f2 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-18_23:15:33 +DATE: 2023-06-16_23:11:40 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.454682e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.671605e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.998737e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.829844e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.682794e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.002681e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.480198 sec - 2,077,693,016 cycles # 2.948 GHz - 2,863,537,381 instructions # 1.38 insn per cycle - 0.762361625 seconds time elapsed +TOTAL : 0.504050 sec + 2,156,269,977 cycles # 2.889 GHz + 2,661,239,316 instructions # 1.23 insn per cycle + 0.803885518 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -58,25 +58,25 @@ runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/CUDA) = 2.0288499668240547 +Relative difference = 1.920672948568199e-05 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.773245e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.883059e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.883059e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.707277e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.810959e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.810959e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 3.868309 sec - 11,682,709,052 cycles # 3.018 GHz - 34,862,885,807 instructions # 2.98 insn per cycle - 3.873785737 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 464) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.961686 sec + 11,627,305,519 cycles # 2.933 GHz + 34,903,521,773 instructions # 3.00 insn per cycle + 3.967678741 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -89,19 +89,19 @@ Relative difference = 4.463890496342449e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.632454e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.155978e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.155978e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.940642e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.518726e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.518726e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 1.947933 sec - 5,813,664,106 cycles # 2.978 GHz - 13,358,473,505 instructions # 2.30 insn per cycle - 1.953458740 seconds time elapsed +TOTAL : 1.852150 sec + 5,740,254,137 cycles # 3.092 GHz + 13,368,786,182 instructions # 2.33 insn per cycle + 1.858247428 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2503) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 5.749220495516028e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] 
[hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.243536e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.059312e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.059312e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.380718e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.076685e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.076685e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.219381 sec - 3,526,273,182 cycles # 2.882 GHz - 7,320,880,885 instructions # 2.08 insn per cycle - 1.224848096 seconds time elapsed +TOTAL : 1.203884 sec + 3,531,102,752 cycles # 2.922 GHz + 7,331,328,779 instructions # 2.08 insn per cycle + 1.215736492 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2935) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 1.0167922688887485e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.262178e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.066026e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.066026e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.449213e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.085295e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.085295e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.219224 sec - 3,497,432,036 cycles # 2.859 GHz - 6,955,863,876 instructions # 1.99 insn per cycle - 1.224609988 seconds time elapsed +TOTAL : 1.195397 sec + 3,507,221,220 cycles # 2.922 GHz + 6,966,297,573 instructions # 1.99 insn per cycle + 1.207630553 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2744) (512y: 1) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 1.0167922688887485e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] 
OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.931601e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.665347e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.665347e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.302201e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.102178e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.102178e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.598191 sec - 3,150,397,894 cycles # 1.966 GHz - 5,916,588,298 instructions # 1.88 insn per cycle - 1.603813998 seconds time elapsed +TOTAL : 1.522441 sec + 3,161,449,900 cycles # 2.070 GHz + 5,927,790,892 instructions # 1.88 insn per cycle + 1.534090530 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2811) (512y: 0) (512z: 1595) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index fe0561b233..2eb3f90d4a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-18_22:41:41 +DATE: 2023-06-16_22:52:49 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.392536e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.158109e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.275734e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.883163e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.153438e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.268351e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.531194 sec - 2,261,044,639 cycles # 2.950 GHz - 3,130,191,732 instructions # 1.38 insn per cycle - 0.834655474 seconds time elapsed +TOTAL : 0.546765 sec + 2,277,271,038 cycles # 2.894 GHz + 2,877,502,583 instructions # 1.26 insn per cycle + 0.845241312 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 3.241686432649386e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = 
SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.870049e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.918386e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.918386e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.947985e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.010292e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.010292e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.713565 sec - 17,359,831,528 cycles # 3.036 GHz - 45,571,194,182 instructions # 2.63 insn per cycle - 5.720648230 seconds time elapsed +TOTAL : 5.490788 sec + 16,975,144,546 cycles # 3.089 GHz + 45,688,141,777 instructions # 2.69 insn per cycle + 5.496744556 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe @@ -89,19 +89,19 @@ Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.398273e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.571667e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.571667e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.550845e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.778152e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.778152e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.193762 sec - 9,453,328,575 cycles # 2.956 GHz - 26,269,627,906 instructions # 2.78 insn per cycle - 3.200386787 seconds time elapsed +TOTAL : 3.061992 sec + 9,396,473,786 cycles # 3.065 GHz + 26,383,157,357 instructions # 2.81 insn per cycle + 3.078372883 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2530) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe @@ -115,19 +115,19 @@ Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': 
AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.201835e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.775338e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.775338e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.169685e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.889662e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.889662e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.798271 sec - 5,177,674,203 cycles # 2.875 GHz - 11,076,621,451 instructions # 2.14 insn per cycle - 1.805172074 seconds time elapsed +TOTAL : 1.808628 sec + 5,256,027,628 cycles # 2.899 GHz + 11,191,741,528 instructions # 2.13 insn per cycle + 1.814736091 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.833335e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.536038e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.536038e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.822007e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.696291e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.696291e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.642692 sec - 4,730,582,091 cycles # 2.874 GHz - 10,512,103,131 instructions # 2.22 insn per cycle - 1.649610489 seconds time elapsed +TOTAL : 1.643994 sec + 4,833,892,867 cycles # 2.931 GHz + 10,629,174,604 instructions # 2.20 insn per cycle + 1.660177459 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2214) (512y: 86) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe @@ -167,20 +167,20 @@ Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.155965e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.407271e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.407271e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.215185e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 4.538766e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.538766e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.632441 sec - 5,100,977,173 cycles # 1.935 GHz - 6,869,699,327 instructions # 1.35 insn per cycle - 2.639569883 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1331) (512y: 99) (512z: 1680) +TOTAL : 2.596408 sec + 5,235,856,295 cycles # 2.013 GHz + 6,967,015,570 instructions # 1.33 insn per cycle + 2.609031673 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1322) (512y: 98) (512z: 1681) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index 1d55c3f17d..2922f1f91d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -35,22 +35,22 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-18_22:42:07 +DATE: 2023-06-16_22:53:16 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.591732e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.163988e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.278377e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.913817e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.163142e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.279616e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.525972 sec - 2,258,252,017 cycles # 2.941 GHz - 3,161,692,710 instructions # 1.40 insn per cycle - 0.831314902 seconds time elapsed +TOTAL : 0.546816 sec + 2,259,525,710 cycles # 2.872 GHz + 2,874,198,806 instructions # 1.27 insn per cycle + 0.845865418 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -63,19 +63,19 @@ Relative difference = 3.241686432649386e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow 
summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.918606e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.968345e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.968345e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.980085e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.044880e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.044880e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.572154 sec - 16,905,818,500 cycles # 3.032 GHz - 44,546,391,043 instructions # 2.63 insn per cycle - 5.579066388 seconds time elapsed +TOTAL : 5.404304 sec + 16,542,848,858 cycles # 3.058 GHz + 44,663,688,353 instructions # 2.70 insn per cycle + 5.410750967 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 574) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe @@ -89,20 +89,20 @@ Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.521745e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.706976e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.706976e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.631536e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.869634e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.869634e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.085524 sec - 9,099,594,779 cycles # 2.946 GHz - 24,902,805,370 instructions # 2.74 insn per cycle - 3.092491354 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2369) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.995465 sec + 8,977,674,629 cycles # 2.992 GHz + 25,016,671,404 instructions # 2.79 insn per cycle + 3.009020680 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2371) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -115,19 +115,19 @@ Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.391961e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.824695e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.824695e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.387090e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.916862e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.916862e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.052840 sec - 5,892,762,503 cycles # 2.865 GHz - 12,222,365,250 instructions # 2.07 insn per cycle - 2.059770395 seconds time elapsed +TOTAL : 2.054054 sec + 6,000,307,213 cycles # 2.915 GHz + 12,337,042,112 instructions # 2.06 insn per cycle + 2.066748623 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2523) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe @@ -141,19 +141,19 @@ Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.645945e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.116372e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.116372e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.658921e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.249521e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.249521e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.964121 sec - 5,617,009,330 cycles # 2.855 GHz - 11,751,786,889 instructions # 2.09 insn per cycle - 1.971325073 seconds time elapsed +TOTAL : 1.961008 sec + 5,732,861,626 cycles # 2.917 GHz + 11,869,224,559 instructions # 2.07 insn per cycle + 1.973908984 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2246) (512y: 242) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe @@ -167,19 +167,19 @@ Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.250862e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.513092e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.513092e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.318569e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.652438e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.652438e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.574723 sec - 5,071,782,594 cycles # 1.967 GHz - 7,819,966,245 instructions # 1.54 insn per cycle - 2.581825456 seconds time elapsed +TOTAL : 2.536785 sec + 5,170,015,216 cycles # 2.035 GHz + 7,935,725,128 instructions # 1.53 insn per cycle + 2.548762025 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1280) (512y: 203) (512z: 1763) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 5537ac3142..2e037d0b10 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-07-18_22:42:34 +DATE: 2023-06-16_22:53:43 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.503919e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.049667e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.066496e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.026215e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.052133e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.065201e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.467316 sec - 2,001,974,079 cycles # 2.930 GHz - 2,757,134,869 instructions # 1.38 insn per cycle - 0.752234585 seconds time elapsed +TOTAL : 0.487786 sec + 2,022,328,369 cycles # 2.846 GHz + 2,538,461,046 instructions # 1.26 insn per cycle + 0.768629495 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.082976e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.320902e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.335180e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.088515e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.317362e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.330192e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.608019 sec - 2,462,986,844 cycles # 2.901 GHz - 3,667,408,117 instructions # 1.49 insn per cycle - 0.906833656 seconds time elapsed +TOTAL : 0.622651 sec + 2,554,475,723 cycles # 2.940 GHz + 3,479,055,818 instructions # 1.36 insn per cycle + 0.928929641 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -76,19 +76,19 @@ Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.514851e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.527304e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.527304e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.659944e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.676456e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.676456e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.540428 sec - 19,799,689,712 cycles # 3.027 GHz - 59,004,193,140 instructions # 2.98 insn per cycle - 6.545917684 seconds time elapsed +TOTAL : 6.184495 sec + 19,272,486,733 cycles # 3.115 GHz + 59,041,427,699 instructions # 3.06 insn per cycle + 6.189768442 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1187) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe @@ -102,19 +102,19 @@ Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] 
[hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.780759e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.825135e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.825135e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.959664e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.017593e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.017593e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.452934 sec - 10,383,918,995 cycles # 3.006 GHz - 30,619,374,623 instructions # 2.95 insn per cycle - 3.457954159 seconds time elapsed +TOTAL : 3.326440 sec + 10,241,888,109 cycles # 3.075 GHz + 30,662,975,210 instructions # 2.99 insn per cycle + 3.331736567 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5158) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.012797e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.032407e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.032407e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.791918e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.003008e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.003008e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.642738 sec - 4,678,343,363 cycles # 2.846 GHz - 10,873,667,959 instructions # 2.32 insn per cycle - 1.647799642 seconds time elapsed +TOTAL : 1.697221 sec + 4,672,524,845 cycles # 2.747 GHz + 10,912,223,676 instructions # 2.34 insn per cycle + 1.702314818 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4166) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) 
= ( 1.153352e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.178336e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.178336e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.140088e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.171799e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.171799e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.446538 sec - 4,118,589,455 cycles # 2.844 GHz - 10,067,662,580 instructions # 2.44 insn per cycle - 1.452318936 seconds time elapsed +TOTAL : 1.461092 sec + 4,145,138,859 cycles # 2.829 GHz + 10,109,831,075 instructions # 2.44 insn per cycle + 1.466721195 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3967) (512y: 32) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.072222e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.195209e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.195209e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.316911e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.479148e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.479148e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.055995 sec - 3,852,949,281 cycles # 1.873 GHz - 5,509,316,607 instructions # 1.43 insn per cycle - 2.061113838 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 60) (512z: 3429) +TOTAL : 1.996366 sec + 3,886,762,322 cycles # 1.943 GHz + 5,545,319,779 instructions # 1.43 insn per cycle + 2.006628065 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1125) (512y: 59) (512z: 3431) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 3462aa965e..a475d7222b 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-07-18_23:24:15 +DATE: 2023-06-16_23:20:42 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.810503e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.141219e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.141219e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.496628e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.248177e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.248177e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.486993 sec - 2,066,943,312 cycles # 2.937 GHz - 3,000,534,395 instructions # 1.45 insn per cycle - 0.761751041 seconds time elapsed +TOTAL : 0.521129 sec + 2,135,210,830 cycles # 2.889 GHz + 2,829,931,118 instructions # 1.33 insn per cycle + 0.798733192 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -68,17 +68,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.682258e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.444835e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.444835e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.419279e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.564570e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.564570e+06 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.823911 sec - 3,195,140,467 cycles # 2.964 GHz - 4,939,449,955 instructions # 1.55 insn per cycle - 1.137578322 seconds time elapsed +TOTAL : 0.870649 sec + 3,346,648,541 cycles # 2.958 GHz + 4,943,935,079 instructions # 1.48 insn per cycle + 1.190299197 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -89,19 +89,19 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.512317e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.524834e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.524834e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.646125e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.663019e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.663019e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.551068 sec - 19,859,136,318 cycles # 3.030 GHz - 59,010,773,498 instructions # 2.97 insn per cycle - 6.555009301 seconds time elapsed +TOTAL : 6.222512 sec + 19,287,909,225 cycles # 3.099 GHz + 59,047,382,237 instructions # 3.06 insn per cycle + 6.228060526 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1187) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe @@ -116,19 +116,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.849323e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.895894e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.895894e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.998676e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.058773e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.058773e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.408439 sec - 10,416,721,509 cycles # 3.053 GHz - 30,668,561,647 instructions # 2.94 insn per cycle - 3.412905583 seconds time elapsed +TOTAL : 3.306706 sec + 10,251,708,193 cycles # 3.096 GHz + 30,706,584,426 instructions # 3.00 insn per cycle + 3.312153580 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5158) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe @@ -143,19 +143,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.011440e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.030823e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.030823e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.028551e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.054465e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054465e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.649546 sec - 4,709,324,732 cycles # 2.849 GHz - 10,921,961,578 instructions # 2.32 insn per cycle - 1.653928822 seconds time elapsed +TOTAL : 1.624375 sec + 4,708,780,636 cycles # 2.891 GHz + 10,966,076,818 instructions # 2.33 insn per cycle + 1.629709328 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4166) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe @@ -170,19 +170,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.150271e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.177125e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.177125e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.164974e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.197651e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.197651e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.453735 sec - 4,150,109,601 cycles # 2.849 GHz - 10,114,883,487 instructions # 2.44 insn per cycle - 1.457709150 seconds time elapsed +TOTAL : 1.437942 sec + 4,180,610,973 cycles # 2.898 GHz + 10,160,732,858 instructions # 2.43 insn per cycle + 1.452787766 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3967) (512y: 32) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe @@ -197,20 +197,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.115929e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.241926e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.241926e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.237498e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.399499e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.399499e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.048746 sec - 3,884,858,685 cycles # 1.895 GHz - 5,544,738,228 instructions # 1.43 insn per cycle - 2.052682001 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 60) (512z: 3429) +TOTAL : 2.022999 sec + 3,915,207,010 cycles # 1.933 GHz + 5,583,645,604 instructions # 1.43 insn per cycle + 2.033737208 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1125) (512y: 59) (512z: 3431) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 24c2bf3235..c7fa7a5874 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-07-18_22:43:02 +DATE: 2023-06-16_22:54:11 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.719998e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.041825e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.055822e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.406208e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.034522e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.051529e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.467133 sec - 1,982,839,200 cycles # 2.903 GHz - 2,732,083,404 instructions # 1.38 insn per cycle - 0.750043715 seconds time elapsed +TOTAL : 0.495761 sec + 2,045,430,788 cycles # 2.844 GHz + 2,567,034,130 instructions # 1.26 insn per cycle + 0.778564031 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.075933e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.309859e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.323648e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.078800e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.305420e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.318205e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.602656 sec - 2,465,959,222 cycles # 2.929 GHz - 3,643,124,057 instructions # 1.48 insn per cycle - 0.900913388 seconds time elapsed +TOTAL : 0.628664 sec + 2,557,626,902 cycles # 2.898 GHz + 3,428,444,966 instructions # 1.34 insn per cycle + 0.942725213 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -76,20 +76,20 @@ Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.480213e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.492782e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.492782e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.633830e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.650576e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.650576e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.631622 sec - 19,752,961,343 cycles # 2.979 GHz - 59,276,588,700 instructions # 3.00 insn per cycle - 6.637134779 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1312) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.246365 sec + 19,302,390,466 cycles # 3.090 GHz + 59,308,261,536 instructions # 3.07 insn per cycle + 6.251260293 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1309) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 4.46923023397472e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.862316e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.908955e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.908955e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.048737e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.110388e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.110388e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 3.395194 sec
- 10,280,079,176 cycles # 3.026 GHz
- 30,281,005,461 instructions # 2.95 insn per cycle
- 3.400628119 seconds time elapsed
+TOTAL : 3.268980 sec
+ 10,121,456,601 cycles # 3.094 GHz
+ 30,320,135,500 instructions # 3.00 insn per cycle
+ 3.274001858 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 5009) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe
@@ -128,19 +128,19 @@ Relative difference = 4.46923023397472e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.762243e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.944781e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.944781e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.955459e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.019425e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.019425e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.702839 sec
- 4,841,922,221 cycles # 2.840 GHz
- 11,278,605,221 instructions # 2.33 insn per cycle
- 1.708262409 seconds time elapsed
+TOTAL : 1.669916 sec
+ 4,868,753,430 cycles # 2.909 GHz
+ 11,322,372,755 instructions # 2.33 insn per cycle
+ 1.675375031 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4330) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 4.469241533230934e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.068844e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.090489e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.090489e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.093964e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.122647e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.122647e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.556503 sec
- 4,414,507,903 cycles # 2.832 GHz
- 10,506,180,954 instructions # 2.38 insn per cycle
- 1.561434477 seconds time elapsed
+TOTAL : 1.521327 sec
+ 4,446,518,647 cycles # 2.915 GHz
+ 10,548,968,003 instructions # 2.37 insn per cycle
+ 1.526307477 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4044) (512y: 186) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe
@@ -180,20 +180,20 @@ Relative difference = 4.469241533230934e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.947687e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.071742e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.071742e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.202771e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.370511e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.370511e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 2.087384 sec
- 3,884,235,117 cycles # 1.859 GHz
- 5,736,642,934 instructions # 1.48 insn per cycle
- 2.092378399 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1094) (512y: 110) (512z: 3505)
+TOTAL : 2.023574 sec
+ 3,918,082,536 cycles # 1.933 GHz
+ 5,775,352,534 instructions # 1.47 insn per cycle
+ 2.028589705 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1089) (512y: 110) (512z: 3505)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
index e528e70741..421224af27 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
@@ -35,60 +35,60 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-DATE: 2023-07-18_22:43:30
+DATE: 2023-06-16_22:54:40
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.443897e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.281940e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.377474e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.327176e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.257598e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.374537e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2
-TOTAL : 0.450086 sec
- 1,932,638,031 cycles # 2.910 GHz
- 2,644,542,305 instructions # 1.37 insn per cycle
- 0.736735331 seconds time elapsed
+TOTAL : 0.473310 sec
+ 2,027,915,544 cycles # 2.867 GHz
+ 2,463,775,432 instructions # 1.21 insn per cycle
+ 0.764839697 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 249
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.224449e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.399050e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.481223e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.391345e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.450691e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.517488e+07 ) sec^-1
 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2
-TOTAL : 0.496307 sec
- 2,113,797,836 cycles # 2.927 GHz
- 2,979,656,527 instructions # 1.41 insn per cycle
- 0.780892096 seconds time elapsed
+TOTAL : 0.522081 sec
+ 2,217,811,440 cycles # 2.881 GHz
+ 2,788,557,835 instructions # 1.26 insn per cycle
+ 0.827734782 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 1.412608e+00
-Avg ME (F77/CUDA) = 1.4132214346515752
-Relative difference = 0.00043425681546129636
+Avg ME (F77/CUDA) = 1.4132214343518683
+Relative difference = 0.0004342566032956241
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.558206e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.571081e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.571081e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.701153e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.715579e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.715579e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 6.427410 sec
- 19,447,033,884 cycles # 3.025 GHz
- 59,469,776,554 instructions # 3.06 insn per cycle
- 6.432547562 seconds time elapsed
+TOTAL : 6.095019 sec
+ 18,873,238,787 cycles # 3.098 GHz
+ 59,483,025,635 instructions # 3.15 insn per cycle
+ 6.100057991 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 970) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe
@@ -102,20 +102,20 @@ Relative difference = 2.1728426918172542e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.472251e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.617510e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.617510e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.884416e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.046889e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.046889e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 1.956307 sec
- 5,875,533,113 cycles # 3.002 GHz
- 16,505,988,555 instructions # 2.81 insn per cycle
- 1.961302040 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 5864) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 1.866765 sec
+ 5,710,913,630 cycles # 3.054 GHz
+ 16,521,962,319 instructions # 2.89 insn per cycle
+ 1.876809148 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 5863) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -128,19 +128,19 @@ Relative difference = 1.2948889545181803e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.982725e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.060793e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.060793e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.049407e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.136391e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.136391e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008858e+02 +- 5.002467e+01 ) GeV^-2
-TOTAL : 0.849896 sec
- 2,399,557,922 cycles # 2.819 GHz
- 5,764,751,975 instructions # 2.40 insn per cycle
- 0.854825256 seconds time elapsed
+TOTAL : 0.820791 sec
+ 2,394,884,327 cycles # 2.906 GHz
+ 5,781,261,143 instructions # 2.41 insn per cycle
+ 0.825770188 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 2.7390098302447566e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.206732e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.302653e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.302653e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.285711e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.389224e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.389224e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008858e+02 +- 5.002467e+01 ) GeV^-2
-TOTAL : 0.765407 sec
- 2,156,413,717 cycles # 2.812 GHz
- 5,334,290,181 instructions # 2.47 insn per cycle
- 0.772175970 seconds time elapsed
+TOTAL : 0.738019 sec
+ 2,162,282,356 cycles # 2.914 GHz
+ 5,351,223,903 instructions # 2.47 insn per cycle
+ 0.743188108 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4167) (512y: 25) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe
@@ -180,20 +180,20 @@ Relative difference = 2.7390098302447566e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.638617e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.692125e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.692125e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.684448e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.741867e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.741867e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 1.024345 sec
- 1,941,949,756 cycles # 1.893 GHz
- 3,005,992,347 instructions # 1.55 insn per cycle
- 1.029400462 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1425) (512y: 33) (512z: 3547)
+TOTAL : 0.995951 sec
+ 1,960,707,531 cycles # 1.961 GHz
+ 3,020,418,486 instructions # 1.54 insn per cycle
+ 1.001177596 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1416) (512y: 33) (512z: 3549)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt
index 9a33b358ad..cc4c5da246 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt
@@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-DATE: 2023-07-18_23:24:43
+DATE: 2023-06-16_23:21:10
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP=
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
 WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.944597e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.100237e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.100237e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.989407e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.193414e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.193414e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2
-TOTAL : 0.457752 sec
- 1,966,081,921 cycles # 2.939 GHz
- 2,842,862,075 instructions # 1.45 insn per cycle
- 0.725727150 seconds time elapsed
+TOTAL : 0.482951 sec
+ 2,013,685,658 cycles # 2.866 GHz
+ 2,592,200,009 instructions # 1.29 insn per cycle
+ 0.760014140 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -68,40 +68,40 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
 WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.725699e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.575204e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.575204e+07 ) sec^-1
-MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2
-TOTAL : 0.637779 sec
- 2,573,028,941 cycles # 2.957 GHz
- 3,873,609,391 instructions # 1.51 insn per cycle
- 0.927196086 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.724425e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.653929e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.653929e+07 ) sec^-1
+MeanMatrixElemValue = ( 6.737489e+02 +- 4.776370e+02 ) GeV^-2
+TOTAL : 0.663195 sec
+ 2,648,120,389 cycles # 2.929 GHz
+ 3,654,507,605 instructions # 1.38 insn per cycle
+ 0.962099488 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 1.412608e+00
-Avg ME (F77/CUDA) = 1.4132214346515752
-Relative difference = 0.00043425681546129636
+Avg ME (F77/CUDA) = 1.4132214343518683
+Relative difference = 0.0004342566032956241
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.572228e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.585304e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.585304e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.700458e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.715017e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.715017e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 6.394035 sec
- 19,472,549,387 cycles # 3.044 GHz
- 59,473,909,839 instructions # 3.05 insn per cycle
- 6.398143881 seconds time elapsed
+TOTAL : 6.092117 sec
+ 18,886,470,035 cycles # 3.099 GHz
+ 59,487,095,856 instructions # 3.15 insn per cycle
+ 6.096869282 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 970) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe
@@ -116,20 +116,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.584034e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.734732e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.734732e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.823507e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.985125e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.985125e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 1.933468 sec
- 5,894,952,061 cycles # 3.045 GHz
- 16,554,069,417 instructions # 2.81 insn per cycle
- 1.937631603 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 5864) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 1.882730 sec
+ 5,728,750,294 cycles # 3.037 GHz
+ 16,570,856,852 instructions # 2.89 insn per cycle
+ 1.893324475 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 5863) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -143,19 +143,19 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.001540e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.081012e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.081012e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.034411e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.118355e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.118355e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008858e+02 +- 5.002467e+01 ) GeV^-2
-TOTAL : 0.843109 sec
- 2,417,753,828 cycles # 2.857 GHz
- 5,800,856,711 instructions # 2.40 insn per cycle
- 0.846901747 seconds time elapsed
+TOTAL : 0.830571 sec
+ 2,411,689,968 cycles # 2.891 GHz
+ 5,818,447,773 instructions # 2.41 insn per cycle
+ 0.835666053 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -170,19 +170,19 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.196336e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.291341e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.291341e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.276703e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.381223e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.381223e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008858e+02 +- 5.002467e+01 ) GeV^-2
-TOTAL : 0.771255 sec
- 2,177,354,153 cycles # 2.811 GHz
- 5,370,873,032 instructions # 2.47 insn per cycle
- 0.775290977 seconds time elapsed
+TOTAL : 0.750194 sec
+ 2,182,144,900 cycles # 2.906 GHz
+ 5,388,939,388 instructions # 2.47 insn per cycle
+ 0.755052239 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4167) (512y: 25) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe
@@ -197,20 +197,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.589791e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.641495e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.641495e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.697532e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.759499e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.759499e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 1.058213 sec
- 1,967,102,783 cycles # 1.854 GHz
- 3,047,015,939 instructions # 1.55 insn per cycle
- 1.062417559 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1425) (512y: 33) (512z: 3547)
+TOTAL : 0.992848 sec
+ 1,971,737,093 cycles # 1.979 GHz
+ 3,062,300,048 instructions # 1.55 insn per cycle
+ 0.997634050 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1416) (512y: 33) (512z: 3549)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt
index b9428218ff..adf6bfc552 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt
@@ -35,60 +35,60 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-DATE: 2023-07-18_22:43:53
+DATE: 2023-06-16_22:55:04
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.428863e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.263490e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.366244e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.339749e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.271976e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.390547e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2
-TOTAL : 0.447829 sec
- 1,946,982,573 cycles # 2.935 GHz
- 2,680,932,348 instructions # 1.38 insn per cycle
- 0.728844364 seconds time elapsed
+TOTAL : 0.474512 sec
+ 1,999,862,138 cycles # 2.850 GHz
+ 2,440,400,706 instructions # 1.22 insn per cycle
+ 0.759138369 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 248
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.216441e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.386159e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.466691e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.328864e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.325114e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.388368e+07 ) sec^-1
 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2
-TOTAL : 0.495369 sec
- 2,121,287,530 cycles # 2.937 GHz
- 2,989,151,728 instructions # 1.41 insn per cycle
- 0.780758288 seconds time elapsed
+TOTAL : 0.521920 sec
+ 2,232,068,560 cycles # 2.897 GHz
+ 2,825,671,567 instructions # 1.27 insn per cycle
+ 0.828183210 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 1.412608e+00
-Avg ME (F77/CUDA) = 1.4132214346515752
-Relative difference = 0.00043425681546129636
+Avg ME (F77/CUDA) = 1.4132214343518683
+Relative difference = 0.0004342566032956241
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.561997e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.574840e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.574840e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.699454e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.714152e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.714152e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 6.417399 sec
- 19,389,104,365 cycles # 3.020 GHz
- 59,233,313,389 instructions # 3.05 insn per cycle
- 6.422046826 seconds time elapsed
+TOTAL : 6.092288 sec
+ 18,813,707,143 cycles # 3.087 GHz
+ 59,245,916,196 instructions # 3.15 insn per cycle
+ 6.097217914 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1031) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe
@@ -102,20 +102,20 @@ Relative difference = 2.1728426918172542e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.882489e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.043796e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.043796e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.395747e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.576744e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.576744e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 1.866886 sec
- 5,621,059,731 cycles # 3.009 GHz
- 16,304,017,213 instructions # 2.90 insn per cycle
- 1.872024197 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 5639) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 1.764298 sec
+ 5,437,573,561 cycles # 3.075 GHz
+ 16,318,666,941 instructions # 3.00 insn per cycle
+ 1.774906001 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 5638) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
@@ -128,19 +128,19 @@ Relative difference = 1.2948889545181803e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.718948e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.776130e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.776130e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.771528e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.833956e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.833956e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008858e+02 +- 5.002467e+01 ) GeV^-2
-TOTAL : 0.975130 sec
- 2,776,282,140 cycles # 2.840 GHz
- 6,328,900,147 instructions # 2.28 insn per cycle
- 0.980407362 seconds time elapsed
+TOTAL : 0.946284 sec
+ 2,773,608,102 cycles # 2.919 GHz
+ 6,345,516,906 instructions # 2.29 insn per cycle
+ 0.951331609 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5044) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 2.7390098302447566e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.858619e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.926048e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.926048e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.898207e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.970096e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.970096e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008858e+02 +- 5.002467e+01 ) GeV^-2
-TOTAL : 0.902998 sec
- 2,569,021,706 cycles # 2.837 GHz
- 5,882,455,801 instructions # 2.29 insn per cycle
- 0.908262650 seconds time elapsed
+TOTAL : 0.884490 sec
+ 2,572,468,156 cycles # 2.896 GHz
+ 5,899,227,322 instructions # 2.29 insn per cycle
+ 0.889316146 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4834) (512y: 18) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe
@@ -180,20 +180,20 @@ Relative difference = 2.7390098302447566e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.446292e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.489559e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.489559e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.504005e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.551206e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.551206e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 1.157234 sec
- 2,094,475,571 cycles # 1.806 GHz
- 3,307,191,778 instructions # 1.58 insn per cycle
- 1.161991638 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1706) (512y: 31) (512z: 3743)
+TOTAL : 1.114511 sec
+ 2,105,248,250 cycles # 1.885 GHz
+ 3,318,671,370 instructions # 1.58 insn per cycle
+ 1.119425864 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1702) (512y: 31) (512z: 3743)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
index c7a57bbaf7..509a151f9e 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
@@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-DATE: 2023-07-18_22:44:17
+DATE: 2023-06-16_22:55:28
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.751950e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.049189e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.063151e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.484913e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.046842e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.063089e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 0.465247 sec
- 1,992,296,691 cycles # 2.929 GHz
- 2,742,134,661 instructions # 1.38 insn per cycle
- 0.745964144 seconds time elapsed
+TOTAL : 0.491268 sec
+ 2,111,124,427 cycles # 2.897 GHz
+ 2,572,779,858 instructions # 1.22 insn per cycle
+ 0.786655156 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.080664e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.316749e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.330839e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.087869e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.318151e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.331093e+07 ) sec^-1
 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2
-TOTAL : 0.609902 sec
- 2,480,338,961 cycles # 2.929 GHz
- 3,630,300,542 instructions # 1.46 insn per cycle
- 0.907777782 seconds time elapsed
+TOTAL : 0.625996 sec
+ 2,572,645,106 cycles # 2.943 GHz
+ 3,495,847,613 instructions # 1.36 insn per cycle
+ 0.932270031 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
@@ -76,19 +76,19 @@ Relative difference = 4.418889885423659e-07
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.473267e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.485439e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.485439e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.611473e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.627816e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.627816e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 6.649315 sec
- 20,090,384,636 cycles # 3.020 GHz
- 60,091,853,129 instructions # 2.99 insn per cycle
- 6.654276724 seconds time elapsed
+TOTAL : 6.298063 sec
+ 19,584,640,453 cycles # 3.108 GHz
+ 60,128,609,602 instructions # 3.07 insn per cycle
+ 6.303425917 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1222) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe
@@ -102,20 +102,20 @@ Relative difference = 4.345647726386255e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.877837e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.923415e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.923415e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.848908e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.905341e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.905341e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 3.383033 sec
- 10,288,712,274 cycles # 3.041 GHz
- 30,361,296,631 instructions # 2.95 insn per cycle
- 3.388432916 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 5291) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 3.403128 sec
+ 10,110,840,700 cycles # 2.969 GHz
+ 30,404,987,403 instructions # 3.01 insn per cycle
+ 3.408194423 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 5293) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -128,20 +128,20 @@ Relative difference = 4.392710025734405e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.026505e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.047006e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.047006e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.045472e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.071998e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.071998e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.621347 sec
- 4,623,658,846 cycles # 2.848 GHz
- 10,831,015,634 instructions # 2.34 insn per cycle
- 1.626661019 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4302) (512y: 0) (512z: 0)
+TOTAL : 1.590737 sec
+ 4,630,470,943 cycles # 2.904 GHz
+ 10,870,600,906 instructions # 2.35 insn per cycle
+ 1.595619026 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4303) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -154,20 +154,20 @@ Relative difference = 4.5288254008796884e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.168519e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.194583e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.194583e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.186438e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.219581e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.219581e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.426741 sec
- 4,058,981,984 cycles # 2.842 GHz
- 10,018,453,229 instructions # 2.47 insn per cycle
- 1.432132556 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4103) (512y: 24) (512z: 0)
+TOTAL : 1.404412 sec
+ 4,081,268,261 cycles # 2.899 GHz
+ 10,057,889,430 instructions # 2.46 insn per cycle
+ 1.409450424 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4105) (512y: 24) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -180,20 +180,20 @@ Relative difference = 4.5288254008796884e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.857979e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.975503e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.975503e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.045191e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.199654e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.199654e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 2.112045 sec
- 3,967,011,902 cycles # 1.877 GHz
- 5,714,986,051 instructions # 1.44 insn per cycle
- 2.117098871 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1695) (512y: 82) (512z: 3505)
+TOTAL : 2.069671 sec
+ 3,996,882,474 cycles # 1.933 GHz
+ 5,753,481,101 instructions # 1.44 insn per cycle
+ 2.074584748 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1687) (512y: 81) (512z: 3506)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt
index c4d3ede309..ccf71ae338 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt
@@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-DATE: 2023-07-18_22:44:45
+DATE: 2023-06-16_22:55:56
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.739531e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.045718e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.060032e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.431385e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.039088e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.054905e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 0.463123 sec
- 2,035,412,609 cycles # 2.951 GHz
- 2,826,905,066 instructions # 1.39 insn per cycle
- 0.757115522 seconds time elapsed
+TOTAL : 0.491602 sec
+ 2,104,753,968 cycles # 2.908 GHz
+ 2,609,636,072 instructions # 1.24 insn per cycle
+ 0.781468608 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.072296e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.305322e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.318939e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.077122e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.304284e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.317280e+07 ) sec^-1
 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2
-TOTAL : 0.601037 sec
- 2,472,372,598 cycles # 2.942 GHz
- 3,637,776,581 instructions # 1.47 insn per cycle
- 0.899651920 seconds time elapsed
+TOTAL : 0.629159 sec
+ 2,584,689,149 cycles # 2.926 GHz
+ 3,485,232,370 instructions # 1.35 insn per cycle
+ 0.945332117 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
@@ -76,19 +76,19 @@ Relative difference = 4.418889885423659e-07
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.498871e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.510973e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.510973e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.592801e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.608815e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.608815e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 6.581447 sec
- 20,078,816,266 cycles # 3.050 GHz
- 60,290,196,754 instructions # 3.00 insn per cycle
- 6.586924347 seconds time elapsed
+TOTAL : 6.344781 sec
+ 19,563,789,540 cycles # 3.082 GHz
+ 60,327,298,777 instructions # 3.08 insn per cycle
+ 6.350004821 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1269) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe
@@ -102,19 +102,19 @@ Relative difference = 4.345647726386255e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.899627e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.946616e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.946616e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.907260e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.965434e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.965434e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 3.368086 sec
- 10,135,128,900 cycles # 3.007 GHz
- 30,027,179,365 instructions # 2.96 insn per cycle
- 3.373489630 seconds time elapsed
+TOTAL : 3.362361 sec
+ 9,992,104,885 cycles # 2.970 GHz
+ 30,065,057,144 instructions # 3.01 insn per cycle
+ 3.367365912 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 5113) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe
@@ -128,20 +128,20 @@ Relative difference = 4.392710025734405e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.789146e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.969687e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.969687e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.851728e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.008826e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.008826e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.697227 sec
- 4,841,194,253 cycles # 2.849 GHz
- 11,245,209,770 instructions # 2.32 insn per cycle
- 1.702258703 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4443) (512y: 0) (512z: 0)
+TOTAL : 1.687332 sec
+ 4,858,147,798 cycles # 2.873 GHz
+ 11,292,265,821 instructions # 2.32 insn per cycle
+ 1.692863069 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4448) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
@@ -154,20 +154,20 @@ Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.070485e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.092848e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.092848e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.079486e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.107998e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107998e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.555422 sec - 4,392,434,745 cycles # 2.821 GHz - 10,466,303,892 instructions # 2.38 insn per cycle - 1.560659362 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4149) (512y: 177) (512z: 0) +TOTAL : 1.542428 sec + 4,416,137,343 cycles # 2.857 GHz + 10,507,517,652 instructions # 2.38 insn per cycle + 1.556019289 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4154) (512y: 177) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.842072e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.959105e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.959105e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.082610e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.236339e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.236339e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.115498 sec - 3,984,090,326 cycles # 1.881 GHz - 5,904,512,729 instructions # 1.48 insn per cycle - 2.120336902 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1629) (512y: 130) (512z: 3558) +TOTAL : 2.052702 sec + 4,019,238,803 cycles # 1.954 GHz + 5,946,674,764 instructions # 1.48 insn per cycle + 2.063384894 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1626) (512y: 130) (512z: 3560) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 29f62f0d17..674d696ef5 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-18_22:45:13 +DATE: 2023-06-16_22:56:24 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.486362e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.515101e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.517375e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.480605e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.516689e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.519182e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.528132 sec - 2,245,590,903 cycles # 2.913 GHz - 3,327,857,187 instructions # 1.48 insn per cycle - 0.837844369 seconds time elapsed +TOTAL : 0.551619 sec + 2,292,239,292 cycles # 2.899 GHz + 3,148,001,142 instructions # 1.37 insn per cycle + 0.847995580 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.148048e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.182455e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.183874e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.150203e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.184079e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.185408e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.046993 sec - 9,895,463,238 cycles # 2.997 GHz - 20,573,508,118 instructions # 2.08 insn per cycle - 3.361731000 seconds time elapsed +TOTAL : 3.060087 sec + 10,127,877,925 cycles # 3.044 GHz + 22,666,867,696 instructions # 2.24 insn per cycle + 3.384890862 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -76,20 +76,20 @@ Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.927904e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.928903e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.928903e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.003968e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.005255e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.005255e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.518600 sec - 25,868,819,765 cycles # 3.036 GHz - 78,716,411,819 instructions # 3.04 insn per cycle - 8.523451145 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4798) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.196590 sec + 25,354,902,918 cycles # 3.092 GHz + 78,729,043,359 instructions # 3.11 insn per cycle + 8.201758404 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4807) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.642763e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.646140e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.646140e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.668144e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.672261e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.672261e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.514266 sec - 13,094,581,918 cycles # 2.900 GHz - 39,233,207,168 instructions # 3.00 insn per cycle - 4.520265687 seconds time elapsed +TOTAL : 4.481440 sec + 12,963,646,112 cycles # 2.890 GHz + 39,242,921,927 instructions # 3.03 insn per cycle + 4.486873061 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13100) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.470283e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.487708e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.487708e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.660201e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.683942e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.683942e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.948046 sec - 5,551,646,863 cycles # 2.851 GHz - 13,814,806,208 instructions # 2.49 insn per cycle - 1.953490189 seconds time elapsed +TOTAL : 1.904647 sec + 5,529,577,984 cycles # 2.897 GHz + 13,826,051,095 instructions # 2.50 insn per cycle + 1.909890252 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10973) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.621230e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.644526e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.644526e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.795604e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.827220e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.827220e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.716666 sec - 4,883,495,439 cycles # 2.843 GHz - 12,459,263,271 instructions # 2.55 insn per cycle - 1.721976811 seconds time elapsed +TOTAL : 1.693130 sec + 4,892,778,969 cycles # 2.891 GHz + 12,473,829,555 instructions # 2.55 insn per cycle + 1.698091935 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10670) (512y: 29) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.530563e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.545549e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.545549e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.725507e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.744232e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.744232e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.191833 sec - 4,072,863,886 cycles # 1.858 GHz - 6,352,171,216 instructions # 1.56 insn per cycle - 2.197154031 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1400) (512y: 67) (512z: 9966) +TOTAL : 2.133997 sec + 4,079,803,226 cycles # 1.909 GHz + 6,360,421,828 instructions # 1.56 insn per cycle + 2.139161289 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1390) (512y: 66) (512z: 9967) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index 3c6a1dc7e6..f6b028fd57 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-18_23:25:38 +DATE: 2023-06-16_23:22:07 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.120653e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.456740e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.456740e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.072253e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.474976e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.474976e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.513909 sec - 2,200,002,847 cycles # 2.961 GHz - 3,401,705,219 instructions # 1.55 insn per cycle - 0.803499105 seconds time elapsed +TOTAL : 0.540694 sec + 2,277,802,180 cycles # 2.911 GHz + 3,167,166,199 instructions # 1.39 insn per cycle + 0.842923456 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -68,17 +68,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.637838e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.119387e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.119387e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.563283e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.119139e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.119139e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.312176 sec - 10,816,030,086 cycles # 3.007 GHz - 25,033,915,585 instructions # 2.31 insn per cycle - 3.656175148 seconds time elapsed +TOTAL : 3.373561 sec + 11,106,851,106 cycles # 3.040 GHz + 22,871,021,639 instructions # 2.06 insn per cycle + 3.711939491 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -89,20 +89,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.930072e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.931007e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.931007e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.000691e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.001917e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.001917e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.510916 sec - 25,852,391,140 cycles # 3.037 GHz - 78,719,532,972 instructions # 3.04 insn per cycle - 8.514893856 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4798) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.213921 sec + 25,358,245,526 cycles # 3.087 GHz + 78,737,811,439 instructions # 3.11 insn per cycle + 8.219246769 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4807) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -116,19 +116,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.640619e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.644045e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.644045e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.672231e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.676605e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.676605e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.518890 sec - 13,104,499,016 cycles # 2.899 GHz - 39,245,795,332 instructions # 2.99 insn per cycle - 4.522830732 seconds time elapsed +TOTAL : 4.487084 sec + 13,003,690,708 cycles # 2.899 GHz + 39,263,649,353 instructions # 3.02 insn per cycle + 4.492556901 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13100) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -143,19 +143,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.082245e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.099128e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.099128e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.646145e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.670478e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.670478e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.042668 sec - 5,583,209,757 cycles # 2.729 GHz - 13,824,560,946 instructions # 2.48 insn per cycle - 2.047130706 seconds time elapsed +TOTAL : 1.912940 sec + 5,540,461,476 cycles # 2.890 GHz + 13,840,907,410 instructions # 2.50 insn per cycle + 1.918355435 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10973) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -170,19 +170,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.516477e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.539576e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.539576e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.799344e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.831037e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.831037e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.737653 sec - 4,899,889,298 cycles # 2.815 GHz - 12,471,583,278 instructions # 2.55 insn per cycle - 1.741981812 seconds time elapsed +TOTAL : 1.689800 sec + 4,910,066,879 cycles # 2.898 GHz + 12,486,034,623 instructions # 2.54 insn per cycle + 1.695175100 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10670) (512y: 29) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -197,20 +197,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.491439e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.505589e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.505589e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.596903e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.616542e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.616542e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.203756 sec - 4,082,065,510 cycles # 1.852 GHz - 6,360,646,741 instructions # 1.56 insn per cycle - 2.207995337 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1400) (512y: 67) (512z: 9966) +TOTAL : 2.177343 sec + 4,099,033,272 cycles # 1.880 GHz + 6,376,304,361 instructions # 1.56 insn per cycle + 2.182949238 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1390) (512y: 66) (512z: 9967) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index 7086cd2ade..92695858f2 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-18_23:36:13 +DATE: 2023-06-16_23:33:00 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.490681e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.519793e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.521970e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.499222e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.525335e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.527608e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.510029 sec - 2,174,253,409 cycles # 2.909 GHz - 3,341,213,467 instructions # 1.54 insn per cycle - 0.806876610 seconds time elapsed +TOTAL : 0.532774 sec + 2,217,572,873 cycles # 2.863 GHz + 3,094,369,354 instructions # 1.40 insn per cycle + 0.834887472 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.140176e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.174234e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.175704e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.135027e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.167071e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.168390e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.137252 sec - 10,160,608,325 cycles # 2.996 GHz - 23,752,172,038 instructions # 2.34 insn per cycle - 3.448867860 seconds time elapsed +TOTAL : 3.158999 sec + 10,380,525,451 cycles # 3.035 GHz + 21,782,548,801 instructions # 2.10 insn per cycle + 3.477627040 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -76,20 +76,20 @@ Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.925141e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.926080e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.926080e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.007018e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.008318e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.008318e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.531262 sec - 25,844,397,955 cycles # 3.029 GHz - 78,714,919,982 instructions # 3.05 insn per cycle - 8.535249295 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4798) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.184199 sec + 25,337,258,294 cycles # 3.096 GHz + 78,731,832,013 instructions # 3.11 insn per cycle + 8.189083155 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4807) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.624915e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.628369e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.628369e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.689365e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.693788e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.693788e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.535491 sec - 13,090,983,779 cycles # 2.885 GHz - 39,233,707,851 instructions # 3.00 insn per cycle - 4.539286898 seconds time elapsed +TOTAL : 4.457849 sec + 12,962,355,153 cycles # 2.906 GHz + 39,244,481,809 instructions # 3.03 insn per cycle + 4.462633637 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13100) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.408812e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.426603e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.426603e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.621598e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.647376e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.647376e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.962726 sec - 5,556,000,216 cycles # 2.827 GHz - 13,815,069,994 instructions # 2.49 insn per cycle - 1.966529301 seconds time elapsed +TOTAL : 1.914532 sec + 5,534,212,685 cycles # 2.886 GHz + 13,825,715,526 instructions # 2.50 insn per cycle + 1.920377680 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10973) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.569095e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.592740e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.592740e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.758885e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.789531e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.789531e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.725721 sec - 4,883,644,112 cycles # 2.825 GHz - 12,458,591,116 instructions # 2.55 insn per cycle - 1.729494553 seconds time elapsed +TOTAL : 1.692773 sec + 4,894,078,000 cycles # 2.886 GHz + 12,469,587,925 instructions # 2.55 insn per cycle + 1.697826655 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10670) (512y: 29) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.452812e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.467478e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.467478e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.808849e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.828810e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.828810e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.212487 sec - 4,077,406,536 cycles # 1.841 GHz - 6,350,765,529 instructions # 1.56 insn per cycle - 2.216371467 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1400) (512y: 67) (512z: 9966) +TOTAL : 2.112703 sec + 4,076,094,887 cycles # 1.926 GHz + 6,358,678,796 instructions # 1.56 insn per cycle + 2.117704881 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1390) (512y: 66) (512z: 9967) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 87192ccb2c..0fd3a41abe 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-18_23:33:15 +DATE: 2023-06-16_23:29:57 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.502754e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.531694e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.534771e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.491741e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.517860e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.520006e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.505727 sec - 2,155,415,909 cycles # 2.917 GHz - 3,312,395,454 instructions # 1.54 insn per cycle - 0.798203731 seconds time elapsed +TOTAL : 0.531429 sec + 2,232,387,650 cycles # 2.917 GHz + 3,085,215,012 instructions # 1.38 insn per cycle + 0.827733891 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.124123e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.157827e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.159261e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.139492e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.171728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.173076e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.086041 sec - 10,033,097,378 cycles # 3.006 GHz - 20,725,109,527 instructions # 2.07 insn per cycle - 3.397473355 seconds time elapsed +TOTAL : 3.104917 sec + 10,133,121,889 cycles # 3.000 GHz + 23,296,776,590 instructions # 2.30 insn per cycle + 3.434357003 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -76,20 +76,20 @@ Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.922814e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.923784e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.923784e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.009220e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.010486e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.010486e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.540121 sec - 25,892,455,253 cycles # 3.032 GHz - 78,716,335,922 instructions # 3.04 insn per cycle - 8.543882564 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4798) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.175111 sec + 25,293,924,967 cycles # 3.094 GHz + 78,729,597,256 instructions # 3.11 insn per cycle + 8.179990802 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4807) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.627481e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.630749e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.630749e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.697722e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.701921e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.701921e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.530972 sec - 13,095,277,525 cycles # 2.888 GHz - 39,233,630,028 instructions # 3.00 insn per cycle - 4.534902471 seconds time elapsed +TOTAL : 4.446354 sec + 12,955,336,489 cycles # 2.913 GHz + 39,244,878,043 instructions # 3.03 insn per cycle + 4.451190139 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13100) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.436913e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.453832e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.453832e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.711957e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.735812e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.735812e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.953456 sec - 5,549,840,287 cycles # 2.837 GHz - 13,813,947,073 instructions # 2.49 insn per cycle - 1.957552217 seconds time elapsed +TOTAL : 1.893317 sec + 5,534,953,514 cycles # 2.918 GHz + 13,828,055,257 instructions # 2.50 insn per cycle + 1.898185319 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10973) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.557004e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.579620e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.579620e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.783880e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.814289e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.814289e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.726397 sec - 4,883,869,602 cycles # 2.824 GHz - 12,460,671,114 instructions # 2.55 insn per cycle - 1.730241315 seconds time elapsed +TOTAL : 1.686497 sec + 4,890,139,976 cycles # 2.893 GHz + 12,471,298,231 instructions # 2.55 insn per cycle + 1.691636381 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10670) (512y: 29) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.511323e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.525628e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.525628e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.791142e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.810133e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.810133e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.193124 sec - 4,069,613,051 cycles # 1.854 GHz - 6,350,163,242 instructions # 1.56 insn per cycle - 2.197268123 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1400) (512y: 67) (512z: 9966) +TOTAL : 2.116611 sec + 4,073,012,903 cycles # 1.922 GHz + 6,360,730,501 instructions # 1.56 insn per cycle + 2.121591447 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1390) (512y: 66) (512z: 9967) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index abafa479c3..a4f7f78bc7 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -35,23 +35,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-18_23:30:22 +DATE: 2023-06-16_23:26:59 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.239780e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.542155e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.544417e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.163660e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.517064e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.519281e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.509580 sec - 2,192,454,861 cycles # 2.954 GHz - 3,384,298,240 instructions # 1.54 insn per cycle - 0.802945215 seconds time elapsed +TOTAL : 0.535969 sec + 2,257,409,350 cycles # 2.913 GHz + 3,148,870,456 instructions # 1.39 insn per cycle + 0.835585235 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -59,17 +59,17 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= WARNING! 
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.729630e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.165698e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.167125e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.637869e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.154701e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.156020e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.209061 sec
- 10,443,839,106 cycles # 3.019 GHz
- 24,645,837,453 instructions # 2.36 insn per cycle
- 3.518663731 seconds time elapsed
+TOTAL : 3.270916 sec
+ 10,746,580,301 cycles # 3.041 GHz
+ 21,720,295,313 instructions # 2.02 insn per cycle
+ 3.590840027 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -79,20 +79,20 @@ Relative difference = 2.837296512218831e-07
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.938188e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.939108e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.939108e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.007137e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.008416e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.008416e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.471313 sec
- 25,846,239,910 cycles # 3.050 GHz
- 78,714,529,048 instructions # 3.05 insn per cycle
- 8.475445839 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 4798) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 8.182065 sec
+ 25,337,840,293 cycles # 3.096 GHz
+ 78,726,782,076 instructions # 3.11 insn per cycle
+ 8.187223661 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 4807) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -105,19 +105,19 @@ Relative difference = 2.8372990776517314e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.654348e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.657703e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.657703e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.676580e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.680802e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.680802e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.496959 sec
- 13,076,441,166 cycles # 2.906 GHz
- 39,231,499,959 instructions # 3.00 insn per cycle
- 4.501151358 seconds time elapsed
+TOTAL : 4.471527 sec
+ 12,958,437,919 cycles # 2.896 GHz
+ 39,245,010,758 instructions # 3.03 insn per cycle
+ 4.476404809 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13100) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
@@ -131,19 +131,19 @@ Relative difference = 2.837299079287849e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.406339e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.423842e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.423842e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.510079e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.532022e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.532022e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.960439 sec
- 5,550,772,902 cycles # 2.827 GHz
- 13,813,947,600 instructions # 2.49 insn per cycle
- 1.964408897 seconds time elapsed
+TOTAL : 1.938058 sec
+ 5,528,373,338 cycles # 2.847 GHz
+ 13,827,020,883 instructions # 2.50 insn per cycle
+ 1.943269358 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10973) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
@@ -157,19 +157,19 @@ Relative difference = 2.837296634927675e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.481900e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.504023e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.504023e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.746577e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.776421e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.776421e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.739600 sec
- 4,888,239,642 cycles # 2.805 GHz
- 12,461,742,004 instructions # 2.55 insn per cycle
- 1.743840420 seconds time elapsed
+TOTAL : 1.693197 sec
+ 4,886,719,743 cycles # 2.881 GHz
+ 12,471,368,394 instructions # 2.55 insn per cycle
+ 1.698247712 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10670) (512y: 29) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
@@ -183,20 +183,20 @@ Relative difference = 2.837296634927675e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.571336e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.585205e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.585205e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.653939e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.671885e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.671885e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.175833 sec
- 4,068,802,627 cycles # 1.867 GHz
- 6,350,264,991 instructions # 1.56 insn per cycle
- 2.179973296 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1400) (512y: 67) (512z: 9966)
+TOTAL : 2.156844 sec
+ 4,112,325,475 cycles # 1.906 GHz
+ 6,362,849,586 instructions # 1.55 insn per cycle
+ 2.161761478 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1390) (512y: 66) (512z: 9967)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
index dbcf951307..f6164c5cd9 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
@@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-07-18_22:45:49
+DATE: 2023-06-16_22:57:00
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.484189e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.512941e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.515884e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.505261e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.540833e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.543305e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.525119 sec
- 2,277,820,090 cycles # 2.954 GHz
- 3,332,750,506 instructions # 1.46 insn per cycle
- 0.836036844 seconds time elapsed
+TOTAL : 0.552999 sec
+ 2,261,605,886 cycles # 2.864 GHz
+ 3,081,160,015 instructions # 1.36 insn per cycle
+ 0.848098050 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.142184e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.176352e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.177740e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.144541e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.178169e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.179509e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.035367 sec
- 9,924,089,083 cycles # 3.018 GHz
- 23,233,623,044 instructions # 2.34 insn per cycle
- 3.348425246 seconds time elapsed
+TOTAL : 3.060795 sec
+ 10,085,184,977 cycles # 3.025 GHz
+ 22,493,985,254 instructions # 2.23 insn per cycle
+ 3.393105734 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
@@ -76,20 +76,20 @@ Relative difference = 2.837296512218831e-07
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.932718e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.933696e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.933696e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.940758e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.941897e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.941897e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.497912 sec
- 25,777,112,944 cycles # 3.033 GHz
- 78,466,665,546 instructions # 3.04 insn per cycle
- 8.502791973 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 4170) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 8.462419 sec
+ 25,229,958,045 cycles # 2.982 GHz
+ 78,471,131,282 instructions # 3.11 insn per cycle
+ 8.467976850 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 4138) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
@@ -102,20 +102,20 @@ Relative difference = 2.8372990776517314e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.597924e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.601066e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.601066e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.727132e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.731470e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.731470e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.570165 sec
- 13,100,547,026 cycles # 2.867 GHz
- 39,170,316,646 instructions # 2.99 insn per cycle
- 4.575595642 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:12885) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.410452 sec
+ 12,894,155,265 cycles # 2.921 GHz
+ 39,184,570,946 instructions # 3.04 insn per cycle
+ 4.415732132 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:12872) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
@@ -128,19 +128,19 @@ Relative difference = 2.837299079287849e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.390403e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.407788e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.407788e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.691843e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.716729e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.716729e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.967187 sec
- 5,567,550,322 cycles # 2.829 GHz
- 13,905,495,574 instructions # 2.50 insn per cycle
- 1.972227716 seconds time elapsed
+TOTAL : 1.898008 sec
+ 5,540,010,361 cycles # 2.913 GHz
+ 13,917,256,443 instructions # 2.51 insn per cycle
+ 1.903063407 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11079) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 2.837296634927675e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.298393e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.319835e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.319835e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.729941e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.760433e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.760433e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.776189 sec
- 4,927,895,040 cycles # 2.773 GHz
- 12,557,484,324 instructions # 2.55 insn per cycle
- 1.781515054 seconds time elapsed
+TOTAL : 1.696750 sec
+ 4,936,264,808 cycles # 2.903 GHz
+ 12,569,278,770 instructions # 2.55 insn per cycle
+ 1.701843841 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10689) (512y: 180) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe
@@ -180,20 +180,20 @@ Relative difference = 2.837296634927675e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.516389e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.530329e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.530329e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.790703e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.810947e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.810947e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.195638 sec
- 4,074,235,200 cycles # 1.856 GHz
- 6,445,069,906 instructions # 1.58 insn per cycle
- 2.200826665 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1302) (512y: 170) (512z:10055)
+TOTAL : 2.116619 sec
+ 4,079,630,480 cycles # 1.924 GHz
+ 6,456,227,558 instructions # 1.58 insn per cycle
+ 2.121736786 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1301) (512y: 170) (512z:10055)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt
index 7f1889b172..19fe3b6889 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt
@@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-07-18_23:15:54
+DATE: 2023-06-16_23:12:02
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.251649e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.277334e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.279317e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.228389e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.252132e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.253939e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.536329 sec
- 2,253,929,687 cycles # 2.954 GHz
- 3,457,143,318 instructions # 1.53 insn per cycle
- 0.822389443 seconds time elapsed
+TOTAL : 0.557644 sec
+ 2,313,651,554 cycles # 2.916 GHz
+ 3,231,773,253 instructions # 1.40 insn per cycle
+ 0.852166017 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.752729e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.780873e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.782025e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.762297e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.789061e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.790098e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.324142 sec
- 10,808,418,598 cycles # 3.022 GHz
- 25,158,541,697 instructions # 2.33 insn per cycle
- 3.635743762 seconds time elapsed
+TOTAL : 3.347281 sec
+ 10,991,558,985 cycles # 3.045 GHz
+ 24,428,456,620 instructions # 2.22 insn per cycle
+ 3.667854463 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2
@@ -76,20 +76,20 @@ Relative difference = 2.837296513854949e-07
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.386990e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.387472e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.387472e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.470440e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.471090e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.471090e+02 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 37.393795 sec
- 113,596,849,149 cycles # 3.038 GHz
- 145,178,851,492 instructions # 1.28 insn per cycle
- 37.398172394 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:21790) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 36.698448 sec
+ 113,650,760,924 cycles # 3.097 GHz
+ 145,145,803,355 instructions # 1.28 insn per cycle
+ 36.703372680 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:21749) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -102,19 +102,19 @@ Relative difference = 2.8372991823632784e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.199113e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.201670e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.201670e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.344679e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.348354e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.348354e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 5.135922 sec
- 14,655,728,105 cycles # 2.852 GHz
- 37,424,135,547 instructions # 2.55 insn per cycle
- 5.140131665 seconds time elapsed
+TOTAL : 4.914072 sec
+ 14,660,206,602 cycles # 2.982 GHz
+ 37,434,709,642 instructions # 2.55 insn per cycle
+ 4.919337364 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:67993) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe
@@ -128,19 +128,19 @@ Relative difference = 2.8372990661989057e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.704464e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.719214e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.719214e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.882298e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.901243e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.901243e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.138785 sec
- 6,043,034,186 cycles # 2.822 GHz
- 12,913,049,362 instructions # 2.14 insn per cycle
- 2.142766618 seconds time elapsed
+TOTAL : 2.091187 sec
+ 6,050,971,713 cycles # 2.889 GHz
+ 12,923,420,886 instructions # 2.14 insn per cycle
+ 2.096398672 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:46338) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 2.837296715097453e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.425836e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.447500e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.447500e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.423590e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.450741e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.450741e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.749535 sec
- 4,997,190,169 cycles # 2.851 GHz
- 11,318,212,652 instructions # 2.26 insn per cycle
- 1.753835753 seconds time elapsed
+TOTAL : 1.751970 sec
+ 5,013,534,941 cycles # 2.856 GHz
+ 11,332,565,506 instructions # 2.26 insn per cycle
+ 1.757089218 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40036) (512y: 188) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe
@@ -180,19 +180,19 @@ Relative difference = 2.837296715097453e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.771438e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.786530e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.786530e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.963573e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.983731e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.983731e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.120548 sec
- 3,944,759,858 cycles # 1.858 GHz
- 5,787,397,751 instructions # 1.47 insn per cycle
- 2.124873095 seconds time elapsed
+TOTAL : 2.070157 sec
+ 3,940,063,125 cycles # 1.900 GHz
+ 5,797,816,506 instructions # 1.47 insn per cycle
+ 2.075319073 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1924) (512y: 317) (512z:38936)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt
index 382af9d5f9..5e839cdef4 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt
@@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-07-18_23:17:01
+DATE: 2023-06-16_23:13:09
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.270360e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.295867e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.297816e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.263284e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.287600e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.289479e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.530823 sec
- 2,270,683,029 cycles # 2.966 GHz
- 3,480,847,783 instructions # 1.53 insn per cycle
- 0.823108291 seconds time elapsed
+TOTAL : 0.554858 sec
+ 2,295,580,731 cycles # 2.899 GHz
+ 3,230,493,146 instructions # 1.41 insn per cycle
+ 0.849020118 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.795934e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.824765e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.825972e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.794287e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.821112e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.822179e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.278040 sec
- 10,535,389,599 cycles # 2.985 GHz
- 22,867,797,759 instructions # 2.17 insn per cycle
- 3.589361332 seconds time elapsed
+TOTAL : 3.303147 sec
+ 10,830,294,927 cycles # 3.034 GHz
+ 23,889,615,566 instructions # 2.21 insn per cycle
+ 3.626334184 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2
@@ -76,20 +76,20 @@ Relative difference = 2.837296513854949e-07
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.327519e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.327978e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.327978e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.442294e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.442931e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.442931e+02 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 37.907398 sec
- 114,467,345,138 cycles # 3.020 GHz
- 145,678,466,655 instructions # 1.27 insn per cycle
- 37.911583534 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:22539) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 36.931797 sec
+ 114,071,883,560 cycles # 3.089 GHz
+ 145,758,435,006 instructions # 1.28 insn per cycle
+ 36.937096359 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:22580) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
@@ -102,19 +102,19 @@ Relative difference = 2.83729918072716e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.113098e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.115510e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.115510e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.286130e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.289440e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.289440e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 5.277417 sec
- 15,032,371,179 cycles # 2.847 GHz
- 37,583,654,408 instructions # 2.50 insn per cycle
- 5.281735273 seconds time elapsed
+TOTAL : 5.001270 sec
+ 14,944,293,116 cycles # 2.986 GHz
+ 37,593,481,677 instructions # 2.52 insn per cycle
+ 5.006579543 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:68265) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe
@@ -128,19 +128,19 @@ Relative difference = 2.8372990661989057e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.856730e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.870923e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.870923e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.014357e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.033871e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.033871e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.096869 sec
- 5,955,250,625 cycles # 2.836 GHz
- 12,810,330,644 instructions # 2.15 insn per cycle
- 2.101131538 seconds time elapsed
+TOTAL : 2.057405 sec
+ 5,959,840,539 cycles # 2.891 GHz
+ 12,824,533,137 instructions # 2.15 insn per cycle
+ 2.062870435 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:45687) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 2.8372967134613354e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.109775e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.130403e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.130403e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.500841e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.528151e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.528151e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.809492 sec
- 5,019,187,943 cycles # 2.769 GHz
- 11,330,622,254 instructions # 2.26 insn per cycle
- 1.813662613 seconds time elapsed
+TOTAL : 1.743665 sec
+ 5,035,330,638 cycles # 2.888 GHz
+ 11,343,590,938 instructions # 2.25 insn per cycle
+ 1.748819840 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:39850) (512y: 138) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe
@@ -180,19 +180,19 @@ Relative difference = 2.8372967134613354e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.681987e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.696729e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.696729e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.021482e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.042248e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.042248e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.144494 sec
- 3,933,032,947 cycles # 1.833 GHz
- 5,761,298,930 instructions # 1.46 insn per cycle
- 2.148708938 seconds time elapsed
+TOTAL : 2.055496 sec
+ 3,937,439,973 cycles # 1.912 GHz
+ 5,773,024,249 instructions # 1.47 insn per cycle
+ 2.060744461 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1589) (512y: 251) (512z:38642)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
index 1795f8ff40..0b7bc69d3c 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
@@ -35,61 +35,61 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-07-18_22:46:24
+DATE: 2023-06-16_22:57:36
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.328648e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.379805e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.385954e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.285069e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.329665e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.335265e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.481770 sec
- 2,063,713,574 cycles # 2.942 GHz
- 2,968,264,700 instructions # 1.44 insn per cycle
- 0.786196143 seconds time elapsed
+TOTAL : 0.508351 sec
+ 2,077,856,827 cycles # 2.833 GHz
+ 2,725,078,892 instructions # 1.31 insn per cycle
+ 0.790826956 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.555201e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.627905e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.631075e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.539433e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.615009e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.618092e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.726449 sec
- 5,877,053,961 cycles # 3.004 GHz
- 12,562,645,632 instructions # 2.14 insn per cycle
- 2.015090243 seconds time elapsed
+TOTAL : 1.761585 sec
+ 5,789,825,690 cycles # 2.887 GHz
+ 11,967,255,873 instructions # 2.07 insn per cycle
+ 2.065101803 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 6.626454e-04
-Avg ME (F77/CUDA) = 6.6262659968156085E-004
-Relative difference = 2.8371612387547027e-05
+Avg ME (F77/CUDA) = 6.6262662035525971E-004
+Relative difference = 2.8340413651595734e-05
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.991990e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.992959e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.992959e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.965658e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.966651e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.966651e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.244032 sec
- 25,060,848,142 cycles # 3.040 GHz
- 78,146,011,721 instructions # 3.12 insn per cycle
- 8.249060073 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 3567) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 8.353642 sec
+ 24,461,782,795 cycles # 2.928 GHz
+ 78,146,447,581 instructions # 3.19 insn per cycle
+ 8.359034707 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 3563) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -102,19 +102,19 @@ Relative difference = 4.998523613136231e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.168098e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.181458e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.181458e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.592457e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.607446e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.607446e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 2.299749 sec
- 6,520,623,214 cycles # 2.835 GHz
- 20,082,651,077 instructions # 3.08 insn per cycle
- 2.305297557 seconds time elapsed
+TOTAL : 2.176903 sec
+ 6,304,553,002 cycles # 2.897 GHz
+ 20,090,800,645 instructions # 3.19 insn per cycle
+ 2.181656558 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13755) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -128,19 +128,19 @@ Relative difference = 8.545443743731147e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.669486e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.676783e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.676783e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.706358e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.713844e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.713844e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.992775 sec
- 2,820,421,702 cycles # 2.838 GHz
- 7,025,142,998 instructions # 2.49 insn per cycle
- 0.997552890 seconds time elapsed
+TOTAL : 0.970457 sec
+ 2,815,185,653 cycles # 2.890 GHz
+ 7,033,969,834 instructions # 2.50 insn per cycle
+ 0.975199568 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11257) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 1.0552292094680926e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.918377e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.927775e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.927775e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.944671e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.953973e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.953973e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.865223 sec
- 2,468,464,156 cycles # 2.849 GHz
- 6,321,993,471 instructions # 2.56 insn per cycle
- 0.870455783 seconds time elapsed
+TOTAL : 0.852685 sec
+ 2,474,785,553 cycles # 2.891 GHz
+ 6,331,079,508 instructions # 2.56 insn per cycle
+ 0.857424788 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10915) (512y: 32) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -180,20 +180,20 @@ Relative difference = 1.0552292094680926e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.476790e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.482202e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.482202e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.497348e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.503099e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.503099e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.121528 sec
- 2,033,493,855 cycles # 1.813 GHz
- 3,238,669,184 instructions # 1.59 insn per cycle
- 1.130214849 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1692) (512y: 40) (512z:10084)
+TOTAL : 1.105154 sec
+ 2,042,054,263 cycles # 1.843 GHz
+ 3,246,636,997 instructions # 1.59 insn per cycle
+ 1.110173718 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1682) (512y: 40) (512z:10085)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index a1b4674f19..f56715ce11 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-18_23:26:14 +DATE: 2023-06-16_23:22:43 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.625816e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.338657e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.338657e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.645163e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.314203e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.314203e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.466248 sec - 2,031,849,388 cycles # 2.955 GHz - 2,961,771,125 instructions # 1.46 insn per cycle - 0.744065339 seconds time elapsed +TOTAL : 0.495037 sec + 2,044,288,346 cycles # 2.860 GHz + 2,702,609,067 instructions # 1.32 insn per cycle + 0.772230608 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -68,41 +68,41 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.228267e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.484145e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.484145e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.904403 sec - 6,415,900,458 cycles # 2.985 GHz - 12,491,083,247 instructions # 1.95 insn per cycle - 2.207825533 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.239966e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.484650e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.484650e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641709e+00 +- 4.994249e+00 ) GeV^-4 +TOTAL : 1.924795 sec + 6,601,631,800 cycles # 3.040 GHz + 13,123,912,596 instructions # 1.99 insn per cycle + 2.228980169 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +Avg ME (F77/CUDA) = 6.6262662035525971E-004 +Relative difference = 2.8340413651595734e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.988726e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.989680e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.989680e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.043992e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.045065e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.045065e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.257291 sec - 25,075,343,992 cycles # 3.036 GHz - 78,149,648,975 instructions # 3.12 insn per cycle - 8.261339290 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3567) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.036711 sec + 24,503,345,133 cycles # 3.048 GHz + 78,146,893,496 instructions # 3.19 insn per cycle + 8.041734648 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3563) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -116,19 +116,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.234102e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.248233e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.248233e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.476480e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.491910e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.491910e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 2.277702 sec - 6,528,357,276 cycles # 2.862 GHz - 20,090,895,428 instructions # 3.08 insn per cycle - 2.281625361 seconds time elapsed +TOTAL : 2.205721 sec + 6,317,208,814 cycles # 2.860 GHz + 20,099,654,048 instructions # 3.18 insn per cycle + 2.210968010 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -143,19 +143,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.681618e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.688734e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.688734e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.696185e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.703554e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.703554e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.985910 sec - 2,820,830,278 cycles # 2.853 GHz - 7,034,559,684 instructions # 2.49 insn per cycle - 0.989751099 seconds time elapsed +TOTAL : 0.978597 sec + 2,817,888,819 cycles # 2.869 GHz + 7,044,186,426 instructions # 2.50 insn per cycle + 0.983797034 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11257) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -170,19 +170,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.906724e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.915957e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.915957e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.943265e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.953322e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.953322e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.870831 sec - 2,474,414,540 cycles # 2.833 GHz - 6,331,645,863 instructions # 2.56 insn per cycle - 0.875020673 seconds time elapsed +TOTAL : 0.855535 sec + 2,477,342,849 cycles # 2.883 GHz + 6,340,941,160 instructions # 2.56 insn per cycle + 0.860485836 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10915) (512y: 32) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -197,20 +197,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.527095e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.532953e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.532953e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.553687e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.559969e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.559969e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.084935 sec - 2,044,118,908 cycles # 1.878 GHz - 3,248,151,332 instructions # 1.59 insn per cycle - 1.089089621 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1692) (512y: 40) (512z:10084) +TOTAL : 1.067416 sec + 2,044,224,175 cycles # 1.909 GHz + 3,257,025,834 instructions # 1.59 insn per cycle + 1.072360210 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1682) (512y: 40) (512z:10085) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 719a82996d..f758b117fb 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -35,61 +35,61 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-18_23:36:49 +DATE: 2023-06-16_23:33:36 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.309200e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.362073e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.368445e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.328620e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.382342e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.387489e+05 ) sec^-1 MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 -TOTAL : 0.465657 sec - 2,008,960,497 cycles # 2.915 GHz - 2,922,930,156 instructions # 1.45 insn per cycle - 0.746661542 seconds time elapsed +TOTAL : 0.490468 sec + 2,053,312,645 cycles # 2.896 GHz + 2,688,522,220 instructions # 1.31 insn per cycle + 0.768623655 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.564625e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.637906e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.641309e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.549643e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.618843e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.621847e+05 ) sec^-1 MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 -TOTAL : 1.801689 sec - 6,057,350,087 cycles # 2.985 GHz - 12,604,320,980 instructions # 2.08 insn per cycle - 2.085381680 seconds time elapsed +TOTAL : 1.827390 sec + 6,222,758,821 cycles # 3.019 GHz + 12,654,178,829 instructions # 2.03 insn per cycle + 2.131020283 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +Avg ME (F77/CUDA) = 6.6262662035525971E-004 +Relative difference = 2.8340413651595734e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.980035e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.981007e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.981007e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.065245e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.066355e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.066355e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 8.292562 sec - 25,056,318,775 cycles # 3.021 GHz - 78,146,905,788 instructions # 3.12 insn per cycle - 8.296157417 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3567) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.951720 sec + 24,455,995,614 cycles # 3.075 GHz + 78,143,504,623 instructions # 3.20 insn per cycle + 7.956774266 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3563) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 4.998523613136231e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.078241e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.091572e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.091572e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.603380e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.618762e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.618762e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 2.326411 sec - 6,529,397,700 cycles # 2.804 GHz - 20,082,220,128 instructions # 3.08 insn per cycle - 2.330014631 seconds time elapsed +TOTAL : 2.174453 sec + 6,308,707,547 cycles # 2.902 GHz + 20,091,203,739 instructions # 3.18 insn per cycle + 2.179216274 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 8.545443743731147e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.661437e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.668659e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.668659e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.709770e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.717220e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.717220e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.996754 sec - 2,817,743,008 cycles # 2.819 GHz - 7,024,061,229 instructions # 2.49 insn per cycle - 1.000375913 seconds time elapsed +TOTAL : 0.969715 sec + 2,816,258,030 cycles # 2.894 GHz + 7,033,592,275 instructions # 2.50 insn per cycle + 0.974263363 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11257) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 1.0552292094680926e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.828370e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.836790e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.836790e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.920768e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.930607e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.930607e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.906892 sec - 2,470,317,560 cycles # 2.714 GHz - 6,320,014,190 instructions # 2.56 insn per cycle - 0.910836635 seconds time elapsed +TOTAL : 0.864540 sec + 2,472,809,203 cycles # 2.851 GHz + 6,329,435,201 instructions # 2.56 insn per cycle + 0.869035951 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10915) (512y: 32) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 1.0552292094680926e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.501715e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.507361e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.507361e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.537958e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.544260e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.544260e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214982e-01 +- 3.255524e-01 ) GeV^-4 -TOTAL : 1.102043 sec - 2,037,558,095 cycles # 1.844 GHz - 3,235,905,404 instructions # 1.59 insn per cycle - 1.106015383 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1692) (512y: 40) (512z:10084) +TOTAL : 1.077016 sec + 2,042,333,122 cycles # 1.891 GHz + 3,245,054,769 instructions # 1.59 insn per cycle + 1.081439983 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1682) (512y: 40) (512z:10085) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
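The cmpExe blocks compare the average matrix element (Avg ME) between the C++/CUDA and F77/CUDA paths. The quoted relative difference can be reproduced from the two Avg ME values; a minimal sketch, assuming the difference is normalised by the C++/CUDA value (which reproduces the quoted figure):

    avg_me_cpp = 6.626454e-04            # Avg ME (C++/CUDA)
    avg_me_f77 = 6.6262662035525971e-04  # Avg ME (F77/CUDA), new value

    rel_diff = abs(avg_me_cpp - avg_me_f77) / abs(avg_me_cpp)
    print(f"Relative difference = {rel_diff:e}")  # ~2.834041e-05
    assert rel_diff <= 5e-3  # the "OK (relative difference <= 5E-3)" criterion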
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index c704840cf9..34a43cea0a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -35,61 +35,61 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-18_23:33:51 +DATE: 2023-06-16_23:30:33 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.328531e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.379355e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.385023e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.323907e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.378161e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.383334e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.462858 sec - 1,980,722,765 cycles # 2.938 GHz - 2,892,819,621 instructions # 1.46 insn per cycle - 0.731290269 seconds time elapsed +TOTAL : 0.487701 sec + 2,049,188,758 cycles # 2.905 GHz + 2,686,122,587 instructions # 1.31 insn per cycle + 0.764918156 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.569950e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.641943e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.645139e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.548245e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.617363e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.620322e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.750978 sec - 5,918,608,993 cycles # 2.996 GHz - 12,162,380,865 instructions # 2.05 insn per cycle - 2.034792845 seconds time elapsed +TOTAL : 1.774012 sec + 6,089,970,636 cycles # 3.032 GHz + 11,896,059,014 instructions # 1.95 insn per cycle + 2.066787545 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +Avg ME (F77/CUDA) = 6.6262662035525971E-004 +Relative difference = 2.8340413651595734e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.985988e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.986970e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.986970e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.064063e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.065180e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.065180e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.266136 sec - 25,080,094,182 cycles # 3.033 GHz - 78,145,984,417 instructions # 3.12 insn per cycle - 8.270023006 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3567) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.960763 sec + 24,464,649,749 cycles # 3.074 GHz + 78,145,698,591 instructions # 3.19 insn per cycle + 7.965669739 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3563) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 4.998523613136231e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.949211e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.961352e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.961352e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.640949e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.656696e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.656696e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 2.367907 sec - 6,521,570,008 cycles # 2.751 GHz - 20,082,135,071 instructions # 3.08 insn per cycle - 2.371862221 seconds time elapsed +TOTAL : 2.155286 sec + 6,306,014,277 cycles # 2.921 GHz + 20,090,085,412 instructions # 3.19 insn per cycle + 2.160045070 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 8.545443743731147e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.675991e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.682969e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.682969e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.710584e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.718001e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.718001e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.986561 sec - 2,813,138,713 cycles # 2.843 GHz - 7,024,839,984 instructions # 2.50 insn per cycle - 0.990202328 seconds time elapsed +TOTAL : 0.967745 sec + 2,810,549,219 cycles # 2.896 GHz + 7,034,117,328 instructions # 2.50 insn per cycle + 0.972224772 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11257) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 1.0552292094680926e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.895238e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.904313e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.904313e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.945349e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.955042e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.955042e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.873329 sec - 2,465,162,367 cycles # 2.812 GHz - 6,321,674,271 instructions # 2.56 insn per cycle - 0.877369473 seconds time elapsed +TOTAL : 0.852055 sec + 2,468,399,437 cycles # 2.884 GHz + 6,331,090,642 instructions # 2.56 insn per cycle + 0.856970624 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10915) (512y: 32) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 1.0552292094680926e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.520633e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.526563e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.526563e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.543083e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.549272e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.549272e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.086574 sec - 2,034,950,607 cycles # 1.867 GHz - 3,237,826,564 instructions # 1.59 insn per cycle - 1.090692761 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1692) (512y: 40) (512z:10084) +TOTAL : 1.072075 sec + 2,041,169,827 cycles # 1.898 GHz + 3,247,254,694 instructions # 1.59 insn per cycle + 1.077160788 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1682) (512y: 40) (512z:10085) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
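When scanning many of these files, the throughput lines can be pulled out programmatically. A minimal sketch (the regex is derived from the line format above; throughputs() is an illustrative helper, not part of the repository):

    import re

    THROUGHPUT_RE = re.compile(
        r"EvtsPerSec\[MatrixElems\] \(3\)\s*=\s*\(\s*([0-9.eE+-]+)\s*\)\s*sec\^-1")

    def throughputs(path):
        """Return all EvtsPerSec[MatrixElems] values found in one log file."""
        with open(path) as f:
            return [float(m.group(1)) for m in THROUGHPUT_RE.finditer(f.read())]

    # e.g. throughputs("log_ggttgg_mad_f_inl0_hrd0_curhst.txt")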
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index e4746ed953..513c5a26b1 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -35,23 +35,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-18_23:30:58 +DATE: 2023-06-16_23:27:35 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.747461e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.395267e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.400893e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.708420e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.347094e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.352007e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.467603 sec - 2,018,915,063 cycles # 2.937 GHz - 2,900,155,764 instructions # 1.44 insn per cycle - 0.746177156 seconds time elapsed +TOTAL : 0.490976 sec + 2,048,446,840 cycles # 2.889 GHz + 2,674,620,255 instructions # 1.31 insn per cycle + 0.767937575 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -59,40 +59,40 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.411123e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.627941e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.631179e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.838772 sec - 6,138,854,502 cycles # 2.968 GHz - 13,085,152,978 instructions # 2.13 insn per cycle - 2.127958322 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.421166e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.615921e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.619172e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641709e+00 +- 4.994249e+00 ) GeV^-4 +TOTAL : 1.859235 sec + 6,236,141,578 cycles # 2.977 GHz + 12,723,940,591 instructions # 2.04 insn per cycle + 2.154378517 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +Avg ME (F77/CUDA) = 6.6262662035525971E-004 +Relative difference = 2.8340413651595734e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.967533e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.968499e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.968499e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.071302e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.072413e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.072413e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.343835 sec - 25,087,465,619 cycles # 3.007 GHz - 78,146,272,502 instructions # 3.11 insn per cycle - 8.347778703 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3567) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.929179 sec + 24,461,908,171 cycles # 3.086 GHz + 78,144,542,835 instructions # 3.19 insn per cycle + 7.934172762 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3563) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -105,19 +105,19 @@ Relative difference = 4.998523613136231e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.178005e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.191017e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.191017e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.571029e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.586100e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.586100e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 2.292891 sec - 6,523,692,358 cycles # 2.841 GHz - 20,081,837,659 instructions # 3.08 insn per cycle - 2.296967736 seconds time elapsed +TOTAL : 2.174925 sec + 6,301,926,717 cycles # 2.893 GHz + 20,090,235,453 instructions # 3.19 insn per cycle + 2.179846514 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,19 +131,19 @@ Relative difference = 8.545443743731147e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.656324e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.663221e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.663221e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.701876e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.709090e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.709090e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.998170 sec - 2,815,673,342 cycles # 2.813 GHz - 7,024,525,707 instructions # 2.49 insn per cycle - 1.001880395 seconds time elapsed +TOTAL : 0.972776 sec + 2,805,674,310 cycles # 2.875 GHz + 7,034,136,735 instructions # 2.51 insn per cycle + 0.977241135 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11257) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -157,19 +157,19 @@ Relative difference = 1.0552292094680926e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.902878e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.912112e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.912112e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.937974e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.948232e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.948232e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.869920 sec - 2,463,531,596 cycles # 2.822 GHz - 6,321,449,141 instructions # 2.57 insn per cycle - 0.873633029 seconds time elapsed +TOTAL : 0.855178 sec + 2,469,042,209 cycles # 2.877 GHz + 6,331,047,669 instructions # 2.56 insn per cycle + 0.859915572 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10915) (512y: 32) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -183,20 +183,20 @@ Relative difference = 1.0552292094680926e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.485973e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.491473e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.491473e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.553740e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.559951e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.559951e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.111691 sec - 2,049,922,439 cycles # 1.839 GHz - 3,237,844,553 instructions # 1.58 insn per cycle - 1.115320688 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1692) (512y: 40) (512z:10084) +TOTAL : 1.064606 sec + 2,037,786,255 cycles # 1.909 GHz + 3,246,591,583 instructions # 1.59 insn per cycle + 1.069170450 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1682) (512y: 40) (512z:10085) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
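The --rmbhst figures above also give a quick read on the SIMD scaling; dividing the new EvtsPerSec[MatrixElems] values by the no-SIMD baseline is plain arithmetic on the quoted numbers:

    evts_per_sec = {  # new EvtsPerSec[MatrixElems] values, --rmbhst runs above
        "none": 2.072413e+03,
        "sse4": 7.586100e+03,
        "avx2": 1.709090e+04,
        "512y": 1.948232e+04,
        "512z": 1.559951e+04,
    }
    base = evts_per_sec["none"]
    for build, rate in evts_per_sec.items():
        print(f"{build}: {rate / base:.2f}x vs no-SIMD")
    # sse4 ~3.66x, avx2 ~8.25x, 512y ~9.40x, 512z ~7.53x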
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index e664b8e4bd..24986b526d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -35,61 +35,61 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-18_22:46:53 +DATE: 2023-06-16_22:58:06 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.361034e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.414963e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.420865e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.278810e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.338048e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.343904e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.480571 sec - 2,065,234,577 cycles # 2.945 GHz - 2,977,941,339 instructions # 1.44 insn per cycle - 0.778873111 seconds time elapsed +TOTAL : 0.508627 sec + 2,112,782,804 cycles # 2.878 GHz + 2,766,936,523 instructions # 1.31 insn per cycle + 0.791513495 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.540141e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.613831e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.617005e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.568420e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.630751e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.634459e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.721792 sec - 5,879,791,948 cycles # 3.006 GHz - 12,366,829,498 instructions # 2.10 insn per cycle - 2.012637973 seconds time elapsed +TOTAL : 1.749439 sec + 6,009,356,386 cycles # 3.024 GHz + 12,104,468,348 instructions # 2.01 insn per cycle + 2.046648448 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +Avg ME (F77/CUDA) = 6.6262662035525971E-004 +Relative difference = 2.8340413651595734e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.996506e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.997490e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.997490e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.095981e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.097105e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.097105e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.224439 sec - 24,894,773,723 cycles # 3.027 GHz - 77,890,953,353 instructions # 3.13 insn per cycle - 8.229244629 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3075) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.835166 sec + 24,337,509,935 cycles # 3.105 GHz + 77,896,565,911 instructions # 3.20 insn per cycle + 7.840097785 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3071) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 5.65798569465384e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.351207e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.365129e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.365129e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.647482e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.663046e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.663046e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 2.241696 sec - 6,482,790,377 cycles # 2.890 GHz - 20,037,202,220 instructions # 3.09 insn per cycle - 2.246950151 seconds time elapsed +TOTAL : 2.153739 sec + 6,249,075,693 cycles # 2.898 GHz + 20,045,520,110 instructions # 3.21 insn per cycle + 2.158442622 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13454) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 8.454838403082277e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.632955e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.639361e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.639361e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.690515e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.698040e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.698040e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.014405 sec - 2,879,568,161 cycles # 2.836 GHz - 7,137,270,355 instructions # 2.48 insn per cycle - 1.019344865 seconds time elapsed +TOTAL : 0.979063 sec + 2,866,637,980 cycles # 2.916 GHz + 7,146,127,186 instructions # 2.49 insn per cycle + 0.984190288 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11820) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 1.0602318832827381e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.850016e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.858529e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.858529e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.866768e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.875905e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.875905e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.895799 sec - 2,560,614,550 cycles # 2.854 GHz - 6,432,788,868 instructions # 2.51 insn per cycle - 0.900682497 seconds time elapsed +TOTAL : 0.887507 sec + 2,573,527,962 cycles # 2.887 GHz + 6,441,681,549 instructions # 2.50 insn per cycle + 0.892352069 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11516) (512y: 24) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 1.0602318832827381e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.413332e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.418459e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.418459e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.528254e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.534227e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.534227e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.170694 sec - 2,089,713,145 cycles # 1.783 GHz - 3,358,950,774 instructions # 1.61 insn per cycle - 1.175989368 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2222) (512y: 32) (512z:10133) +TOTAL : 1.082245 sec + 2,091,652,812 cycles # 1.925 GHz + 3,367,447,588 instructions # 1.61 insn per cycle + 1.087515058 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2221) (512y: 32) (512z:10146) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index cfa67fb72a..2114debcba 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -35,61 +35,61 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-18_23:18:10 +DATE: 2023-06-16_23:14:17 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.555562e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.595852e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.600337e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.564508e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.602710e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.606907e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.489898 sec - 2,117,871,861 cycles # 2.944 GHz - 3,083,406,149 instructions # 1.46 insn per cycle - 0.779309462 seconds time elapsed +TOTAL : 0.511467 sec + 2,169,852,681 cycles # 2.909 GHz + 2,868,849,541 instructions # 1.32 insn per cycle + 0.803186385 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.706941e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.767146e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.769793e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.693782e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.750237e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.752633e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.859952 sec - 6,265,344,549 cycles # 2.990 GHz - 13,643,028,015 instructions # 2.18 insn per cycle - 2.154934603 seconds time elapsed +TOTAL : 1.885046 sec + 6,430,536,190 cycles # 3.024 GHz + 13,410,448,299 instructions # 2.09 insn per cycle + 2.186495562 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262660579844562E-004 -Relative difference = 2.836238137986709e-05 +Avg ME (F77/CUDA) = 6.6262662649554244E-004 +Relative difference = 2.833114733400458e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.751534e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.752342e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.752342e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.896574e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.897472e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.897472e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 28.523012 sec - 86,401,262,633 cycles # 3.029 GHz - 136,144,748,047 instructions # 1.58 insn per cycle - 28.527089944 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:15917) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 27.823875 sec + 86,121,024,305 cycles # 3.095 GHz + 136,130,940,684 instructions # 1.58 insn per cycle + 27.829007329 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:15932) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 4.9411338183416744e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.937203e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.949756e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.949756e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.179629e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.192771e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.192771e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.373012 sec - 6,730,013,186 cycles # 2.833 GHz - 19,262,616,828 instructions # 2.86 insn per cycle - 2.376797941 seconds time elapsed +TOTAL : 2.295614 sec + 6,686,096,543 cycles # 2.912 GHz + 19,271,682,618 instructions # 2.88 insn per cycle + 2.300291912 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:69534) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 2.6057152933832753e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.428038e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.433299e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.433299e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.531625e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.537735e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.537735e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.156685 sec - 3,104,589,238 cycles # 2.678 GHz - 6,652,896,903 instructions # 2.14 insn per cycle - 1.160693007 seconds time elapsed +TOTAL : 1.080134 sec + 3,114,255,706 cycles # 2.875 GHz + 6,664,280,015 instructions # 2.14 insn per cycle + 1.084789912 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:47803) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 1.9003789248133364e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.832430e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.840977e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.840977e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.841445e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.850149e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.850149e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.903107 sec - 2,567,602,694 cycles # 2.833 GHz - 5,841,410,689 instructions # 2.28 insn per cycle - 0.906868909 seconds time elapsed +TOTAL : 0.899888 sec + 2,571,109,745 cycles # 2.845 GHz + 5,850,913,153 instructions # 2.28 insn per cycle + 0.904914307 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:41536) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe @@ -180,19 +180,19 @@ Relative difference = 1.9003789248133364e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.510516e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.516306e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.516306e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.550620e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.556902e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.556902e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.094312 sec - 2,033,089,410 cycles # 1.852 GHz - 3,364,403,279 instructions # 1.65 insn per cycle - 1.098401868 seconds time elapsed +TOTAL : 1.067211 sec + 2,037,305,474 cycles # 1.903 GHz + 3,373,327,271 instructions # 1.66 insn per cycle + 1.071975743 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4191) (512y: 5) (512z:44245) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index 51142074c9..8e3ac4399e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -35,61 +35,61 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-18_23:19:00 +DATE: 2023-06-16_23:15:08 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.573163e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.617115e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.621634e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.536252e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.573442e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.577601e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.492997 sec - 2,057,678,092 cycles # 2.877 GHz - 3,062,518,407 instructions # 1.49 insn per cycle - 0.776272674 seconds time elapsed +TOTAL : 0.513649 sec + 2,145,771,085 cycles # 2.879 GHz + 2,862,547,641 instructions # 1.33 insn per cycle + 0.805011702 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.585370e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.643537e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.646114e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.581647e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.635703e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.638045e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.867448 sec - 6,275,513,404 cycles # 2.981 GHz - 12,807,707,433 instructions # 2.04 insn per cycle - 2.162298283 seconds time elapsed +TOTAL : 1.893721 sec + 6,500,188,004 cycles # 3.036 GHz + 12,690,177,194 instructions # 1.95 insn per cycle + 2.199182146 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262660579844562E-004 -Relative difference = 2.836238137986709e-05 +Avg ME (F77/CUDA) = 6.6262662649554244E-004 +Relative difference = 2.833114733400458e-05 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.733524e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.734378e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.734378e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.890367e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.891225e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.891225e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 28.613127 sec - 86,709,704,728 cycles # 3.030 GHz - 136,080,123,804 instructions # 1.57 insn per cycle - 28.617179548 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:15955) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 27.851215 sec + 85,856,337,659 cycles # 3.083 GHz + 136,047,957,548 instructions # 1.58 insn per cycle + 27.856280727 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:15933) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 2.8211244692003953e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.957648e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.970744e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.970744e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.106679e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.119068e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.119068e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059963e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.365219 sec - 6,808,787,197 cycles # 2.875 GHz - 19,308,338,488 instructions # 2.84 insn per cycle - 2.369448248 seconds time elapsed +TOTAL : 2.316749 sec + 6,781,000,176 cycles # 2.924 GHz + 19,316,989,732 instructions # 2.85 insn per cycle + 2.321493873 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:69471) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 3.0732494532034946e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.541847e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.547836e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.547836e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.581478e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.587987e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.587987e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.071572 sec - 3,039,419,289 cycles # 2.827 GHz - 6,585,482,226 instructions # 2.17 insn per cycle - 1.075716909 seconds time elapsed +TOTAL : 1.045728 sec + 3,042,131,837 cycles # 2.898 GHz + 6,594,840,346 instructions # 2.17 insn per cycle + 1.050898181 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:46795) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 1.9674022283284887e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.826611e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.835439e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.835439e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.870529e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.879536e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.879536e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.905723 sec - 2,575,686,106 cycles # 2.834 GHz - 5,845,383,132 instructions # 2.27 insn per cycle - 0.909457035 seconds time elapsed +TOTAL : 0.885640 sec + 2,578,942,088 cycles # 2.899 GHz + 5,854,799,048 instructions # 2.27 insn per cycle + 0.890682953 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:41080) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe @@ -180,19 +180,19 @@ Relative difference = 1.9674022283284887e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.496491e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.502073e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.502073e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.554834e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.561058e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.561058e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.104130 sec - 2,023,795,233 cycles # 1.829 GHz - 3,074,048,274 instructions # 1.52 insn per cycle - 1.107904891 seconds time elapsed +TOTAL : 1.063798 sec + 2,024,256,431 cycles # 1.896 GHz + 3,083,285,385 instructions # 1.52 insn per cycle + 1.068833353 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3372) (512y: 17) (512z:39424) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 09ad13168b..b408e7e538 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-18_22:47:21 +DATE: 2023-06-16_22:58:34 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.480391e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.508672e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.510984e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.488561e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.521964e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.524427e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.525952 sec - 2,267,677,525 cycles # 2.954 GHz - 3,392,628,719 instructions # 1.50 insn per cycle - 0.831861393 seconds time elapsed +TOTAL : 0.554107 sec + 2,279,290,212 cycles # 2.876 GHz + 3,137,052,799 instructions # 1.38 insn per cycle + 0.851081146 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.145439e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.179718e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.181134e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.134416e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.168392e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.169702e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.052698 sec - 9,942,137,930 cycles # 3.004 GHz - 22,214,757,723 instructions # 2.23 insn per cycle - 3.366177476 seconds time elapsed +TOTAL : 3.065812 sec + 10,122,925,933 cycles # 3.040 GHz + 20,842,538,183 instructions # 2.06 insn per cycle + 3.387502173 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -76,20 +76,20 @@ Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.913139e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.914080e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.914080e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.978031e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.979322e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.979322e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.583367 sec - 26,165,234,964 cycles # 3.048 GHz - 79,196,950,918 instructions # 3.03 insn per cycle - 8.588687377 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4744) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.302095 sec + 25,693,038,802 cycles # 3.094 GHz + 79,192,524,178 instructions # 3.08 insn per cycle + 8.307297645 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4706) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.608213e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.611317e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.611317e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.739227e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.743408e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.743408e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.557137 sec - 12,919,114,954 cycles # 2.834 GHz - 38,494,869,204 instructions # 2.98 insn per cycle - 4.563393267 seconds time elapsed +TOTAL : 4.404759 sec + 12,783,705,279 cycles # 2.903 GHz + 38,505,609,809 instructions # 3.01 insn per cycle + 4.410187217 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13076) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe @@ -128,20 +128,20 @@ Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.504111e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.520835e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.520835e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.779933e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.804166e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.804166e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.940434 sec - 5,519,643,775 cycles # 2.843 GHz - 13,603,840,523 instructions # 2.46 insn per cycle - 1.945814702 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10864) (512y: 0) (512z: 0) +TOTAL : 1.879373 sec + 5,476,712,305 cycles # 2.908 GHz + 13,620,238,606 instructions # 2.49 insn per cycle + 1.884513914 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10865) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -154,20 +154,20 @@ Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.534658e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.558180e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.558180e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.768885e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.800184e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.800184e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.733857 sec - 4,855,194,781 cycles # 2.802 GHz - 12,251,398,833 instructions # 2.52 insn per cycle - 1.738916853 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10582) (512y: 20) (512z: 0) +TOTAL : 1.690734 sec + 4,867,518,430 cycles # 2.873 GHz + 12,266,678,987 instructions # 2.52 insn per cycle + 1.696100975 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10583) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.497075e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.510848e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.510848e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.632753e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.650795e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.650795e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.200743 sec - 4,095,535,430 cycles # 1.861 GHz - 6,349,609,236 instructions # 1.55 insn per cycle - 2.206197767 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1432) (512y: 58) (512z: 9948) +TOTAL : 2.159522 sec + 4,131,175,813 cycles # 1.911 GHz + 6,362,042,970 instructions # 1.54 insn per cycle + 2.164878251 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1422) (512y: 57) (512z: 9944) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 563afc5c04..a0ec009068 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-18_22:47:57 +DATE: 2023-06-16_22:59:10 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.517820e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.547651e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.549887e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.495757e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.531890e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.534521e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.522538 sec - 2,246,754,399 cycles # 2.960 GHz - 3,448,544,369 instructions # 1.53 insn per cycle - 0.825847412 seconds time elapsed +TOTAL : 0.553147 sec + 2,267,397,908 cycles # 2.871 GHz + 3,111,867,480 instructions # 1.37 insn per cycle + 0.849472244 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.146188e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.180771e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.182209e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.149132e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.183375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.184726e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.039730 sec - 9,826,496,967 cycles # 2.983 GHz - 21,321,023,396 instructions # 2.17 insn per cycle - 3.352938907 seconds time elapsed +TOTAL : 3.059498 sec + 10,031,596,373 cycles # 3.007 GHz + 20,916,916,227 instructions # 2.09 insn per cycle + 3.392815949 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -76,20 +76,20 @@ Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.904551e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.905470e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.905470e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.978511e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.979800e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.979800e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.624365 sec - 26,132,562,164 cycles # 3.030 GHz - 79,214,497,550 instructions # 3.03 insn per cycle - 8.629231042 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4393) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.301620 sec + 25,591,925,031 cycles # 3.083 GHz + 79,211,596,551 instructions # 3.10 insn per cycle + 8.306715675 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4378) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -102,20 +102,20 @@ Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.575647e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.579016e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.579016e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.744925e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.749316e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.749316e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.598691 sec - 12,863,248,226 cycles # 2.798 GHz - 38,444,583,186 instructions # 2.99 insn per cycle - 4.603847034 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12865) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.389824 sec + 12,788,106,524 cycles # 2.911 GHz + 38,452,416,346 instructions # 3.01 insn per cycle + 4.395123954 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12869) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -128,20 +128,20 @@ Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.431266e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.448537e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.448537e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.672327e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.695790e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.695790e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.958382 sec - 5,559,709,976 cycles # 2.839 GHz - 13,699,466,614 instructions # 2.46 insn per cycle - 1.963270291 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10972) (512y: 0) (512z: 0) +TOTAL : 1.902528 sec + 5,504,597,953 cycles # 2.888 GHz + 13,712,972,642 instructions # 2.49 insn per cycle + 1.907990624 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10970) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -154,20 +154,20 @@ Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.562589e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.585411e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.585411e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.759017e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.790204e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.790204e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.726022 sec - 4,896,080,544 cycles # 2.835 GHz - 12,352,304,340 instructions # 2.52 insn per cycle - 1.731411385 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10602) (512y: 176) (512z: 0) +TOTAL : 1.691699 sec + 4,899,459,456 cycles # 2.890 GHz + 12,367,541,245 instructions # 2.52 insn per cycle + 1.696806589 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10604) (512y: 176) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.437650e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.451120e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.451120e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.611473e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.629905e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.629905e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.216802 sec - 4,112,308,046 cycles # 1.855 GHz - 6,429,070,692 instructions # 1.56 insn per cycle - 2.222267042 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1327) (512y: 167) (512z:10033) +TOTAL : 2.165271 sec + 4,135,051,390 cycles # 1.906 GHz + 6,441,618,375 instructions # 1.56 insn per cycle + 2.170469997 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1326) (512y: 167) (512z:10035) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
index 92bc2c7e06..f1471a9806 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
@@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2023-07-18_22:50:08
+DATE: 2023-06-16_23:01:26
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.069245e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.069666e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.069812e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.072909e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.073330e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.073439e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 2.455774 sec
- 8,198,003,417 cycles # 2.990 GHz
- 17,441,039,582 instructions # 2.13 insn per cycle
- 2.848552676 seconds time elapsed
+TOTAL : 2.450063 sec
+ 8,437,943,756 cycles # 3.031 GHz
+ 18,613,140,327 instructions # 2.21 insn per cycle
+ 2.841214646 seconds time elapsed
runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1
==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
.........................................................................
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.194008e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.196202e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.196458e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.216282e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.218427e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.218617e+03 ) sec^-1
MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 4.004219 sec
- 12,906,126,385 cycles # 2.980 GHz
- 30,705,078,901 instructions # 2.38 insn per cycle
- 4.386333995 seconds time elapsed
+TOTAL : 4.025391 sec
+ 13,269,588,593 cycles # 3.045 GHz
+ 30,835,242,734 instructions # 2.32 insn per cycle
+ 4.414633876 seconds time elapsed
-------------------------------------------------------------------------
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -76,20 +76,20 @@ Relative difference = 3.5164777671934515e-07
OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.231825e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.232056e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.232056e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.235283e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.235601e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.235601e+01 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.433436 sec
- 19,221,070,763 cycles # 2.989 GHz
- 54,051,803,217 instructions # 2.81 insn per cycle
- 6.438508634 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:32352) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.415522 sec
+ 19,160,739,671 cycles # 2.987 GHz
+ 54,057,163,618 instructions # 2.82 insn per cycle
+ 6.420564139 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:32342) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
@@ -102,19 +102,19 @@ Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.597062e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.597152e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.597152e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.620106e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.620225e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.620225e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.316447 sec
- 10,011,910,485 cycles # 3.022 GHz
- 27,077,067,301 instructions # 2.70 insn per cycle
- 3.322601212 seconds time elapsed
+TOTAL : 3.271379 sec
+ 9,921,721,414 cycles # 3.034 GHz
+ 27,083,114,570 instructions # 2.73 insn per cycle
+ 3.276452626 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:96346) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe
@@ -128,19 +128,19 @@ Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.498130e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.498535e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.498535e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.556228e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.556831e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.556831e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.520366 sec
- 4,291,649,047 cycles # 2.828 GHz
- 9,660,073,588 instructions # 2.25 insn per cycle
- 1.527692767 seconds time elapsed
+TOTAL : 1.491065 sec
+ 4,312,322,670 cycles # 2.886 GHz
+ 9,666,677,538 instructions # 2.24 insn per cycle
+ 1.495685259 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83998) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 3.516375977906115e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.919923e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.920457e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.920457e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.969415e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.970156e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.970156e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.358214 sec
- 3,834,698,928 cycles # 2.828 GHz
- 8,610,372,933 instructions # 2.25 insn per cycle
- 1.364092946 seconds time elapsed
+TOTAL : 1.336444 sec
+ 3,832,216,841 cycles # 2.859 GHz
+ 8,617,148,111 instructions # 2.25 insn per cycle
+ 1.341165118 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83696) (512y: 30) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe
@@ -180,20 +180,20 @@ Relative difference = 3.516375977906115e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.591087e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.591651e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.591651e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.679839e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.680629e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.680629e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.476912 sec
- 2,718,672,476 cycles # 1.838 GHz
- 4,330,529,264 instructions # 1.59 insn per cycle
- 1.489166453 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1884) (512y: 68) (512z:82923)
+TOTAL : 1.441719 sec
+ 2,716,250,084 cycles # 1.880 GHz
+ 4,337,046,897 instructions # 1.60 insn per cycle
+ 1.446804135 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1874) (512y: 67) (512z:82924)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
index da5c4db165..1c0184de14 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
@@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2023-07-18_23:26:42
+DATE: 2023-06-16_23:23:12
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP=
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.067859e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.068768e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.068768e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.061497e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.062582e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.062582e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 2.367000 sec
- 8,097,817,270 cycles # 3.013 GHz
- 18,646,982,810 instructions # 2.30 insn per cycle
- 2.744137164 seconds time elapsed
+TOTAL : 2.433988 sec
+ 8,336,076,363 cycles # 3.005 GHz
+ 17,014,813,881 instructions # 2.04 insn per cycle
+ 2.833188841 seconds time elapsed
runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -68,17 +68,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.159334e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.190870e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.190870e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.176810e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.216166e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.216166e+03 ) sec^-1
MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 4.000218 sec
- 12,996,175,307 cycles # 3.006 GHz
- 31,650,599,303 instructions # 2.44 insn per cycle
- 4.379696037 seconds time elapsed
+TOTAL : 4.016312 sec
+ 13,237,795,425 cycles # 3.038 GHz
+ 28,853,451,424 instructions # 2.18 insn per cycle
+ 4.417206461 seconds time elapsed
-------------------------------------------------------------------------
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -89,20 +89,20 @@ OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.583909e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.584107e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.584107e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.426091e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.426409e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.426409e+01 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.962766 sec
- 19,348,646,238 cycles # 2.778 GHz
- 54,053,167,203 instructions # 2.79 insn per cycle
- 6.966338951 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:32352) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.283292 sec
+ 19,229,856,491 cycles # 3.063 GHz
+ 54,057,430,365 instructions # 2.81 insn per cycle
+ 6.287771357 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:32342) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
@@ -116,19 +116,19 @@ OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.594374e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.594459e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.594459e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.625803e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.625919e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.625919e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.316146 sec
- 10,062,188,532 cycles # 3.032 GHz
- 27,076,347,840 instructions # 2.69 insn per cycle
- 3.320140253 seconds time elapsed
+TOTAL : 3.253969 sec
+ 9,967,840,687 cycles # 3.060 GHz
+ 27,083,402,734 instructions # 2.72 insn per cycle
+ 3.258914943 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:96346) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe
@@ -143,19 +143,19 @@ OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.507457e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.507869e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.507869e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.555664e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.556218e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.556218e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.510254 sec
- 4,286,831,616 cycles # 2.833 GHz
- 9,659,769,469 instructions # 2.25 insn per cycle
- 1.513835398 seconds time elapsed
+TOTAL : 1.491530 sec
+ 4,286,204,468 cycles # 2.869 GHz
+ 9,667,785,714 instructions # 2.26 insn per cycle
+ 1.496036399 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83998) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe
@@ -170,19 +170,19 @@ OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.929207e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.929712e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.929712e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.941069e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.941824e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.941824e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.349459 sec
- 3,830,512,570 cycles # 2.834 GHz
- 8,610,115,617 instructions # 2.25 insn per cycle
- 1.353051659 seconds time elapsed
+TOTAL : 1.348647 sec
+ 3,839,497,436 cycles # 2.840 GHz
+ 8,618,118,436 instructions # 2.24 insn per cycle
+ 1.353595580 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83696) (512y: 30) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe
@@ -197,20 +197,20 @@ OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.588003e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.588528e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.588528e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.666454e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.667164e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.667164e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.481818 sec
- 2,705,909,679 cycles # 1.822 GHz
- 4,330,220,559 instructions # 1.60 insn per cycle
- 1.485587826 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1884) (512y: 68) (512z:82923)
+TOTAL : 1.447325 sec
+ 2,707,628,155 cycles # 1.868 GHz
+ 4,338,013,213 instructions # 1.60 insn per cycle
+ 1.451919961 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1874) (512y: 67) (512z:82924)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
index 92d23c3eb7..1144007b2b 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
@@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2023-07-18_22:51:11
+DATE: 2023-06-16_23:02:29
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.073016e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.073401e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.073545e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.065553e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.065928e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.066074e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 2.461762 sec
- 8,235,337,655 cycles # 2.992 GHz
- 17,886,315,664 instructions # 2.17 insn per cycle
- 2.874745866 seconds time elapsed
+TOTAL : 2.452549 sec
+ 8,383,872,432 cycles # 3.005 GHz
+ 18,091,785,273 instructions # 2.16 insn per cycle
+ 2.848067694 seconds time elapsed
runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1
==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
.........................................................................
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.214723e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.216938e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.217196e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.225084e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.227042e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.227224e+03 ) sec^-1
MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 4.007618 sec
- 12,832,818,033 cycles # 2.963 GHz
- 28,156,306,212 instructions # 2.19 insn per cycle
- 4.389866635 seconds time elapsed
+TOTAL : 4.032176 sec
+ 13,272,438,956 cycles # 3.042 GHz
+ 28,712,791,562 instructions # 2.16 insn per cycle
+ 4.421079188 seconds time elapsed
-------------------------------------------------------------------------
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
@@ -76,20 +76,20 @@ Relative difference = 3.5164777671934515e-07
OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.755069e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.755270e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.755270e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.506737e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.507047e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.507047e+01 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.820022 sec
- 19,341,258,763 cycles # 2.837 GHz
- 54,050,764,807 instructions # 2.79 insn per cycle
- 6.824842050 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:31958) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.213859 sec
+ 19,274,588,808 cycles # 3.101 GHz
+ 54,076,478,702 instructions # 2.81 insn per cycle
+ 6.218544506 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:32261) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
@@ -102,19 +102,19 @@ Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.595686e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.595766e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.595766e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.638242e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.638356e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.638356e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.318479 sec
- 10,050,715,238 cycles # 3.032 GHz
- 27,071,034,271 instructions # 2.69 insn per cycle
- 3.323917457 seconds time elapsed
+TOTAL : 3.237531 sec
+ 9,921,784,579 cycles # 3.067 GHz
+ 27,077,809,738 instructions # 2.73 insn per cycle
+ 3.242137028 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:96273) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe
@@ -128,20 +128,20 @@ Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.443811e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.444242e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.444242e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.488911e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.489451e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.489451e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.543975 sec
- 4,371,174,841 cycles # 2.835 GHz
- 9,670,718,608 instructions # 2.21 insn per cycle
- 1.549228190 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84158) (512y: 0) (512z: 0)
+TOTAL : 1.519935 sec
+ 4,376,850,073 cycles # 2.874 GHz
+ 9,677,370,071 instructions # 2.21 insn per cycle
+ 1.524967874 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84092) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
@@ -154,20 +154,20 @@ Relative difference = 3.516375977906115e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.846767e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.847284e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.847284e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.924415e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.925092e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.925092e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.382320 sec
- 3,806,727,433 cycles # 2.760 GHz
- 8,619,338,106 instructions # 2.26 insn per cycle
- 1.387195733 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83696) (512y: 180) (512z: 0)
+TOTAL : 1.350849 sec
+ 3,841,551,477 cycles # 2.839 GHz
+ 8,626,542,730 instructions # 2.25 insn per cycle
+ 1.355459933 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83791) (512y: 180) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
@@ -180,20 +180,20 @@ Relative difference = 3.516375977906115e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.624534e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.625071e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.625071e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.671815e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.672525e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.672525e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.468920 sec
- 2,705,485,886 cycles # 1.846 GHz
- 4,337,157,031 instructions # 1.60 insn per cycle
- 1.473526428 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1788) (512y: 166) (512z:83063)
+TOTAL : 1.450398 sec
+ 2,709,666,226 cycles # 1.865 GHz
+ 4,344,182,027 instructions # 1.60 insn per cycle
+ 1.455504193 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1787) (512y: 166) (512z:83071)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
index 97cdc10864..4ef1b474b5 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
@@ -35,61 +35,61 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2023-07-18_22:52:14
+DATE: 2023-06-16_23:03:32
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.785287e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.786309e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.786769e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.805716e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.806625e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.806872e+02 ) sec^-1
MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6
-TOTAL : 1.700960 sec
- 5,786,328,792 cycles # 2.984 GHz
- 12,183,728,265 instructions # 2.11 insn per cycle
- 2.046788451 seconds time elapsed
+TOTAL : 1.680704 sec
+ 5,915,505,957 cycles # 3.011 GHz
+ 12,343,112,691 instructions # 2.09 insn per cycle
+ 2.022067086 seconds time elapsed
runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1
==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
.........................................................................
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.338994e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.339786e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.339941e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.312297e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.312960e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.313034e+04 ) sec^-1
MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6
-TOTAL : 1.917438 sec
- 6,439,640,169 cycles # 2.930 GHz
- 13,998,441,228 instructions # 2.17 insn per cycle
- 2.254021552 seconds time elapsed
+TOTAL : 1.956042 sec
+ 6,779,291,735 cycles # 3.014 GHz
+ 14,381,222,230 instructions # 2.12 insn per cycle
+ 2.309303925 seconds time elapsed
-------------------------------------------------------------------------
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 9.849636e-03
-Avg ME (F77/CUDA) = 9.8712405367667715E-003
-Relative difference = 0.0021934350433631634
+Avg ME (F77/CUDA) = 9.8712405367932642E-003
+Relative difference = 0.002193435046052877
OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.620419e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.620672e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.620672e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.908521e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.908794e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.908794e+01 ) sec^-1
MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6
-TOTAL : 6.135818 sec
- 18,512,552,038 cycles # 3.019 GHz
- 53,641,945,364 instructions # 2.90 insn per cycle
- 6.140857892 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:20295) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.935380 sec
+ 18,280,225,429 cycles # 3.079 GHz
+ 53,644,572,574 instructions # 2.93 insn per cycle
+ 5.940879711 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:20329) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
@@ -97,24 +97,24 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 9.847961e-03
-Avg ME (F77/C++) = 9.8479612087551509E-003
-Relative difference = 2.119780432912131e-08
+Avg ME (F77/C++) = 9.8479612087550399E-003
+Relative difference = 2.119779305548787e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.465466e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.465860e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.465860e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.628533e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.628996e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.628996e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6
-TOTAL : 1.533445 sec
- 4,609,129,011 cycles # 3.010 GHz
- 13,757,208,915 instructions # 2.98 insn per cycle
- 1.538063941 seconds time elapsed
+TOTAL : 1.461345 sec
+ 4,500,201,036 cycles # 3.073 GHz
+ 13,763,590,198 instructions # 3.06 insn per cycle
+ 1.466106403 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:96927) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -128,19 +128,19 @@ Relative difference = 3.848767971092077e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.805526e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.807173e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.807173e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.026530e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.028270e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.028270e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187188e-05 +- 9.826769e-06 ) GeV^-6
-TOTAL : 0.788809 sec
- 2,211,972,639 cycles # 2.821 GHz
- 4,863,560,484 instructions # 2.20 insn per cycle
- 0.793786183 seconds time elapsed
+TOTAL : 0.764097 sec
+ 2,191,663,686 cycles # 2.869 GHz
+ 4,871,112,775 instructions # 2.22 insn per cycle
+ 0.768775887 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84275) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -154,19 +154,19 @@ Relative difference = 3.9425359136432956e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.746508e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.748551e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.748551e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.934823e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.937028e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.937028e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187188e-05 +- 9.826769e-06 ) GeV^-6
-TOTAL : 0.692554 sec
- 1,935,906,859 cycles # 2.807 GHz
- 4,335,582,992 instructions # 2.24 insn per cycle
- 0.697192865 seconds time elapsed
+TOTAL : 0.671145 sec
+ 1,934,376,171 cycles # 2.867 GHz
+ 4,342,607,330 instructions # 2.24 insn per cycle
+ 0.675706420 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83944) (512y: 33) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe
@@ -180,20 +180,20 @@ Relative difference = 3.9425359136432956e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.195008e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.197249e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.197249e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.285767e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.288040e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.288040e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187188e-05 +- 9.826768e-06 ) GeV^-6
-TOTAL : 0.744700 sec
- 1,361,609,819 cycles # 1.835 GHz
- 2,186,530,497 instructions # 1.61 insn per cycle
- 0.749913629 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2180) (512y: 41) (512z:83043)
+TOTAL : 0.730681 sec
+ 1,366,589,345 cycles # 1.861 GHz
+ 2,193,139,079 instructions # 1.60 insn per cycle
+ 0.735224545 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2170) (512y: 41) (512z:83044)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt
index 928eec4df2..6f1772cda6 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt
@@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2023-07-18_23:27:45
+DATE: 2023-06-16_23:24:16
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP=
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.815514e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.817339e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.817339e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.655466e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.657233e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.657233e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6
-TOTAL : 1.597944 sec
- 5,581,307,276 cycles # 2.978 GHz
- 11,919,657,124 instructions # 2.14 insn per cycle
- 1.930728577 seconds time elapsed
+TOTAL : 1.660443 sec
+ 5,832,575,389 cycles # 3.001 GHz
+ 12,080,610,113 instructions # 2.07 insn per cycle
+ 2.000524939 seconds time elapsed
runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -68,41 +68,41 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.271403e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.283853e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.283853e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.306235e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.319271e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.319271e+04 ) sec^-1
MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6
-TOTAL : 1.899162 sec
- 6,519,942,412 cycles # 2.998 GHz
- 14,543,846,826 instructions # 2.23 insn per cycle
- 2.231011569 seconds time elapsed
+TOTAL : 1.939786 sec
+ 6,710,577,294 cycles # 3.010 GHz
+ 14,122,488,130 instructions # 2.10 insn per cycle
+ 2.286693438 seconds time elapsed
-------------------------------------------------------------------------
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 9.849636e-03
-Avg ME (F77/CUDA) = 9.8712405367667715E-003
-Relative difference = 0.0021934350433631634
+Avg ME (F77/CUDA) = 9.8712405367932642E-003
+Relative difference = 0.002193435046052877
OK (relative difference <= 5E-3)
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.663821e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.664071e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.664071e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.841510e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.841775e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.841775e+01 ) sec^-1
MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6
-TOTAL : 6.097901 sec
- 18,359,932,945 cycles # 3.010 GHz
- 53,640,128,637 instructions # 2.92 insn per cycle
- 6.101763303 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:20295) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.977425 sec
+ 18,376,830,240 cycles # 3.073 GHz
+ 53,645,367,177 instructions # 2.92 insn per cycle
+ 5.982025088 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:20329) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
@@ -110,25 +110,25 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 9.847961e-03
-Avg ME (F77/C++) = 9.8479612087551509E-003
-Relative difference = 2.119780432912131e-08
+Avg ME (F77/C++) = 9.8479612087550399E-003
+Relative difference = 2.119779305548787e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.496425e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.496832e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.496832e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.623748e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.624254e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.624254e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6
-TOTAL : 1.515582 sec
- 4,606,909,599 cycles # 3.033 GHz
- 13,756,787,975 instructions # 2.99 insn per cycle
- 1.519522272 seconds time elapsed
+TOTAL : 1.463452 sec
+ 4,497,433,282 cycles # 3.068 GHz
+ 13,764,580,046 instructions # 3.06 insn per cycle
+ 1.468085626 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:96927) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -143,19 +143,19 @@ OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.875615e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.877209e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.877209e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.987628e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.989331e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.989331e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187188e-05 +- 9.826769e-06 ) GeV^-6
-TOTAL : 0.773540 sec
- 2,184,612,339 cycles # 2.815 GHz
- 4,863,407,104 instructions # 2.23 insn per cycle
- 0.777199304 seconds time elapsed
+TOTAL : 0.762831 sec
+ 2,199,477,118 cycles # 2.869 GHz
+ 4,871,433,971 instructions # 2.21 insn per cycle
+ 0.767671601 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84275) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -170,19 +170,19 @@ OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.845183e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.847160e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.847160e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.924776e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.926917e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.926917e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187188e-05 +- 9.826769e-06 ) GeV^-6
-TOTAL : 0.678592 sec
- 1,931,890,794 cycles # 2.835 GHz
- 4,335,529,546 instructions # 2.24 insn per cycle
- 0.682138638 seconds time elapsed
+TOTAL : 0.672826 sec
+ 1,945,457,623 cycles # 2.877 GHz
+ 4,343,708,064 instructions # 2.23 insn per cycle
+ 0.677656533 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83944) (512y: 33) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe
@@ -197,20 +197,20 @@ OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.233858e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.236035e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.236035e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.377484e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.379728e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.379728e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187188e-05 +- 9.826768e-06 ) GeV^-6
-TOTAL : 0.735875 sec
- 1,362,688,180 cycles # 1.845 GHz
- 2,186,191,755 instructions # 1.60 insn per cycle
- 0.739511514 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2180) (512y: 41) (512z:83043)
+TOTAL : 0.723682 sec
+ 1,379,747,419 cycles # 1.897 GHz
+ 2,194,225,604 instructions # 1.59 insn per cycle
+ 0.728534863 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2170) (512y: 41) (512z:83044)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt
index 30b8493f61..af71691b16 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt
@@ -35,61 +35,61 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2023-07-18_22:53:01
+DATE: 2023-06-16_23:04:19
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP=
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.762140e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.763059e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.763409e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.674461e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.675325e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.675581e+02 ) sec^-1
MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6
-TOTAL : 1.665282 sec
- 5,832,695,656 cycles # 2.986 GHz
- 12,267,200,402 instructions # 2.10 insn per cycle
- 2.011078808 seconds time elapsed
+TOTAL : 1.698247 sec
+ 5,886,551,862 cycles # 2.951 GHz
+ 11,210,609,464 instructions # 1.90 insn per cycle
+ 2.052173907 seconds time elapsed
runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1
==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
.........................................................................
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.288186e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.288997e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.289092e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.286680e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.287310e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.287386e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.931538 sec - 6,620,092,279 cycles # 2.996 GHz - 14,343,804,186 instructions # 2.17 insn per cycle - 2.265769331 seconds time elapsed +TOTAL : 1.961982 sec + 6,772,975,134 cycles # 3.012 GHz + 13,750,292,328 instructions # 2.03 insn per cycle + 2.305639635 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.849636e-03 -Avg ME (F77/CUDA) = 9.8712405367667715E-003 -Relative difference = 0.0021934350433631634 +Avg ME (F77/CUDA) = 9.8712405367932608E-003 +Relative difference = 0.0021934350460525243 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.736091e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.736341e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.736341e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.000555e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.000832e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.000832e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.049248 sec - 18,311,029,839 cycles # 3.026 GHz - 53,621,886,907 instructions # 2.93 insn per cycle - 6.053266845 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20240) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.871994 sec + 18,105,311,919 cycles # 3.082 GHz + 53,664,310,633 instructions # 2.96 insn per cycle + 5.876674372 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -97,25 +97,25 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087572898E-003 -Relative difference = 2.1198021522715588e-08 +Avg ME (F77/C++) = 9.8479612087571129E-003 +Relative difference = 2.119800355536229e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.476755e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.477150e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.477150e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.589372e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.589815e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.589815e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.523805 sec - 4,595,954,892 cycles # 3.010 GHz - 13,748,613,866 instructions # 2.99 insn per cycle - 1.527785688 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96684) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.476565 sec + 4,561,855,617 cycles # 3.083 GHz + 13,756,278,948 instructions # 3.02 insn per cycle + 1.481667004 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96740) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -128,20 +128,20 @@ Relative difference = 3.849071936588079e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.004655e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.006299e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.006299e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.993067e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.994817e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.994817e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826769e-06 ) GeV^-6 -TOTAL : 0.759115 sec - 2,147,216,619 cycles # 2.818 GHz - 4,869,213,337 instructions # 2.27 insn per cycle - 0.762864770 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84897) (512y: 0) (512z: 0) +TOTAL : 0.761742 sec + 2,188,633,514 cycles # 2.862 GHz + 4,877,048,177 instructions # 2.23 insn per cycle + 0.766373792 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84908) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -154,20 +154,20 @@ Relative difference = 3.9425546409167914e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.805699e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.807659e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.807659e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.971095e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.973270e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.973270e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826769e-06 ) GeV^-6 -TOTAL : 0.681297 sec - 1,924,788,541 cycles # 2.811 GHz - 4,340,748,369 instructions # 2.26 insn per cycle - 0.685335487 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84599) (512y: 22) (512z: 0) +TOTAL : 0.668601 sec + 1,935,360,358 cycles # 2.879 GHz + 4,348,847,241 instructions # 2.25 insn per cycle + 0.673510965 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84638) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -180,20 +180,20 @@ Relative difference = 3.9425546409167914e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.200580e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.202747e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.202747e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.103833e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.106308e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.106308e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826768e-06 ) GeV^-6 -TOTAL : 0.738614 sec - 1,363,086,584 cycles # 1.837 GHz - 2,192,070,777 instructions # 1.61 insn per cycle - 0.742546673 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2739) (512y: 23) (512z:83176) +TOTAL : 0.749331 sec + 1,373,934,827 cycles # 1.825 GHz + 2,200,572,670 instructions # 1.60 insn per cycle + 0.754227181 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2740) (512y: 23) (512z:83143) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index a30ca07357..3a9dd35695 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-07-18_22:53:47 +DATE: 2023-06-16_23:05:06 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.696272e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.696884e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.697052e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.693625e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.694288e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.694409e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.175114 sec - 7,423,433,219 cycles # 2.973 GHz - 16,360,609,172 instructions # 2.20 insn per cycle - 2.553706141 seconds time elapsed +TOTAL : 2.196059 sec + 7,620,057,050 cycles # 3.016 GHz + 15,936,153,152 instructions # 2.09 insn per cycle + 2.583018062 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.112322e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.112640e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.112677e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.108976e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.109249e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.109272e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.402785 sec - 11,079,983,197 cycles # 2.966 GHz - 26,069,232,135 instructions # 2.35 insn per cycle - 3.792325716 seconds time elapsed +TOTAL : 3.430946 sec + 11,408,477,495 cycles # 3.034 GHz + 26,313,998,938 instructions # 2.31 insn per cycle + 3.820144649 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -76,20 +76,20 @@ Relative difference = 3.1385249252060663e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.568753e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.568979e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.568979e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.362446e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.362744e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.362744e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.985657 sec - 19,580,314,386 cycles # 2.803 GHz - 54,288,249,136 instructions # 2.77 insn per cycle - 6.989636952 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:31981) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.325318 sec + 19,430,907,237 cycles # 3.071 GHz + 54,292,603,982 instructions # 2.79 insn per cycle + 6.329867372 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:31977) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.560655e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.560735e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.560735e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.619143e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.619255e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.619255e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.389034 sec - 9,743,077,746 cycles # 2.873 GHz - 26,108,000,956 instructions # 2.68 insn per cycle - 3.392740513 seconds time elapsed +TOTAL : 3.268613 sec + 9,472,675,959 cycles # 2.895 GHz + 26,115,271,176 instructions # 2.76 insn per cycle + 3.273561039 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:95919) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.708056e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.708516e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.708516e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.790808e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.791484e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.791484e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.429005 sec - 4,080,366,942 cycles # 2.850 GHz - 9,329,572,618 instructions # 2.29 insn per cycle - 1.433075771 seconds time elapsed +TOTAL : 1.400732 sec + 4,075,771,818 cycles # 2.902 GHz + 9,331,333,269 instructions # 2.29 insn per cycle + 1.406221756 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83766) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.154770e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.155334e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.155334e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.155112e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.155865e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.155865e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.276074 sec - 3,634,666,906 cycles # 2.842 GHz - 8,305,590,665 instructions # 2.29 insn per cycle - 1.280094579 seconds time elapsed +TOTAL : 1.276847 sec + 3,682,881,798 cycles # 2.877 GHz + 8,307,430,129 instructions # 2.26 insn per cycle + 1.281346988 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83502) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.730920e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.731496e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.731496e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.785211e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.786021e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.786021e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.420967 sec - 2,634,262,273 cycles # 1.850 GHz - 4,226,115,956 instructions # 1.60 insn per cycle - 1.424616315 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1921) (512y: 58) (512z:82636) +TOTAL : 1.403459 sec + 2,639,508,347 cycles # 1.878 GHz + 4,234,654,161 instructions # 1.60 insn per cycle + 1.408336015 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1911) (512y: 57) (512z:82637) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 876a0f0095..cd996fa793 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-07-18_22:54:48 +DATE: 2023-06-16_23:06:07 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.682609e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.683142e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.683311e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.688054e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.688638e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.688761e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.176991 sec - 7,484,781,827 cycles # 2.995 GHz - 15,629,838,380 instructions # 2.09 insn per cycle - 2.555996760 seconds time elapsed +TOTAL : 2.196300 sec + 7,637,726,731 cycles # 3.023 GHz + 15,838,474,776 instructions # 2.07 insn per cycle + 2.583156463 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.109144e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.109457e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.109491e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.108172e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.108543e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.108568e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.407213 sec - 11,182,166,317 cycles # 2.996 GHz - 26,393,726,074 instructions # 2.36 insn per cycle - 3.788702297 seconds time elapsed +TOTAL : 3.433735 sec + 11,452,674,028 cycles # 3.032 GHz + 23,961,349,118 instructions # 2.09 insn per cycle + 3.834644270 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -76,20 +76,20 @@ Relative difference = 3.1385249252060663e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.636770e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.636993e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.636993e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.781184e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.781448e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.781448e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.920565 sec - 19,676,257,089 cycles # 2.843 GHz - 54,278,198,702 instructions # 2.76 insn per cycle - 6.924591734 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32135) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.780445 sec + 19,564,693,523 cycles # 2.884 GHz + 54,298,011,506 instructions # 2.78 insn per cycle + 6.785104515 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32420) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -102,19 +102,19 @@ Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.582580e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.582671e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.582671e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.707967e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.708090e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.708090e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.342366 sec - 9,474,102,869 cycles # 2.833 GHz - 26,022,233,908 instructions # 2.75 insn per cycle - 3.346342144 seconds time elapsed +TOTAL : 3.103057 sec + 9,582,705,373 cycles # 3.085 GHz + 26,028,889,965 instructions # 2.72 insn per cycle + 3.107702792 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:95694) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe @@ -128,20 +128,20 @@ Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.660841e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.661337e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.661337e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.703281e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.703858e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.703858e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.447230 sec - 4,092,458,511 cycles # 2.822 GHz - 9,308,826,047 instructions # 2.27 insn per cycle - 1.451338201 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83499) (512y: 0) (512z: 0) +TOTAL : 1.431633 sec + 4,120,363,711 cycles # 2.872 GHz + 9,310,908,558 instructions # 2.26 insn per cycle + 1.437026482 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83565) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -154,20 +154,20 @@ Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.110235e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.110790e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.110790e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.240493e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.241279e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.241279e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.290318 sec - 3,656,642,519 cycles # 2.828 GHz - 8,300,905,499 instructions # 2.27 insn per cycle - 1.294253720 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83152) (512y: 170) (512z: 0) +TOTAL : 1.251163 sec + 3,616,491,450 cycles # 2.883 GHz + 8,302,817,812 instructions # 2.30 insn per cycle + 1.256135099 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83185) (512y: 170) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.703813e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.704352e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.704352e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.753912e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.754634e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.754634e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.432011 sec - 2,635,765,100 cycles # 1.836 GHz - 4,223,288,334 instructions # 1.60 insn per cycle - 1.436037765 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1429) (512y: 156) (512z:82786) +TOTAL : 1.412974 sec + 2,639,045,649 cycles # 1.864 GHz + 4,230,988,059 instructions # 1.60 insn per cycle + 1.417985866 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1428) (512y: 156) (512z:82779) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index fea83ea522..ac1881c14d 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-07-18_22:48:33 +DATE: 2023-06-16_22:59:46 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.719457e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.472965e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.858342e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.470677e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.403049e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.845489e+07 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.450160 sec - 1,949,025,471 cycles # 2.932 GHz - 2,655,591,019 instructions # 1.36 insn per cycle - 0.737059925 seconds time elapsed +TOTAL : 0.475414 sec + 2,007,563,669 cycles # 2.838 GHz + 2,452,577,465 instructions # 1.22 insn per cycle + 0.764443076 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.347425e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.590288e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.062522e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.377326e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.547761e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.992085e+07 ) sec^-1 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2 -TOTAL : 0.531537 sec - 2,272,333,411 cycles # 2.949 GHz - 3,136,805,865 instructions # 1.38 insn per cycle - 0.828090573 seconds time elapsed +TOTAL : 0.557136 sec + 2,348,204,579 cycles # 2.922 GHz + 2,941,788,306 instructions # 1.25 insn per cycle + 0.862619515 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -76,19 +76,19 @@ Relative difference = 2.984467216677476e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.115690e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.140019e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.140019e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.175937e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.209277e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.209277e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 1.490593 sec - 4,533,884,133 cycles # 3.034 GHz - 12,816,471,728 instructions # 2.83 insn per cycle - 1.495888033 seconds time elapsed +TOTAL : 1.417777 sec + 4,418,078,401 cycles # 3.108 GHz + 12,858,365,147 instructions # 2.91 insn per cycle + 1.423319552 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 733) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe @@ -102,19 +102,19 @@ Relative difference = 2.9844565299804477e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 
11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.063298e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.146504e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.146504e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.113749e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.225511e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.225511e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.816326 sec - 2,478,405,968 cycles # 3.027 GHz - 7,027,255,753 instructions # 2.84 insn per cycle - 0.821401180 seconds time elapsed +TOTAL : 0.797724 sec + 2,444,856,977 cycles # 3.051 GHz + 7,068,441,253 instructions # 2.89 insn per cycle + 0.808817770 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3093) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 2.9844565299804477e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.945941e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.258834e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.258834e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.944151e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.349910e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.349910e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.439436 sec - 1,248,930,157 cycles # 2.833 GHz - 2,800,292,605 instructions # 2.24 insn per cycle - 0.444398293 seconds time elapsed +TOTAL : 0.438420 sec + 1,274,694,276 cycles # 2.885 GHz + 2,842,409,497 instructions # 2.23 insn per cycle + 0.443229097 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2725) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 2.9844659193456305e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc 
--all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.388007e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.771114e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.771114e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.355894e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.841805e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.841805e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.397928 sec - 1,143,495,589 cycles # 2.859 GHz - 2,662,566,955 instructions # 2.33 insn per cycle - 0.403205791 seconds time elapsed +TOTAL : 0.399144 sec + 1,170,164,950 cycles # 2.906 GHz + 2,702,440,747 instructions # 2.31 insn per cycle + 0.403916574 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2530) (512y: 54) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 2.9844659193456305e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.988696e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.161799e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.161799e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.020461e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.251605e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.251605e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.574523 sec - 1,116,040,350 cycles # 1.937 GHz - 1,636,926,421 instructions # 1.47 insn per cycle - 0.579497288 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1064) (512y: 79) (512z: 2134) +TOTAL : 0.567519 sec + 1,146,857,475 cycles # 2.007 GHz + 1,675,706,744 instructions # 1.46 insn per cycle + 0.572452940 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1055) (512y: 78) (512z: 2135) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 2108f76071..9d2980d703 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-07-18_23:25:06 +DATE: 2023-06-16_23:21:34 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.585157e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.158790e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.158790e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.070393e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.287586e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.287586e+07 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.474113 sec - 2,022,739,618 cycles # 2.944 GHz - 2,903,368,607 instructions # 1.44 insn per cycle - 0.744339953 seconds time elapsed +TOTAL : 0.504575 sec + 2,103,581,583 cycles # 2.905 GHz + 2,735,769,174 instructions # 1.30 insn per cycle + 0.783099451 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -68,17 +68,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.243278e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.304833e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.304833e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.869815e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.409855e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.409855e+07 ) sec^-1 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2 -TOTAL : 0.753906 sec - 2,980,726,577 cycles # 2.963 GHz - 4,446,396,110 instructions # 1.49 insn per cycle - 1.064431625 seconds time elapsed +TOTAL : 0.796207 sec + 3,159,417,542 cycles # 2.977 GHz + 4,392,482,036 instructions # 1.39 insn per cycle + 1.122081608 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -89,19 +89,19 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.117698e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.142216e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.142216e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.166207e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.199063e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.199063e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 1.494974 sec - 4,566,562,470 cycles # 3.047 GHz - 12,825,171,048 instructions # 2.81 insn per cycle - 1.499401026 seconds time elapsed +TOTAL : 1.434034 sec + 4,450,678,360 cycles # 3.095 GHz + 12,865,964,603 instructions # 2.89 insn per cycle + 1.439725535 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 733) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe @@ -116,19 +116,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.063593e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.147355e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.147355e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.121625e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.232697e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.232697e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.823092 sec - 2,512,237,038 cycles # 3.038 GHz - 7,078,628,755 instructions # 2.82 insn per cycle - 0.827558458 seconds time elapsed +TOTAL : 0.801253 sec + 2,477,560,136 cycles # 3.079 GHz + 7,119,440,241 instructions # 2.87 insn per cycle + 0.812534403 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3093) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe @@ -143,19 +143,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.904279e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.214110e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.214110e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.934946e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.330898e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.330898e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.449381 sec - 1,289,599,454 cycles # 2.847 GHz - 2,851,042,796 instructions # 2.21 insn per cycle - 0.453830437 seconds time elapsed +TOTAL : 0.446709 sec + 1,309,681,066 cycles # 2.901 GHz + 2,895,588,054 instructions # 2.21 insn per cycle + 0.458145800 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2725) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe @@ -170,19 +170,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.139907e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.488963e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.488963e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.311032e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.799335e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.799335e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.425111 sec - 1,205,515,884 cycles # 2.813 GHz - 2,709,191,392 instructions # 2.25 insn per cycle - 0.429101442 seconds time elapsed +TOTAL : 0.409974 sec + 1,200,991,662 cycles # 2.902 GHz + 2,753,820,763 instructions # 2.29 insn per cycle + 0.420684326 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2530) (512y: 54) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe @@ -197,20 +197,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.990198e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.165668e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.165668e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.052439e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.289835e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.289835e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.578701 sec - 1,149,012,692 cycles # 1.974 GHz - 1,675,958,965 instructions # 1.46 insn per cycle - 0.582686476 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1064) (512y: 79) (512z: 2134) +TOTAL : 0.568236 sec + 1,177,198,332 cycles # 2.056 GHz + 1,715,837,168 instructions # 1.46 insn per cycle + 0.573431193 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1055) (512y: 78) (512z: 2135) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index 76472ab9ff..0ac8c3da88 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-07-18_22:48:49 +DATE: 2023-06-16_23:00:03 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.689816e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.318719e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.685904e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.441903e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.284655e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.707498e+07 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.449232 sec - 1,950,934,078 cycles # 2.940 GHz - 2,676,952,219 instructions # 1.37 insn per cycle - 0.728652348 seconds time elapsed +TOTAL : 0.476149 sec + 2,004,075,040 cycles # 2.858 GHz + 2,447,450,357 instructions # 1.22 insn per cycle + 0.758991361 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.327000e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.433359e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.907754e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.358848e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.448514e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.881423e+07 ) sec^-1 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2 -TOTAL : 0.531318 sec - 2,278,354,923 cycles # 2.955 GHz - 3,168,740,218 instructions # 1.39 insn per cycle - 0.827852908 seconds time elapsed +TOTAL : 0.560373 sec + 2,330,695,105 cycles # 2.896 GHz + 2,941,791,611 instructions # 1.26 insn per cycle + 0.864662823 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -76,19 +76,19 @@ Relative difference = 2.984467216677476e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.128479e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.153366e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.153366e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.185072e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.219142e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.219142e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 1.474456 sec - 4,481,962,559 cycles # 3.034 GHz - 12,694,148,363 instructions # 2.83 insn per cycle - 1.479503853 seconds time elapsed +TOTAL : 1.405955 sec + 4,365,481,367 cycles # 3.097 GHz + 12,734,304,560 instructions # 2.92 insn per cycle + 1.410932077 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 687) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe @@ -102,19 +102,19 @@ Relative difference = 2.9844565299804477e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 
11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.100853e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.187095e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.187095e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.160833e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.277513e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.277513e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.802839 sec - 2,438,392,527 cycles # 3.028 GHz - 6,892,382,300 instructions # 2.83 insn per cycle - 0.807803393 seconds time elapsed +TOTAL : 0.781135 sec + 2,409,696,110 cycles # 3.070 GHz + 6,927,148,000 instructions # 2.87 insn per cycle + 0.792294841 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2942) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 2.9844565299804477e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.609054e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.875068e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.875068e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.625524e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.958602e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.958602e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.477284 sec - 1,366,926,685 cycles # 2.847 GHz - 2,996,684,473 instructions # 2.19 insn per cycle - 0.482654991 seconds time elapsed +TOTAL : 0.474614 sec + 1,386,813,698 cycles # 2.899 GHz + 3,036,853,128 instructions # 2.19 insn per cycle + 0.484841120 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2831) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe @@ -154,20 +154,20 @@ Relative difference = 2.9844659193456305e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc 
--all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.844644e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.136562e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.136562e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.841557e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.216736e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.216736e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.449528 sec - 1,285,397,954 cycles # 2.843 GHz - 2,886,405,649 instructions # 2.25 insn per cycle - 0.454867457 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2577) (512y: 207) (512z: 0) +TOTAL : 0.449897 sec + 1,316,149,525 cycles # 2.902 GHz + 2,929,799,564 instructions # 2.23 insn per cycle + 0.464962802 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2576) (512y: 207) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -180,19 +180,19 @@ Relative difference = 2.9844659193456305e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.819450e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.979721e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.979721e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.868510e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.078585e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.078585e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.606788 sec - 1,164,670,276 cycles # 1.911 GHz - 1,864,104,765 instructions # 1.60 insn per cycle - 0.611858192 seconds time elapsed +TOTAL : 0.597358 sec + 1,194,467,791 cycles # 1.987 GHz + 1,906,072,402 instructions # 1.60 insn per cycle + 0.602593356 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1002) (512y: 185) (512z: 2242) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 97148e4c3d..85404bb68d 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -35,60 +35,60 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-07-18_22:49:05 +DATE: 2023-06-16_23:00:20 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.373634e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.221480e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.361117e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.984648e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.208639e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.373355e+08 ) sec^-1 MeanMatrixElemValue = ( 3.402886e+01 +- 1.677500e+01 ) GeV^-2 -TOTAL : 0.447566 sec - 1,887,922,658 cycles # 2.843 GHz - 2,552,301,791 instructions # 1.35 insn per cycle - 0.727957341 seconds time elapsed +TOTAL : 0.467463 sec + 1,994,268,934 cycles # 2.883 GHz + 2,406,150,579 instructions # 1.21 insn per cycle + 0.748794310 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 168 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.249817e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.823800e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.984824e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.108799e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.846646e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.967345e+08 ) sec^-1 MeanMatrixElemValue = ( 4.166198e+02 +- 2.517590e+02 ) GeV^-2 -TOTAL : 0.480874 sec - 2,072,392,252 cycles # 2.934 GHz - 2,836,354,199 instructions # 1.37 insn per cycle - 0.765550273 seconds time elapsed +TOTAL : 0.507796 sec + 2,130,658,759 cycles # 2.870 GHz + 2,630,610,519 instructions # 1.23 insn per cycle + 0.801554505 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 5.619520e-01 -Avg ME (F77/CUDA) = 0.56225629328206139 -Relative difference = 0.0005414933696496947 +Avg ME (F77/CUDA) = 0.56225629188472226 +Relative difference = 0.0005414908830687532 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.146917e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.173201e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.173201e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.199190e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.228570e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.228570e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422773e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 1.449015 sec - 4,399,695,435 cycles # 3.030 GHz - 12,749,156,633 instructions # 2.90 insn per cycle - 1.453676873 seconds time elapsed +TOTAL : 1.388677 sec + 4,265,371,799 cycles # 3.066 GHz + 12,765,139,156 instructions # 2.99 insn per cycle + 1.393497705 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 701) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe @@ -102,19 +102,19 @@ Relative difference = 1.714833339642312e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.277386e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.502877e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.502877e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.419636e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.674540e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.674540e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422772e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 0.520848 sec - 1,570,490,325 cycles # 3.002 GHz - 4,102,313,565 instructions # 2.61 insn per cycle - 0.525858184 seconds time elapsed +TOTAL : 0.499936 sec + 1,537,781,005 cycles # 3.051 GHz + 4,119,226,953 instructions # 2.68 insn per cycle + 0.510812610 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3693) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe @@ -128,19 +128,19 @@ Relative difference = 4.327561348062349e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.220214e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.344626e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.344626e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.327767e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.544279e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.544279e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.249034 sec - 711,985,333 cycles # 2.837 GHz - 1,644,466,719 instructions # 2.31 insn per cycle - 0.254172664 seconds time elapsed +TOTAL : 0.245130 sec + 718,016,333 cycles # 2.885 GHz + 1,661,722,651 instructions # 2.31 insn per cycle + 0.255967125 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3109) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 7.389204774233901e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 
11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.760904e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.087758e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.087758e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.836550e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.271069e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.271069e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.233357 sec - 666,761,003 cycles # 2.840 GHz - 1,576,999,933 instructions # 2.37 insn per cycle - 0.238145886 seconds time elapsed +TOTAL : 0.230305 sec + 678,464,193 cycles # 2.902 GHz + 1,594,224,390 instructions # 2.35 insn per cycle + 0.240921400 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2954) (512y: 14) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 7.389204774233901e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.650528e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.337145e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.337145e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.868158e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.618302e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.618302e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.314517 sec - 634,920,525 cycles # 2.005 GHz - 1,053,554,769 instructions # 1.66 insn per cycle - 0.319856249 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1459) (512y: 27) (512z: 2277) +TOTAL : 0.302538 sec + 641,516,414 cycles # 2.091 GHz + 1,069,291,782 instructions # 1.67 insn per cycle + 0.314821042 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1530) (512y: 33) (512z: 2277) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 98d232673d..0cb2b651aa 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -35,26 +35,26 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-07-18_23:25:22 +DATE: 2023-06-16_23:21:51 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.603113e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.129465e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.129465e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.619996e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.273991e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.273991e+07 ) sec^-1 MeanMatrixElemValue = ( 3.419752e+01 +- 1.682900e+01 ) GeV^-2 -TOTAL : 0.453868 sec - 1,966,832,335 cycles # 2.966 GHz - 2,819,531,761 instructions # 1.43 insn per cycle - 0.720764500 seconds time elapsed +TOTAL : 0.476940 sec + 2,047,260,675 cycles # 2.896 GHz + 2,612,418,216 instructions # 1.28 insn per cycle + 0.764236233 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -68,40 +68,40 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.114307e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.636350e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.636350e+07 ) sec^-1 -MeanMatrixElemValue = ( 4.349385e+02 +- 2.541442e+02 ) GeV^-2 -TOTAL : 0.622676 sec - 2,539,495,983 cycles # 2.965 GHz - 3,758,648,716 instructions # 1.48 insn per cycle - 0.916087470 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.197562e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.898248e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.898248e+07 ) sec^-1 +MeanMatrixElemValue = ( 4.349381e+02 +- 2.541442e+02 ) GeV^-2 +TOTAL : 0.646628 sec + 2,586,558,663 cycles # 2.919 GHz + 3,548,806,657 instructions # 1.37 insn per cycle + 0.945018609 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 5.619520e-01 -Avg ME (F77/CUDA) = 0.56225629328206139 -Relative difference = 0.0005414933696496947 +Avg ME (F77/CUDA) = 0.56225629188472226 +Relative difference = 0.0005414908830687532 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.152553e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.179376e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.179376e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.208772e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.238538e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.238538e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422773e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 1.444845 sec - 4,413,243,936 cycles # 3.048 GHz - 12,753,279,090 instructions # 2.89 insn per cycle - 1.448548188 seconds time elapsed +TOTAL : 1.380519 sec + 4,277,155,433 cycles # 3.092 GHz + 12,769,582,721 instructions # 2.99 insn per cycle + 1.385373229 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 701) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe @@ -116,19 +116,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.302069e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.527386e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.527386e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.430847e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.687591e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.687591e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422772e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 0.520045 sec - 1,587,034,298 cycles # 3.031 GHz - 4,150,131,626 instructions # 2.62 insn per cycle - 0.524112370 seconds time elapsed +TOTAL : 0.502214 sec + 1,557,572,574 cycles # 3.074 GHz + 4,167,570,923 instructions # 2.68 insn per cycle + 0.514108531 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3693) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe @@ -143,19 +143,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.200645e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.321913e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.321913e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.366004e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.582913e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.582913e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.251605 sec - 730,914,549 cycles # 2.869 GHz - 1,680,419,285 instructions # 2.30 insn per cycle - 0.255420475 seconds time elapsed +TOTAL : 0.247132 sec + 735,373,188 cycles # 2.931 GHz + 1,698,741,216 instructions # 2.31 insn per cycle + 0.251948722 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3109) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe @@ -170,19 +170,19 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.755367e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.077189e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.077189e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.814926e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.180518e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.180518e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.235526 sec - 686,239,324 cycles # 2.873 GHz - 1,613,330,403 instructions # 2.35 insn per cycle - 0.239552259 seconds time elapsed +TOTAL : 0.234804 sec + 694,783,748 cycles # 2.909 GHz + 1,631,382,774 instructions # 2.35 insn per cycle + 0.239954292 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2954) (512y: 14) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe @@ -197,20 +197,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.659534e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.342405e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.342405e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.811317e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.536250e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.536250e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.315940 sec - 653,116,203 cycles # 2.045 GHz - 1,094,435,423 instructions # 1.68 insn per cycle - 0.320064065 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1459) (512y: 27) (512z: 2277) +TOTAL : 0.309567 sec + 660,659,148 cycles # 2.110 GHz + 1,111,031,208 instructions # 1.68 insn per cycle + 0.314367440 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1530) (512y: 33) (512z: 2277) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index b12d09a085..1ed4c388b7 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -35,60 +35,60 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-07-18_22:49:20 +DATE: 2023-06-16_23:00:36 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.401447e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.229959e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.374243e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.864221e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.200538e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.353895e+08 ) sec^-1 MeanMatrixElemValue = ( 3.402886e+01 +- 1.677500e+01 ) GeV^-2 -TOTAL : 0.443674 sec - 1,962,721,667 cycles # 2.933 GHz - 2,652,007,103 instructions # 1.35 insn per cycle - 0.733269571 seconds time elapsed +TOTAL : 0.468538 sec + 1,986,332,099 cycles # 2.873 GHz + 2,412,092,912 instructions # 1.21 insn per cycle + 0.749649403 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 161 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.262517e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.821886e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.970952e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.066200e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.816896e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.937623e+08 ) sec^-1 MeanMatrixElemValue = ( 4.166198e+02 +- 2.517590e+02 ) GeV^-2 -TOTAL : 0.484607 sec - 2,037,651,018 cycles # 2.866 GHz - 2,772,082,982 instructions # 1.36 insn per cycle - 0.769299729 seconds time elapsed +TOTAL : 0.506737 sec + 2,138,564,502 cycles # 2.876 GHz + 2,655,004,925 instructions # 1.24 insn per cycle + 0.801248487 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 5.619520e-01 -Avg ME (F77/CUDA) = 0.56225629328206139 -Relative difference = 0.0005414933696496947 +Avg ME (F77/CUDA) = 0.56225629188472226 +Relative difference = 0.0005414908830687532 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.155470e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.181789e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.181789e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.218143e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.248221e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.248221e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422773e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 1.438342 sec - 4,366,679,772 cycles # 3.029 GHz - 12,656,692,314 instructions # 2.90 insn per cycle - 1.443081537 seconds time elapsed +TOTAL : 1.366158 sec + 4,229,850,435 cycles # 3.088 GHz + 12,672,250,194 instructions # 3.00 insn per cycle + 1.370859371 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 648) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe @@ -102,20 +102,20 @@ Relative difference = 1.714833339642312e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.651322e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.932846e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.932846e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.810631e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.132301e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.132301e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422772e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 0.470070 sec - 1,425,098,128 cycles # 3.024 GHz - 3,997,006,730 instructions # 2.80 insn per cycle - 0.474958236 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3448) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.450376 sec + 1,402,013,020 cycles # 3.084 GHz + 4,014,545,759 instructions # 2.86 insn per cycle + 0.461135794 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3449) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -128,19 +128,19 @@ Relative difference = 4.327561348062349e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.509776e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.152960e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.152960e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.649698e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.339892e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.339892e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.318346 sec - 911,647,543 cycles # 2.842 GHz - 1,926,936,353 instructions # 2.11 insn per cycle - 0.323109774 seconds time elapsed +TOTAL : 0.310950 sec + 917,414,969 cycles # 2.911 GHz + 1,944,404,967 instructions # 2.12 insn per cycle + 0.321816236 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3708) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe @@ -154,19 +154,19 @@ Relative difference = 7.389204774233901e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = 
SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.770278e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.474762e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.474762e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.731171e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.478338e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.478338e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.304991 sec - 875,477,557 cycles # 2.849 GHz - 1,849,411,774 instructions # 2.11 insn per cycle - 0.309728070 seconds time elapsed +TOTAL : 0.306986 sec + 888,396,059 cycles # 2.858 GHz + 1,866,890,226 instructions # 2.10 insn per cycle + 0.318202607 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3561) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 7.389204774233901e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.411391e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.826212e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.826212e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.528801e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.968883e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.968883e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.394437 sec - 787,402,709 cycles # 1.986 GHz - 1,347,443,854 instructions # 1.71 insn per cycle - 0.399399367 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2000) (512y: 19) (512z: 2631) +TOTAL : 0.384950 sec + 796,333,894 cycles # 2.047 GHz + 1,364,853,040 instructions # 1.71 insn per cycle + 0.390179000 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2080) (512y: 25) (512z: 2631) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 04fec1d98e..a86220883d 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-07-18_22:49:35 +DATE: 2023-06-16_23:00:52 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.709586e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.452250e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.859261e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.481426e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.487957e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.943892e+07 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.450283 sec - 1,941,801,386 cycles # 2.921 GHz - 2,647,661,073 instructions # 1.36 insn per cycle - 0.729442761 seconds time elapsed +TOTAL : 0.474499 sec + 2,055,268,717 cycles # 2.894 GHz + 2,489,636,002 instructions # 1.21 insn per cycle + 0.767461965 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.371849e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.637490e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.116945e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.380889e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.587624e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.036549e+07 ) sec^-1 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2 -TOTAL : 0.539348 sec - 2,289,544,773 cycles # 2.936 GHz - 3,186,092,891 instructions # 1.39 insn per cycle - 0.838776921 seconds time elapsed +TOTAL : 0.556289 sec + 2,350,861,575 cycles # 2.922 GHz + 2,945,489,310 instructions # 1.25 insn per cycle + 0.862355038 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -76,19 +76,19 @@ Relative difference = 2.782658397826986e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.114215e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.138593e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.138593e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.161099e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.194017e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.194017e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 1.492939 sec - 4,550,289,046 cycles # 3.040 GHz - 12,789,410,611 instructions # 2.81 insn per cycle - 1.498177791 seconds time elapsed +TOTAL : 1.441531 sec + 4,441,671,913 cycles # 3.079 GHz + 12,830,518,319 instructions # 2.89 insn per cycle + 1.448679311 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 708) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe @@ -102,19 +102,19 @@ Relative difference = 2.608483884671339e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 
11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.053669e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.136903e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.136903e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.046596e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.158375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.158375e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.822011 sec - 2,476,568,425 cycles # 3.006 GHz - 6,934,481,519 instructions # 2.80 insn per cycle - 0.827155036 seconds time elapsed +TOTAL : 0.825457 sec + 2,446,304,474 cycles # 2.949 GHz + 6,976,512,368 instructions # 2.85 insn per cycle + 0.837405524 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe @@ -128,20 +128,20 @@ Relative difference = 2.608483884671339e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.012591e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.333162e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.333162e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.998918e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.420645e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.420645e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.433321 sec - 1,239,207,460 cycles # 2.848 GHz - 2,766,742,055 instructions # 2.23 insn per cycle - 0.438634153 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2871) (512y: 0) (512z: 0) +TOTAL : 0.433627 sec + 1,261,017,634 cycles # 2.881 GHz + 2,809,295,715 instructions # 2.23 insn per cycle + 0.438687115 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2872) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -154,19 +154,19 @@ Relative difference = 2.777561258016791e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.497352e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.897319e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.897319e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.490422e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.009169e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.009169e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.388181 sec - 1,109,768,744 cycles # 2.845 GHz - 2,623,649,502 instructions # 2.36 insn per cycle - 0.393502237 seconds time elapsed +TOTAL : 0.388524 sec + 1,143,580,715 cycles # 2.915 GHz + 2,667,697,890 instructions # 2.33 insn per cycle + 0.393399213 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2679) (512y: 60) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe @@ -180,20 +180,20 @@ Relative difference = 2.777561258016791e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.876331e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.037245e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.037245e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.897672e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.110284e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.110284e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.595472 sec - 1,159,182,207 cycles # 1.940 GHz - 1,669,017,207 instructions # 1.44 insn per cycle - 0.600912637 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1450) (512y: 86) (512z: 2181) +TOTAL : 0.591572 sec + 1,191,668,768 cycles # 2.001 GHz + 1,710,006,963 instructions # 1.43 insn per cycle + 0.596717394 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1441) (512y: 85) (512z: 2182) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index 2b5a0b6c6d..d5360b871e 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -35,38 +35,38 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-07-18_22:49:51 +DATE: 2023-06-16_23:01:09 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.689669e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.339263e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.704986e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.438523e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.296220e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.712443e+07 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.450242 sec - 1,949,757,721 cycles # 2.931 GHz - 2,686,904,735 instructions # 1.38 insn per cycle - 0.728612654 seconds time elapsed +TOTAL : 0.473737 sec + 2,022,437,137 cycles # 2.902 GHz + 2,442,479,633 instructions # 1.21 insn per cycle + 0.756210762 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.331006e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.447818e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.909061e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.355982e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.498538e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.934566e+07 ) sec^-1 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2 -TOTAL : 0.533944 sec - 2,280,719,611 cycles # 2.953 GHz - 3,146,800,171 instructions # 1.38 insn per cycle - 0.829596740 seconds time elapsed +TOTAL : 0.556362 sec + 2,332,917,861 cycles # 2.900 GHz + 2,935,857,970 instructions # 1.26 insn per cycle + 0.862169599 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -76,19 +76,19 @@ Relative difference = 2.782658397826986e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.125391e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.150066e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.150066e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.170795e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.204914e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.204914e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 1.478266 sec - 4,503,135,010 cycles # 3.039 GHz - 12,670,550,662 instructions # 2.81 insn per cycle - 1.483617158 seconds time elapsed +TOTAL : 1.422352 sec + 4,386,288,323 cycles # 3.077 GHz + 12,708,814,441 instructions # 2.90 insn per cycle + 1.427223028 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 659) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe @@ -102,19 +102,19 @@ Relative difference = 2.608483884671339e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 
11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.086785e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.174219e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.174219e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.148376e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.266083e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.266083e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.809175 sec - 2,426,229,799 cycles # 2.989 GHz - 6,736,787,469 instructions # 2.78 insn per cycle - 0.814615473 seconds time elapsed +TOTAL : 0.786032 sec + 2,382,887,401 cycles # 3.016 GHz + 6,777,982,529 instructions # 2.84 insn per cycle + 0.796666620 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3010) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe @@ -128,20 +128,20 @@ Relative difference = 2.608483884671339e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.296004e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.526619e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.526619e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.604898e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.935937e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.935937e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.521966 sec - 1,383,320,776 cycles # 2.637 GHz - 2,945,181,934 instructions # 2.13 insn per cycle - 0.526914313 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3009) (512y: 0) (512z: 0) +TOTAL : 0.478024 sec + 1,400,367,251 cycles # 2.905 GHz + 2,987,020,671 instructions # 2.13 insn per cycle + 0.488917156 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3010) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -154,19 +154,19 @@ Relative difference = 2.777561258016791e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.859313e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.151231e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.151231e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.857659e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.247357e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.247357e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.448380 sec - 1,279,235,194 cycles # 2.838 GHz - 2,829,550,880 instructions # 2.21 insn per cycle - 0.453442975 seconds time elapsed +TOTAL : 0.448265 sec + 1,311,795,258 cycles # 2.900 GHz + 2,871,509,338 instructions # 2.19 insn per cycle + 0.453504742 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2738) (512y: 216) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe @@ -180,19 +180,19 @@ Relative difference = 2.777561258016791e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.833583e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.996794e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.996794e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.874954e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.085752e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.085752e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.604455 sec - 1,180,479,629 cycles # 1.945 GHz - 1,824,076,834 instructions # 1.55 insn per cycle - 0.609837052 seconds time elapsed +TOTAL : 0.594897 sec + 1,202,081,331 cycles # 2.008 GHz + 1,864,012,772 instructions # 1.55 insn per cycle + 0.599852743 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1344) (512y: 191) (512z: 2311) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe From 4014dfa67bc17fec092ebdbd4c9eb03207ea3ebf Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 19 Jul 2023 09:17:15 +0200 Subject: [PATCH 389/509] [jthip] go back to 14 upstream/master CODEGEN logs for easier merging git checkout 
upstream/master `gitls */CODEGEN*txt` --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 22 ++++---- .../CODEGEN_cudacpp_ee_mumu_log.txt | 16 +++--- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 30 +++++------ .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 26 +++++----- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 50 +++++++++---------- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 38 +++++++------- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 32 ++++++------ .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 42 ++++++++-------- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 36 ++++++------- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 48 +++++++++--------- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 40 +++++++-------- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 38 +++++++------- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 30 +++++------ .../CODEGEN_cudacpp_heft_gg_h_log.txt | 24 ++++----- 14 files changed, 236 insertions(+), 236 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index fc7f62d186..528176e84e 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005497932434082031  +DEBUG: model prefixing takes 0.004788637161254883  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -176,7 +176,7 @@ INFO: Creating files in directory P1_epem_mupmum DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1028]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1029]  DEBUG: proc_id =  1 [model_handling.py at line 1034]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1286]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1288]  @@ -191,7 +191,7 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2] [model_handling.py at line 1152]  DEBUG: multi_channel =  {1: [0], 2: [1]} [model_handling.py at line 1158]  DEBUG: multi_channel_map =  {1: [0], 2: [1]} [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {1: 1, 2: 2} [model_handling.py at line 1700]  +DEBUG: diag_to_config =  {1: 1, 2: 2} [model_handling.py at line 1698]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -210,19 +210,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.096 s +Wrote files for 8 helas calls in 0.094 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.188 s +ALOHA: aloha creates 3 routines in 0.260 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.238 s +ALOHA: aloha creates 7 routines in 0.239 s FFV1 FFV1 FFV2 @@ -260,6 +260,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.891s -user 0m1.628s -sys 0m0.195s +real 0m1.855s +user 0m1.538s +sys 0m0.189s diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index f5f2ead4aa..138f426e62 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005217790603637695  +DEBUG: model prefixing takes 0.004713535308837891  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -185,7 +185,7 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1151]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1152]  DEBUG: multi_channel_map =  None [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1700]  +DEBUG: diag_to_config =  {} [model_handling.py at line 1698]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -202,7 +202,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.252 s +ALOHA: aloha creates 4 routines in 0.229 s FFV1 FFV1 FFV2 @@ -231,6 +231,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.762s -user 0m0.624s -sys 0m0.057s +real 0m0.652s +user 0m0.574s +sys 0m0.066s diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index babfee914b..18208a863b 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005231142044067383  +DEBUG: model prefixing takes 0.004682779312133789  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttx DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1028]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1029]  DEBUG: proc_id =  1 [model_handling.py at line 1034]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1286]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1288]  @@ -192,11 +192,11 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 3] [model_handling.py at line 1152]  DEBUG: multi_channel =  {1: [0], 2: [1], 3: [2]} [model_handling.py at line 1158]  DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [2]} [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1700]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  +DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1698]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -214,16 +214,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.110 s +Wrote files for 10 helas calls in 0.109 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.137 s +ALOHA: aloha creates 2 routines in 0.126 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.126 s +ALOHA: aloha creates 4 routines in 0.113 s VVV1 FFV1 FFV1 @@ -266,6 +266,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.707s -user 0m1.508s -sys 0m0.188s +real 0m1.778s +user 0m1.425s +sys 0m0.206s diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index b017693308..b8f6269784 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +No valid eps viewer found. 
Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005097150802612305  +DEBUG: model prefixing takes 0.005010843276977539  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.007 s Total: 1 processes with 3 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_SA_OUTPUT @@ -186,11 +186,11 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1151]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1152]  DEBUG: multi_channel_map =  None [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1700]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  +DEBUG: diag_to_config =  {} [model_handling.py at line 1698]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -205,7 +205,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.136 s +ALOHA: aloha creates 2 routines in 0.123 s VVV1 FFV1 FFV1 @@ -239,6 +239,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. 
a DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m2.710s -user 0m0.550s -sys 0m0.052s +real 0m0.581s +user 0m0.514s +sys 0m0.058s diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index aa1674d4d2..76ccc27b8e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~; add process g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005077838897705078  +DEBUG: model prefixing takes 0.004670143127441406  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -162,7 +162,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.019 s +1 processes with 16 diagrams generated in 0.018 s Total: 2 processes with 19 diagrams output madevent CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -186,7 +186,7 @@ INFO: Creating files in directory P2_gg_ttxg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1028]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1029]  DEBUG: proc_id =  1 [model_handling.py at line 1034]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1286]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1288]  @@ -201,13 +201,13 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [model_handling.py at line 1152]  DEBUG: multi_channel =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4], 6: [5], 7: [6], 8: [7], 9: [8], 10: [9], 11: [10], 12: [11], 13: [12], 14: [13], 15: [14]} [model_handling.py at line 1158]  DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4], 6: [5], 7: [6], 8: [7], 9: [8], 10: [9], 11: [10], 12: [11], 13: [12], 14: [13], 15: [14]} [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1700]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1813]  +DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1698]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1811]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -228,7 +228,7 @@ INFO: Creating files in directory P1_gg_ttx DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1028]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1029]  DEBUG: proc_id =  1 [model_handling.py at line 1034]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1286]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1288]  @@ -243,11 +243,11 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 3] [model_handling.py at line 1152]  DEBUG: multi_channel =  {1: [0], 2: [1], 3: [2]} [model_handling.py at line 1158]  DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [2]} [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1700]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  +DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1698]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -264,15 +264,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 2 subprocesses (19 diagrams) in 0.041 s -Wrote files for 46 helas calls in 0.261 s +Generated helas calls for 2 subprocesses (19 diagrams) in 0.039 s +Wrote files for 46 helas calls in 0.248 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.307 s +ALOHA: aloha creates 5 routines in 0.277 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -280,7 +280,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.293 s +ALOHA: aloha creates 10 routines in 0.278 s VVV1 VVV1 FFV1 @@ -328,6 +328,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.335s -user 0m2.066s -sys 0m0.192s +real 0m2.229s +user 0m1.925s +sys 0m0.210s diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index f273353c34..e56e6dfb27 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005414009094238281  +DEBUG: model prefixing takes 0.004709720611572266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.021 s +1 processes with 16 diagrams generated in 0.019 s Total: 1 processes with 16 diagrams output madevent CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1028]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1029]  DEBUG: proc_id =  1 [model_handling.py at line 1034]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1286]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1288]  @@ -192,13 +192,13 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [model_handling.py at line 1152]  DEBUG: multi_channel =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4], 6: [5], 7: [6], 8: [7], 9: [8], 10: [9], 11: [10], 12: [11], 13: [12], 14: [13], 15: [14]} [model_handling.py at line 1158]  DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4], 6: [5], 7: [6], 8: [7], 9: [8], 10: [9], 11: [10], 12: [11], 13: [12], 14: [13], 15: [14]} [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1700]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1813]  +DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1698]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1811]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -215,15 +215,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s -Wrote files for 36 helas calls in 0.160 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.035 s +Wrote files for 36 helas calls in 0.153 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.309 s +ALOHA: aloha creates 5 routines in 0.290 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -231,7 +231,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.294 s +ALOHA: aloha creates 10 routines in 0.267 s VVV1 VVV1 FFV1 @@ -279,6 +279,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.084s -user 0m1.962s -sys 0m0.184s +real 0m2.132s +user 0m1.824s +sys 0m0.213s diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index e8d2db38ce..49a78d1df6 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00521087646484375  +DEBUG: model prefixing takes 0.004593372344970703  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.020 s Total: 1 processes with 16 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -186,13 +186,13 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1151]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1152]  DEBUG: multi_channel_map =  None [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1700]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1813]  +DEBUG: diag_to_config =  {} [model_handling.py at line 1698]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1811]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -202,7 +202,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1430]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxg.txt [model_handling.py at line 1324]  -Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.034 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -210,7 +210,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.313 s +ALOHA: aloha creates 5 routines in 0.275 s VVV1 VVV1 FFV1 @@ -249,6 +249,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. 
DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.981s -user 0m0.792s -sys 0m0.066s +real 0m0.820s +user 0m0.731s +sys 0m0.052s diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index cc40481acf..3c85563e1c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0049550533294677734  +DEBUG: model prefixing takes 0.004593610763549805  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.157 s +1 processes with 123 diagrams generated in 0.144 s Total: 1 processes with 123 diagrams output madevent CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxgg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1028]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1029]  DEBUG: proc_id =  1 [model_handling.py at line 1034]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1286]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1288]  @@ -192,15 +192,15 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [model_handling.py at line 1152]  DEBUG: multi_channel =  {1: [1], 2: [2], 3: [3], 4: [4], 5: [5], 6: [6], 7: [7], 8: [8], 9: [9], 10: [10], 11: [11], 12: [12], 13: [13], 14: [14], 15: [15], 16: [16], 17: [17], 18: [18], 19: [19], 20: [20], 21: [21], 22: [22], 23: [23], 24: [24], 25: [25], 26: [26], 27: [27], 28: [28], 29: [29], 30: [30], 31: [32], 32: [33], 33: [34], 34: [35], 35: [36], 36: [37], 37: [38], 38: [39], 39: [40], 40: [41], 41: [42], 42: [43], 43: [44], 44: [45], 45: [46], 46: [48], 47: [49], 48: [50], 49: [51], 50: [52], 51: [53], 52: [54], 53: [55], 54: [56], 55: [58], 56: [59], 57: [60], 58: [61], 59: [62], 60: [63], 61: [64], 62: [65], 63: [66], 64: [67], 65: [68], 66: [69], 67: [70], 68: [71], 69: [72], 70: [74], 71: [75], 72: [76], 73: [77], 74: [78], 75: [79], 76: [80], 77: [81], 78: [82], 79: [83], 80: [84], 81: [85], 82: [86], 83: [87], 84: [88], 85: [89], 86: [90], 87: [91], 88: [93], 89: [94], 90: [95], 91: [96], 92: [97], 93: [98], 94: [100], 95: [101], 96: [102], 97: [103], 98: [104], 99: [105], 100: [107], 101: [108], 102: [109], 103: [110], 104: [111], 105: [112]} [model_handling.py at line 1158]  DEBUG: multi_channel_map =  {1: [1], 2: [2], 3: [3], 4: [4], 5: [5], 6: [6], 7: [7], 8: [8], 9: [9], 10: [10], 11: [11], 12: [12], 13: [13], 14: [14], 15: [15], 16: [16], 17: [17], 18: [18], 19: [19], 20: [20], 21: [21], 22: [22], 23: [23], 24: [24], 25: [25], 26: [26], 27: [27], 28: [28], 29: [29], 30: [30], 31: [32], 32: [33], 33: [34], 34: [35], 35: [36], 36: [37], 37: [38], 38: [39], 39: [40], 40: [41], 41: [42], 42: [43], 43: [44], 44: [45], 45: [46], 46: [48], 47: [49], 48: [50], 49: [51], 50: [52], 51: [53], 52: [54], 53: [55], 54: [56], 55: [58], 56: [59], 57: [60], 58: [61], 59: [62], 60: [63], 61: [64], 62: [65], 63: [66], 64: [67], 65: [68], 66: [69], 67: [70], 68: [71], 69: [72], 70: [74], 71: [75], 72: [76], 73: [77], 74: [78], 75: [79], 76: [80], 77: [81], 78: [82], 79: [83], 80: [84], 81: [85], 82: [86], 83: [87], 84: [88], 85: [89], 86: [90], 87: [91], 88: [93], 89: [94], 90: [95], 91: [96], 92: [97], 93: [98], 94: [100], 95: [101], 96: [102], 97: [103], 98: [104], 99: [105], 100: [107], 101: [108], 102: [109], 103: [110], 104: [111], 105: [112]} [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, 11: 8, 12: 9, 13: 10, 14: 11, 15: 12, 16: 13, 17: 14, 18: 15, 19: 16, 20: 17, 21: 18, 22: 19, 23: 20, 24: 21, 25: 22, 26: 23, 27: 24, 28: 25, 29: 26, 30: 27, 31: 28, 32: 29, 33: 30, 37: 31, 38: 32, 39: 33, 40: 34, 41: 35, 42: 36, 43: 37, 44: 38, 45: 39, 46: 40, 47: 41, 48: 42, 49: 43, 50: 44, 51: 45, 55: 46, 56: 47, 57: 48, 58: 49, 59: 50, 60: 51, 61: 52, 62: 53, 63: 54, 67: 55, 68: 56, 69: 57, 70: 58, 71: 59, 72: 60, 73: 
61, 74: 62, 75: 63, 76: 64, 77: 65, 78: 66, 79: 67, 80: 68, 81: 69, 85: 70, 86: 71, 87: 72, 88: 73, 89: 74, 90: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 100: 85, 101: 86, 102: 87, 106: 88, 107: 89, 108: 90, 109: 91, 110: 92, 111: 93, 115: 94, 116: 95, 117: 96, 118: 97, 119: 98, 120: 99, 124: 100, 125: 101, 126: 102, 127: 103, 128: 104, 129: 105} [model_handling.py at line 1700]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1813]  +DEBUG: diag_to_config =  {4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, 11: 8, 12: 9, 13: 10, 14: 11, 15: 12, 16: 13, 17: 14, 18: 15, 19: 16, 20: 17, 21: 18, 22: 19, 23: 20, 24: 21, 25: 22, 26: 23, 27: 24, 28: 25, 29: 26, 30: 27, 31: 28, 32: 29, 33: 30, 37: 31, 38: 32, 39: 33, 40: 34, 41: 35, 42: 36, 43: 37, 44: 38, 45: 39, 46: 40, 47: 41, 48: 42, 49: 43, 50: 44, 51: 45, 55: 46, 56: 47, 57: 48, 58: 49, 59: 50, 60: 51, 61: 52, 62: 53, 63: 54, 67: 55, 68: 56, 69: 57, 70: 58, 71: 59, 72: 60, 73: 61, 74: 62, 75: 63, 76: 64, 77: 65, 78: 66, 79: 67, 80: 68, 81: 69, 85: 70, 86: 71, 87: 72, 88: 73, 89: 74, 90: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 100: 85, 101: 86, 102: 87, 106: 88, 107: 89, 108: 90, 109: 91, 110: 92, 111: 93, 115: 94, 116: 95, 117: 96, 118: 97, 119: 98, 120: 99, 124: 100, 125: 101, 126: 102, 127: 103, 128: 104, 129: 105} [model_handling.py at line 1698]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1811]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.413 s -Wrote files for 222 helas calls in 0.707 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.380 s +Wrote files for 222 helas calls in 0.696 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.316 s +ALOHA: aloha creates 5 routines in 0.281 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.296 s +ALOHA: aloha creates 10 routines in 0.287 s VVV1 VVV1 FFV1 @@ -284,6 +284,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.597s -user 0m3.023s -sys 0m0.207s +real 0m3.208s +user 0m2.774s +sys 0m0.232s diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 4f93261b95..a14b6d40d3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005167722702026367  +DEBUG: model prefixing takes 0.004655122756958008  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.152 s +1 processes with 123 diagrams generated in 0.143 s Total: 1 processes with 123 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -186,15 +186,15 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1151]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1152]  DEBUG: multi_channel_map =  None [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1700]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1813]  +DEBUG: diag_to_config =  {} [model_handling.py at line 1698]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1811]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -204,7 +204,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1430]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxgg.txt [model_handling.py at line 1324]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.406 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.380 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -212,7 +212,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.308 s +ALOHA: aloha creates 5 routines in 0.271 s VVV1 VVV1 FFV1 @@ -254,6 +254,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m1.524s -user 0m1.397s -sys 0m0.070s +real 0m1.382s +user 0m1.312s +sys 0m0.060s diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 5b1f36bf57..99d1f8f4a8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004990100860595703  +DEBUG: model prefixing takes 0.004646778106689453  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.812 s +1 processes with 1240 diagrams generated in 1.702 s Total: 1 processes with 1240 diagrams output madevent CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -175,11 +175,11 @@ INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1592 term in 33s. Introduce 2768 contraction +INFO: Color-Flow passed to 1592 term in 30s. Introduce 2768 contraction DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1028]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1029]  DEBUG: proc_id =  1 [model_handling.py at line 1034]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1286]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1288]  @@ -194,17 +194,17 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 0, 3, 4, 0, 5, 6, 0, 0, 0, 0, 0, 7, 8, 9, 0, 10, 11, 12, 0, 13, 14, 15, 0, 16, 17, 18, 19, 20, 21, 0, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 0, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 0, 67, 68, 69, 70, 71, 72, 73, 74, 75, 0, 76, 77, 78, 79, 80, 81, 82, 83, 84, 0, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 0, 0, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 0, 121, 122, 0, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 0, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 0, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 0, 197, 198, 199, 200, 201, 202, 0, 203, 204, 205, 206, 207, 208, 0, 209, 210, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 0, 226, 227, 0, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 0, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 0, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 0, 302, 303, 304, 305, 306, 307, 0, 308, 309, 310, 311, 312, 313, 0, 314, 315, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 316, 317, 318, 319, 320, 321, 0, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 0, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 0, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 0, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 0, 378, 379, 0, 380, 381, 0, 0, 0, 0, 0, 382, 383, 384, 385, 386, 387, 
388, 389, 390, 0, 391, 392, 393, 394, 395, 396, 397, 398, 399, 0, 400, 401, 402, 403, 404, 405, 406, 407, 408, 0, 409, 410, 411, 412, 413, 414, 0, 415, 416, 417, 418, 419, 420, 0, 0, 0, 421, 422, 423, 424, 425, 426, 0, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 0, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 0, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 0, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 0, 483, 484, 0, 485, 486, 0, 0, 0, 0, 0, 487, 488, 489, 490, 491, 492, 493, 494, 495, 0, 496, 497, 498, 499, 500, 501, 502, 503, 504, 0, 505, 506, 507, 508, 509, 510, 511, 512, 513, 0, 514, 515, 516, 517, 518, 519, 0, 520, 521, 522, 523, 524, 525, 0, 0, 0, 526, 527, 528, 529, 530, 531, 0, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 0, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 0, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 0, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 0, 588, 589, 0, 590, 591, 0, 0, 0, 0, 0, 592, 593, 594, 595, 596, 597, 598, 599, 600, 0, 601, 602, 603, 604, 605, 606, 607, 608, 609, 0, 610, 611, 612, 613, 614, 615, 616, 617, 618, 0, 619, 620, 621, 622, 623, 624, 0, 625, 626, 627, 628, 629, 630, 0, 0, 0, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 0, 664, 665, 666, 667, 668, 669, 0, 670, 671, 672, 673, 674, 675, 0, 0, 0, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 0, 709, 710, 711, 712, 713, 714, 0, 715, 716, 717, 718, 719, 720, 0, 0, 0, 721, 722, 0, 723, 724, 0, 725, 726, 0, 0, 0, 0, 0, 727, 728, 729, 730, 731, 732, 733, 734, 735, 0, 736, 737, 738, 739, 740, 741, 742, 743, 744, 0, 745, 746, 747, 748, 749, 750, 751, 752, 753, 0, 754, 755, 756, 757, 758, 759, 0, 760, 761, 762, 763, 764, 765, 766, 767, 0, 768, 769, 0, 770, 771, 0, 0, 0, 0, 0, 772, 773, 774, 775, 776, 777, 778, 779, 780, 0, 781, 782, 783, 784, 785, 786, 787, 788, 789, 0, 790, 791, 792, 793, 794, 795, 796, 797, 798, 0, 799, 800, 801, 802, 803, 804, 0, 805, 806, 807, 808, 809, 810, 811, 812, 0, 813, 814, 0, 815, 816, 0, 0, 0, 0, 0, 817, 818, 819, 820, 821, 822, 823, 824, 825, 0, 826, 827, 828, 829, 830, 831, 832, 833, 834, 0, 835, 836, 837, 838, 839, 840, 841, 842, 843, 0, 844, 845, 846, 847, 848, 849, 0, 850, 851, 852, 853, 854, 855, 856, 857, 0, 858, 859, 0, 860, 861, 0, 0, 0, 0, 862, 863, 0, 864, 865, 0, 866, 867, 0, 0, 0, 0, 868, 869, 0, 870, 871, 0, 872, 873, 0, 0, 0, 0, 0, 0, 0, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 0, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 0, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 0, 928, 929, 930, 931, 932, 933, 0, 934, 935, 936, 937, 938, 939, 0, 940, 941, 942, 943, 944, 945, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [model_handling.py at line 1152]  DEBUG: multi_channel =  {1: [0], 2: [1], 
3: [3], 4: [4], 5: [6], 6: [7], 7: [13], 8: [14], 9: [15], 10: [17], 11: [18], 12: [19], 13: [21], 14: [22], 15: [23], 16: [25], 17: [26], 18: [27], 19: [28], 20: [29], 21: [30], 22: [32], 23: [33], 24: [34], 25: [35], 26: [36], 27: [37], 28: [38], 29: [39], 30: [40], 31: [41], 32: [42], 33: [43], 34: [44], 35: [45], 36: [46], 37: [48], 38: [49], 39: [50], 40: [51], 41: [52], 42: [53], 43: [54], 44: [55], 45: [56], 46: [57], 47: [58], 48: [59], 49: [60], 50: [61], 51: [62], 52: [64], 53: [65], 54: [66], 55: [67], 56: [68], 57: [69], 58: [70], 59: [71], 60: [72], 61: [73], 62: [74], 63: [75], 64: [76], 65: [77], 66: [78], 67: [80], 68: [81], 69: [82], 70: [83], 71: [84], 72: [85], 73: [86], 74: [87], 75: [88], 76: [90], 77: [91], 78: [92], 79: [93], 80: [94], 81: [95], 82: [96], 83: [97], 84: [98], 85: [100], 86: [101], 87: [102], 88: [103], 89: [104], 90: [105], 91: [106], 92: [107], 93: [108], 94: [109], 95: [110], 96: [111], 97: [112], 98: [113], 99: [114], 100: [115], 101: [116], 102: [117], 103: [118], 104: [119], 105: [120], 106: [123], 107: [124], 108: [125], 109: [126], 110: [127], 111: [128], 112: [129], 113: [130], 114: [131], 115: [132], 116: [133], 117: [134], 118: [135], 119: [136], 120: [137], 121: [139], 122: [140], 123: [142], 124: [143], 125: [144], 126: [145], 127: [146], 128: [147], 129: [148], 130: [149], 131: [150], 132: [151], 133: [152], 134: [153], 135: [154], 136: [155], 137: [156], 138: [158], 139: [159], 140: [160], 141: [161], 142: [162], 143: [163], 144: [164], 145: [165], 146: [166], 147: [167], 148: [168], 149: [169], 150: [170], 151: [171], 152: [172], 153: [174], 154: [175], 155: [176], 156: [177], 157: [178], 158: [179], 159: [180], 160: [181], 161: [182], 162: [183], 163: [184], 164: [185], 165: [186], 166: [187], 167: [188], 168: [189], 169: [190], 170: [191], 171: [192], 172: [193], 173: [194], 174: [195], 175: [196], 176: [197], 177: [198], 178: [199], 179: [200], 180: [201], 181: [202], 182: [203], 183: [204], 184: [205], 185: [206], 186: [207], 187: [208], 188: [209], 189: [210], 190: [211], 191: [212], 192: [213], 193: [214], 194: [215], 195: [216], 196: [217], 197: [219], 198: [220], 199: [221], 200: [222], 201: [223], 202: [224], 203: [226], 204: [227], 205: [228], 206: [229], 207: [230], 208: [231], 209: [233], 210: [234], 211: [246], 212: [247], 213: [248], 214: [249], 215: [250], 216: [251], 217: [252], 218: [253], 219: [254], 220: [255], 221: [256], 222: [257], 223: [258], 224: [259], 225: [260], 226: [262], 227: [263], 228: [265], 229: [266], 230: [267], 231: [268], 232: [269], 233: [270], 234: [271], 235: [272], 236: [273], 237: [274], 238: [275], 239: [276], 240: [277], 241: [278], 242: [279], 243: [281], 244: [282], 245: [283], 246: [284], 247: [285], 248: [286], 249: [287], 250: [288], 251: [289], 252: [290], 253: [291], 254: [292], 255: [293], 256: [294], 257: [295], 258: [297], 259: [298], 260: [299], 261: [300], 262: [301], 263: [302], 264: [303], 265: [304], 266: [305], 267: [306], 268: [307], 269: [308], 270: [309], 271: [310], 272: [311], 273: [312], 274: [313], 275: [314], 276: [315], 277: [316], 278: [317], 279: [318], 280: [319], 281: [320], 282: [321], 283: [322], 284: [323], 285: [324], 286: [325], 287: [326], 288: [327], 289: [328], 290: [329], 291: [330], 292: [331], 293: [332], 294: [333], 295: [334], 296: [335], 297: [336], 298: [337], 299: [338], 300: [339], 301: [340], 302: [342], 303: [343], 304: [344], 305: [345], 306: [346], 307: [347], 308: [349], 309: [350], 310: [351], 311: [352], 312: [353], 313: [354], 314: [356], 
315: [357], 316: [369], 317: [370], 318: [371], 319: [372], 320: [373], 321: [374], 322: [376], 323: [377], 324: [378], 325: [379], 326: [380], 327: [381], 328: [382], 329: [383], 330: [384], 331: [385], 332: [386], 333: [387], 334: [388], 335: [389], 336: [390], 337: [392], 338: [393], 339: [394], 340: [395], 341: [396], 342: [397], 343: [398], 344: [399], 345: [400], 346: [401], 347: [402], 348: [403], 349: [404], 350: [405], 351: [406], 352: [408], 353: [409], 354: [410], 355: [411], 356: [412], 357: [413], 358: [414], 359: [415], 360: [416], 361: [417], 362: [418], 363: [419], 364: [420], 365: [421], 366: [422], 367: [424], 368: [425], 369: [426], 370: [427], 371: [428], 372: [429], 373: [430], 374: [431], 375: [432], 376: [433], 377: [434], 378: [436], 379: [437], 380: [439], 381: [440], 382: [446], 383: [447], 384: [448], 385: [449], 386: [450], 387: [451], 388: [452], 389: [453], 390: [454], 391: [456], 392: [457], 393: [458], 394: [459], 395: [460], 396: [461], 397: [462], 398: [463], 399: [464], 400: [466], 401: [467], 402: [468], 403: [469], 404: [470], 405: [471], 406: [472], 407: [473], 408: [474], 409: [476], 410: [477], 411: [478], 412: [479], 413: [480], 414: [481], 415: [483], 416: [484], 417: [485], 418: [486], 419: [487], 420: [488], 421: [492], 422: [493], 423: [494], 424: [495], 425: [496], 426: [497], 427: [499], 428: [500], 429: [501], 430: [502], 431: [503], 432: [504], 433: [505], 434: [506], 435: [507], 436: [508], 437: [509], 438: [510], 439: [511], 440: [512], 441: [513], 442: [515], 443: [516], 444: [517], 445: [518], 446: [519], 447: [520], 448: [521], 449: [522], 450: [523], 451: [524], 452: [525], 453: [526], 454: [527], 455: [528], 456: [529], 457: [531], 458: [532], 459: [533], 460: [534], 461: [535], 462: [536], 463: [537], 464: [538], 465: [539], 466: [540], 467: [541], 468: [542], 469: [543], 470: [544], 471: [545], 472: [547], 473: [548], 474: [549], 475: [550], 476: [551], 477: [552], 478: [553], 479: [554], 480: [555], 481: [556], 482: [557], 483: [559], 484: [560], 485: [562], 486: [563], 487: [569], 488: [570], 489: [571], 490: [572], 491: [573], 492: [574], 493: [575], 494: [576], 495: [577], 496: [579], 497: [580], 498: [581], 499: [582], 500: [583], 501: [584], 502: [585], 503: [586], 504: [587], 505: [589], 506: [590], 507: [591], 508: [592], 509: [593], 510: [594], 511: [595], 512: [596], 513: [597], 514: [599], 515: [600], 516: [601], 517: [602], 518: [603], 519: [604], 520: [606], 521: [607], 522: [608], 523: [609], 524: [610], 525: [611], 526: [615], 527: [616], 528: [617], 529: [618], 530: [619], 531: [620], 532: [622], 533: [623], 534: [624], 535: [625], 536: [626], 537: [627], 538: [628], 539: [629], 540: [630], 541: [631], 542: [632], 543: [633], 544: [634], 545: [635], 546: [636], 547: [638], 548: [639], 549: [640], 550: [641], 551: [642], 552: [643], 553: [644], 554: [645], 555: [646], 556: [647], 557: [648], 558: [649], 559: [650], 560: [651], 561: [652], 562: [654], 563: [655], 564: [656], 565: [657], 566: [658], 567: [659], 568: [660], 569: [661], 570: [662], 571: [663], 572: [664], 573: [665], 574: [666], 575: [667], 576: [668], 577: [670], 578: [671], 579: [672], 580: [673], 581: [674], 582: [675], 583: [676], 584: [677], 585: [678], 586: [679], 587: [680], 588: [682], 589: [683], 590: [685], 591: [686], 592: [692], 593: [693], 594: [694], 595: [695], 596: [696], 597: [697], 598: [698], 599: [699], 600: [700], 601: [702], 602: [703], 603: [704], 604: [705], 605: [706], 606: [707], 607: [708], 608: [709], 609: [710], 610: [712], 
611: [713], 612: [714], 613: [715], 614: [716], 615: [717], 616: [718], 617: [719], 618: [720], 619: [722], 620: [723], 621: [724], 622: [725], 623: [726], 624: [727], 625: [729], 626: [730], 627: [731], 628: [732], 629: [733], 630: [734], 631: [738], 632: [739], 633: [740], 634: [741], 635: [742], 636: [743], 637: [744], 638: [745], 639: [746], 640: [747], 641: [748], 642: [749], 643: [750], 644: [751], 645: [752], 646: [753], 647: [754], 648: [755], 649: [756], 650: [757], 651: [758], 652: [759], 653: [760], 654: [761], 655: [762], 656: [763], 657: [764], 658: [765], 659: [766], 660: [767], 661: [768], 662: [769], 663: [770], 664: [772], 665: [773], 666: [774], 667: [775], 668: [776], 669: [777], 670: [779], 671: [780], 672: [781], 673: [782], 674: [783], 675: [784], 676: [788], 677: [789], 678: [790], 679: [791], 680: [792], 681: [793], 682: [794], 683: [795], 684: [796], 685: [797], 686: [798], 687: [799], 688: [800], 689: [801], 690: [802], 691: [803], 692: [804], 693: [805], 694: [806], 695: [807], 696: [808], 697: [809], 698: [810], 699: [811], 700: [812], 701: [813], 702: [814], 703: [815], 704: [816], 705: [817], 706: [818], 707: [819], 708: [820], 709: [822], 710: [823], 711: [824], 712: [825], 713: [826], 714: [827], 715: [829], 716: [830], 717: [831], 718: [832], 719: [833], 720: [834], 721: [838], 722: [839], 723: [841], 724: [842], 725: [844], 726: [845], 727: [851], 728: [852], 729: [853], 730: [854], 731: [855], 732: [856], 733: [857], 734: [858], 735: [859], 736: [861], 737: [862], 738: [863], 739: [864], 740: [865], 741: [866], 742: [867], 743: [868], 744: [869], 745: [871], 746: [872], 747: [873], 748: [874], 749: [875], 750: [876], 751: [877], 752: [878], 753: [879], 754: [881], 755: [882], 756: [883], 757: [884], 758: [885], 759: [886], 760: [888], 761: [889], 762: [890], 763: [891], 764: [892], 765: [893], 766: [894], 767: [895], 768: [897], 769: [898], 770: [900], 771: [901], 772: [907], 773: [908], 774: [909], 775: [910], 776: [911], 777: [912], 778: [913], 779: [914], 780: [915], 781: [917], 782: [918], 783: [919], 784: [920], 785: [921], 786: [922], 787: [923], 788: [924], 789: [925], 790: [927], 791: [928], 792: [929], 793: [930], 794: [931], 795: [932], 796: [933], 797: [934], 798: [935], 799: [937], 800: [938], 801: [939], 802: [940], 803: [941], 804: [942], 805: [944], 806: [945], 807: [946], 808: [947], 809: [948], 810: [949], 811: [950], 812: [951], 813: [953], 814: [954], 815: [956], 816: [957], 817: [963], 818: [964], 819: [965], 820: [966], 821: [967], 822: [968], 823: [969], 824: [970], 825: [971], 826: [973], 827: [974], 828: [975], 829: [976], 830: [977], 831: [978], 832: [979], 833: [980], 834: [981], 835: [983], 836: [984], 837: [985], 838: [986], 839: [987], 840: [988], 841: [989], 842: [990], 843: [991], 844: [993], 845: [994], 846: [995], 847: [996], 848: [997], 849: [998], 850: [1000], 851: [1001], 852: [1002], 853: [1003], 854: [1004], 855: [1005], 856: [1006], 857: [1007], 858: [1009], 859: [1010], 860: [1012], 861: [1013], 862: [1018], 863: [1019], 864: [1021], 865: [1022], 866: [1024], 867: [1025], 868: [1030], 869: [1031], 870: [1033], 871: [1034], 872: [1036], 873: [1037], 874: [1045], 875: [1046], 876: [1047], 877: [1048], 878: [1049], 879: [1050], 880: [1051], 881: [1052], 882: [1053], 883: [1054], 884: [1055], 885: [1056], 886: [1057], 887: [1058], 888: [1059], 889: [1060], 890: [1061], 891: [1062], 892: [1064], 893: [1065], 894: [1066], 895: [1067], 896: [1068], 897: [1069], 898: [1070], 899: [1071], 900: [1072], 901: [1073], 902: 
[1074], 903: [1075], 904: [1076], 905: [1077], 906: [1078], 907: [1079], 908: [1080], 909: [1081], 910: [1083], 911: [1084], 912: [1085], 913: [1086], 914: [1087], 915: [1088], 916: [1089], 917: [1090], 918: [1091], 919: [1092], 920: [1093], 921: [1094], 922: [1095], 923: [1096], 924: [1097], 925: [1098], 926: [1099], 927: [1100], 928: [1102], 929: [1103], 930: [1104], 931: [1105], 932: [1106], 933: [1107], 934: [1109], 935: [1110], 936: [1111], 937: [1112], 938: [1113], 939: [1114], 940: [1116], 941: [1117], 942: [1118], 943: [1119], 944: [1120], 945: [1121]} [model_handling.py at line 1158]  DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [3], 4: [4], 5: [6], 6: [7], 7: [13], 8: [14], 9: [15], 10: [17], 11: [18], 12: [19], 13: [21], 14: [22], 15: [23], 16: [25], 17: [26], 18: [27], 19: [28], 20: [29], 21: [30], 22: [32], 23: [33], 24: [34], 25: [35], 26: [36], 27: [37], 28: [38], 29: [39], 30: [40], 31: [41], 32: [42], 33: [43], 34: [44], 35: [45], 36: [46], 37: [48], 38: [49], 39: [50], 40: [51], 41: [52], 42: [53], 43: [54], 44: [55], 45: [56], 46: [57], 47: [58], 48: [59], 49: [60], 50: [61], 51: [62], 52: [64], 53: [65], 54: [66], 55: [67], 56: [68], 57: [69], 58: [70], 59: [71], 60: [72], 61: [73], 62: [74], 63: [75], 64: [76], 65: [77], 66: [78], 67: [80], 68: [81], 69: [82], 70: [83], 71: [84], 72: [85], 73: [86], 74: [87], 75: [88], 76: [90], 77: [91], 78: [92], 79: [93], 80: [94], 81: [95], 82: [96], 83: [97], 84: [98], 85: [100], 86: [101], 87: [102], 88: [103], 89: [104], 90: [105], 91: [106], 92: [107], 93: [108], 94: [109], 95: [110], 96: [111], 97: [112], 98: [113], 99: [114], 100: [115], 101: [116], 102: [117], 103: [118], 104: [119], 105: [120], 106: [123], 107: [124], 108: [125], 109: [126], 110: [127], 111: [128], 112: [129], 113: [130], 114: [131], 115: [132], 116: [133], 117: [134], 118: [135], 119: [136], 120: [137], 121: [139], 122: [140], 123: [142], 124: [143], 125: [144], 126: [145], 127: [146], 128: [147], 129: [148], 130: [149], 131: [150], 132: [151], 133: [152], 134: [153], 135: [154], 136: [155], 137: [156], 138: [158], 139: [159], 140: [160], 141: [161], 142: [162], 143: [163], 144: [164], 145: [165], 146: [166], 147: [167], 148: [168], 149: [169], 150: [170], 151: [171], 152: [172], 153: [174], 154: [175], 155: [176], 156: [177], 157: [178], 158: [179], 159: [180], 160: [181], 161: [182], 162: [183], 163: [184], 164: [185], 165: [186], 166: [187], 167: [188], 168: [189], 169: [190], 170: [191], 171: [192], 172: [193], 173: [194], 174: [195], 175: [196], 176: [197], 177: [198], 178: [199], 179: [200], 180: [201], 181: [202], 182: [203], 183: [204], 184: [205], 185: [206], 186: [207], 187: [208], 188: [209], 189: [210], 190: [211], 191: [212], 192: [213], 193: [214], 194: [215], 195: [216], 196: [217], 197: [219], 198: [220], 199: [221], 200: [222], 201: [223], 202: [224], 203: [226], 204: [227], 205: [228], 206: [229], 207: [230], 208: [231], 209: [233], 210: [234], 211: [246], 212: [247], 213: [248], 214: [249], 215: [250], 216: [251], 217: [252], 218: [253], 219: [254], 220: [255], 221: [256], 222: [257], 223: [258], 224: [259], 225: [260], 226: [262], 227: [263], 228: [265], 229: [266], 230: [267], 231: [268], 232: [269], 233: [270], 234: [271], 235: [272], 236: [273], 237: [274], 238: [275], 239: [276], 240: [277], 241: [278], 242: [279], 243: [281], 244: [282], 245: [283], 246: [284], 247: [285], 248: [286], 249: [287], 250: [288], 251: [289], 252: [290], 253: [291], 254: [292], 255: [293], 256: [294], 257: [295], 258: [297], 259: [298], 260: [299], 
261: [300], 262: [301], 263: [302], 264: [303], 265: [304], 266: [305], 267: [306], 268: [307], 269: [308], 270: [309], 271: [310], 272: [311], 273: [312], 274: [313], 275: [314], 276: [315], 277: [316], 278: [317], 279: [318], 280: [319], 281: [320], 282: [321], 283: [322], 284: [323], 285: [324], 286: [325], 287: [326], 288: [327], 289: [328], 290: [329], 291: [330], 292: [331], 293: [332], 294: [333], 295: [334], 296: [335], 297: [336], 298: [337], 299: [338], 300: [339], 301: [340], 302: [342], 303: [343], 304: [344], 305: [345], 306: [346], 307: [347], 308: [349], 309: [350], 310: [351], 311: [352], 312: [353], 313: [354], 314: [356], 315: [357], 316: [369], 317: [370], 318: [371], 319: [372], 320: [373], 321: [374], 322: [376], 323: [377], 324: [378], 325: [379], 326: [380], 327: [381], 328: [382], 329: [383], 330: [384], 331: [385], 332: [386], 333: [387], 334: [388], 335: [389], 336: [390], 337: [392], 338: [393], 339: [394], 340: [395], 341: [396], 342: [397], 343: [398], 344: [399], 345: [400], 346: [401], 347: [402], 348: [403], 349: [404], 350: [405], 351: [406], 352: [408], 353: [409], 354: [410], 355: [411], 356: [412], 357: [413], 358: [414], 359: [415], 360: [416], 361: [417], 362: [418], 363: [419], 364: [420], 365: [421], 366: [422], 367: [424], 368: [425], 369: [426], 370: [427], 371: [428], 372: [429], 373: [430], 374: [431], 375: [432], 376: [433], 377: [434], 378: [436], 379: [437], 380: [439], 381: [440], 382: [446], 383: [447], 384: [448], 385: [449], 386: [450], 387: [451], 388: [452], 389: [453], 390: [454], 391: [456], 392: [457], 393: [458], 394: [459], 395: [460], 396: [461], 397: [462], 398: [463], 399: [464], 400: [466], 401: [467], 402: [468], 403: [469], 404: [470], 405: [471], 406: [472], 407: [473], 408: [474], 409: [476], 410: [477], 411: [478], 412: [479], 413: [480], 414: [481], 415: [483], 416: [484], 417: [485], 418: [486], 419: [487], 420: [488], 421: [492], 422: [493], 423: [494], 424: [495], 425: [496], 426: [497], 427: [499], 428: [500], 429: [501], 430: [502], 431: [503], 432: [504], 433: [505], 434: [506], 435: [507], 436: [508], 437: [509], 438: [510], 439: [511], 440: [512], 441: [513], 442: [515], 443: [516], 444: [517], 445: [518], 446: [519], 447: [520], 448: [521], 449: [522], 450: [523], 451: [524], 452: [525], 453: [526], 454: [527], 455: [528], 456: [529], 457: [531], 458: [532], 459: [533], 460: [534], 461: [535], 462: [536], 463: [537], 464: [538], 465: [539], 466: [540], 467: [541], 468: [542], 469: [543], 470: [544], 471: [545], 472: [547], 473: [548], 474: [549], 475: [550], 476: [551], 477: [552], 478: [553], 479: [554], 480: [555], 481: [556], 482: [557], 483: [559], 484: [560], 485: [562], 486: [563], 487: [569], 488: [570], 489: [571], 490: [572], 491: [573], 492: [574], 493: [575], 494: [576], 495: [577], 496: [579], 497: [580], 498: [581], 499: [582], 500: [583], 501: [584], 502: [585], 503: [586], 504: [587], 505: [589], 506: [590], 507: [591], 508: [592], 509: [593], 510: [594], 511: [595], 512: [596], 513: [597], 514: [599], 515: [600], 516: [601], 517: [602], 518: [603], 519: [604], 520: [606], 521: [607], 522: [608], 523: [609], 524: [610], 525: [611], 526: [615], 527: [616], 528: [617], 529: [618], 530: [619], 531: [620], 532: [622], 533: [623], 534: [624], 535: [625], 536: [626], 537: [627], 538: [628], 539: [629], 540: [630], 541: [631], 542: [632], 543: [633], 544: [634], 545: [635], 546: [636], 547: [638], 548: [639], 549: [640], 550: [641], 551: [642], 552: [643], 553: [644], 554: [645], 555: [646], 556: [647], 
557: [648], 558: [649], 559: [650], 560: [651], 561: [652], 562: [654], 563: [655], 564: [656], 565: [657], 566: [658], 567: [659], 568: [660], 569: [661], 570: [662], 571: [663], 572: [664], 573: [665], 574: [666], 575: [667], 576: [668], 577: [670], 578: [671], 579: [672], 580: [673], 581: [674], 582: [675], 583: [676], 584: [677], 585: [678], 586: [679], 587: [680], 588: [682], 589: [683], 590: [685], 591: [686], 592: [692], 593: [693], 594: [694], 595: [695], 596: [696], 597: [697], 598: [698], 599: [699], 600: [700], 601: [702], 602: [703], 603: [704], 604: [705], 605: [706], 606: [707], 607: [708], 608: [709], 609: [710], 610: [712], 611: [713], 612: [714], 613: [715], 614: [716], 615: [717], 616: [718], 617: [719], 618: [720], 619: [722], 620: [723], 621: [724], 622: [725], 623: [726], 624: [727], 625: [729], 626: [730], 627: [731], 628: [732], 629: [733], 630: [734], 631: [738], 632: [739], 633: [740], 634: [741], 635: [742], 636: [743], 637: [744], 638: [745], 639: [746], 640: [747], 641: [748], 642: [749], 643: [750], 644: [751], 645: [752], 646: [753], 647: [754], 648: [755], 649: [756], 650: [757], 651: [758], 652: [759], 653: [760], 654: [761], 655: [762], 656: [763], 657: [764], 658: [765], 659: [766], 660: [767], 661: [768], 662: [769], 663: [770], 664: [772], 665: [773], 666: [774], 667: [775], 668: [776], 669: [777], 670: [779], 671: [780], 672: [781], 673: [782], 674: [783], 675: [784], 676: [788], 677: [789], 678: [790], 679: [791], 680: [792], 681: [793], 682: [794], 683: [795], 684: [796], 685: [797], 686: [798], 687: [799], 688: [800], 689: [801], 690: [802], 691: [803], 692: [804], 693: [805], 694: [806], 695: [807], 696: [808], 697: [809], 698: [810], 699: [811], 700: [812], 701: [813], 702: [814], 703: [815], 704: [816], 705: [817], 706: [818], 707: [819], 708: [820], 709: [822], 710: [823], 711: [824], 712: [825], 713: [826], 714: [827], 715: [829], 716: [830], 717: [831], 718: [832], 719: [833], 720: [834], 721: [838], 722: [839], 723: [841], 724: [842], 725: [844], 726: [845], 727: [851], 728: [852], 729: [853], 730: [854], 731: [855], 732: [856], 733: [857], 734: [858], 735: [859], 736: [861], 737: [862], 738: [863], 739: [864], 740: [865], 741: [866], 742: [867], 743: [868], 744: [869], 745: [871], 746: [872], 747: [873], 748: [874], 749: [875], 750: [876], 751: [877], 752: [878], 753: [879], 754: [881], 755: [882], 756: [883], 757: [884], 758: [885], 759: [886], 760: [888], 761: [889], 762: [890], 763: [891], 764: [892], 765: [893], 766: [894], 767: [895], 768: [897], 769: [898], 770: [900], 771: [901], 772: [907], 773: [908], 774: [909], 775: [910], 776: [911], 777: [912], 778: [913], 779: [914], 780: [915], 781: [917], 782: [918], 783: [919], 784: [920], 785: [921], 786: [922], 787: [923], 788: [924], 789: [925], 790: [927], 791: [928], 792: [929], 793: [930], 794: [931], 795: [932], 796: [933], 797: [934], 798: [935], 799: [937], 800: [938], 801: [939], 802: [940], 803: [941], 804: [942], 805: [944], 806: [945], 807: [946], 808: [947], 809: [948], 810: [949], 811: [950], 812: [951], 813: [953], 814: [954], 815: [956], 816: [957], 817: [963], 818: [964], 819: [965], 820: [966], 821: [967], 822: [968], 823: [969], 824: [970], 825: [971], 826: [973], 827: [974], 828: [975], 829: [976], 830: [977], 831: [978], 832: [979], 833: [980], 834: [981], 835: [983], 836: [984], 837: [985], 838: [986], 839: [987], 840: [988], 841: [989], 842: [990], 843: [991], 844: [993], 845: [994], 846: [995], 847: [996], 848: [997], 849: [998], 850: [1000], 851: [1001], 852: [1002], 
853: [1003], 854: [1004], 855: [1005], 856: [1006], 857: [1007], 858: [1009], 859: [1010], 860: [1012], 861: [1013], 862: [1018], 863: [1019], 864: [1021], 865: [1022], 866: [1024], 867: [1025], 868: [1030], 869: [1031], 870: [1033], 871: [1034], 872: [1036], 873: [1037], 874: [1045], 875: [1046], 876: [1047], 877: [1048], 878: [1049], 879: [1050], 880: [1051], 881: [1052], 882: [1053], 883: [1054], 884: [1055], 885: [1056], 886: [1057], 887: [1058], 888: [1059], 889: [1060], 890: [1061], 891: [1062], 892: [1064], 893: [1065], 894: [1066], 895: [1067], 896: [1068], 897: [1069], 898: [1070], 899: [1071], 900: [1072], 901: [1073], 902: [1074], 903: [1075], 904: [1076], 905: [1077], 906: [1078], 907: [1079], 908: [1080], 909: [1081], 910: [1083], 911: [1084], 912: [1085], 913: [1086], 914: [1087], 915: [1088], 916: [1089], 917: [1090], 918: [1091], 919: [1092], 920: [1093], 921: [1094], 922: [1095], 923: [1096], 924: [1097], 925: [1098], 926: [1099], 927: [1100], 928: [1102], 929: [1103], 930: [1104], 931: [1105], 932: [1106], 933: [1107], 934: [1109], 935: [1110], 936: [1111], 937: [1112], 938: [1113], 939: [1114], 940: [1116], 941: [1117], 942: [1118], 943: [1119], 944: [1120], 945: [1121]} [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {1: 1, 2: 2, 6: 3, 7: 4, 11: 5, 12: 6, 28: 7, 29: 8, 30: 9, 34: 10, 35: 11, 36: 12, 40: 13, 41: 14, 42: 15, 46: 16, 47: 17, 48: 18, 49: 19, 50: 20, 51: 21, 55: 22, 56: 23, 57: 24, 58: 25, 59: 26, 60: 27, 61: 28, 62: 29, 63: 30, 64: 31, 65: 32, 66: 33, 67: 34, 68: 35, 69: 36, 73: 37, 74: 38, 75: 39, 76: 40, 77: 41, 78: 42, 79: 43, 80: 44, 81: 45, 82: 46, 83: 47, 84: 48, 85: 49, 86: 50, 87: 51, 91: 52, 92: 53, 93: 54, 94: 55, 95: 56, 96: 57, 97: 58, 98: 59, 99: 60, 100: 61, 101: 62, 102: 63, 103: 64, 104: 65, 105: 66, 109: 67, 110: 68, 111: 69, 112: 70, 113: 71, 114: 72, 115: 73, 116: 74, 117: 75, 121: 76, 122: 77, 123: 78, 124: 79, 125: 80, 126: 81, 127: 82, 128: 83, 129: 84, 133: 85, 134: 86, 135: 87, 136: 88, 137: 89, 138: 90, 139: 91, 140: 92, 141: 93, 142: 94, 143: 95, 144: 96, 145: 97, 146: 98, 147: 99, 148: 100, 149: 101, 150: 102, 151: 103, 152: 104, 153: 105, 160: 106, 161: 107, 162: 108, 163: 109, 164: 110, 165: 111, 166: 112, 167: 113, 168: 114, 169: 115, 170: 116, 171: 117, 172: 118, 173: 119, 174: 120, 178: 121, 179: 122, 183: 123, 184: 124, 185: 125, 186: 126, 187: 127, 188: 128, 189: 129, 190: 130, 191: 131, 192: 132, 193: 133, 194: 134, 195: 135, 196: 136, 197: 137, 201: 138, 202: 139, 203: 140, 204: 141, 205: 142, 206: 143, 207: 144, 208: 145, 209: 146, 210: 147, 211: 148, 212: 149, 213: 150, 214: 151, 215: 152, 219: 153, 220: 154, 221: 155, 222: 156, 223: 157, 224: 158, 225: 159, 226: 160, 227: 161, 228: 162, 229: 163, 230: 164, 231: 165, 232: 166, 233: 167, 234: 168, 235: 169, 236: 170, 237: 171, 238: 172, 239: 173, 240: 174, 241: 175, 242: 176, 243: 177, 244: 178, 245: 179, 246: 180, 247: 181, 248: 182, 249: 183, 250: 184, 251: 185, 252: 186, 253: 187, 254: 188, 255: 189, 256: 190, 257: 191, 258: 192, 259: 193, 260: 194, 261: 195, 262: 196, 266: 197, 267: 198, 268: 199, 269: 200, 270: 201, 271: 202, 275: 203, 276: 204, 277: 205, 278: 206, 279: 207, 280: 208, 284: 209, 285: 210, 319: 211, 320: 212, 321: 213, 322: 214, 323: 215, 324: 216, 325: 217, 326: 218, 327: 219, 328: 220, 329: 221, 330: 222, 331: 223, 332: 224, 333: 225, 337: 226, 338: 227, 342: 228, 343: 229, 344: 230, 345: 231, 346: 232, 347: 233, 348: 234, 349: 235, 350: 236, 351: 237, 352: 238, 353: 239, 354: 240, 355: 241, 356: 242, 360: 243, 361: 244, 362: 245, 363: 
246, 364: 247, 365: 248, 366: 249, 367: 250, 368: 251, 369: 252, 370: 253, 371: 254, 372: 255, 373: 256, 374: 257, 378: 258, 379: 259, 380: 260, 381: 261, 382: 262, 383: 263, 384: 264, 385: 265, 386: 266, 387: 267, 388: 268, 389: 269, 390: 270, 391: 271, 392: 272, 393: 273, 394: 274, 395: 275, 396: 276, 397: 277, 398: 278, 399: 279, 400: 280, 401: 281, 402: 282, 403: 283, 404: 284, 405: 285, 406: 286, 407: 287, 408: 288, 409: 289, 410: 290, 411: 291, 412: 292, 413: 293, 414: 294, 415: 295, 416: 296, 417: 297, 418: 298, 419: 299, 420: 300, 421: 301, 425: 302, 426: 303, 427: 304, 428: 305, 429: 306, 430: 307, 434: 308, 435: 309, 436: 310, 437: 311, 438: 312, 439: 313, 443: 314, 444: 315, 478: 316, 479: 317, 480: 318, 481: 319, 482: 320, 483: 321, 487: 322, 488: 323, 489: 324, 490: 325, 491: 326, 492: 327, 493: 328, 494: 329, 495: 330, 496: 331, 497: 332, 498: 333, 499: 334, 500: 335, 501: 336, 505: 337, 506: 338, 507: 339, 508: 340, 509: 341, 510: 342, 511: 343, 512: 344, 513: 345, 514: 346, 515: 347, 516: 348, 517: 349, 518: 350, 519: 351, 523: 352, 524: 353, 525: 354, 526: 355, 527: 356, 528: 357, 529: 358, 530: 359, 531: 360, 532: 361, 533: 362, 534: 363, 535: 364, 536: 365, 537: 366, 541: 367, 542: 368, 543: 369, 544: 370, 545: 371, 546: 372, 547: 373, 548: 374, 549: 375, 550: 376, 551: 377, 555: 378, 556: 379, 560: 380, 561: 381, 577: 382, 578: 383, 579: 384, 580: 385, 581: 386, 582: 387, 583: 388, 584: 389, 585: 390, 589: 391, 590: 392, 591: 393, 592: 394, 593: 395, 594: 396, 595: 397, 596: 398, 597: 399, 601: 400, 602: 401, 603: 402, 604: 403, 605: 404, 606: 405, 607: 406, 608: 407, 609: 408, 613: 409, 614: 410, 615: 411, 616: 412, 617: 413, 618: 414, 622: 415, 623: 416, 624: 417, 625: 418, 626: 419, 627: 420, 637: 421, 638: 422, 639: 423, 640: 424, 641: 425, 642: 426, 646: 427, 647: 428, 648: 429, 649: 430, 650: 431, 651: 432, 652: 433, 653: 434, 654: 435, 655: 436, 656: 437, 657: 438, 658: 439, 659: 440, 660: 441, 664: 442, 665: 443, 666: 444, 667: 445, 668: 446, 669: 447, 670: 448, 671: 449, 672: 450, 673: 451, 674: 452, 675: 453, 676: 454, 677: 455, 678: 456, 682: 457, 683: 458, 684: 459, 685: 460, 686: 461, 687: 462, 688: 463, 689: 464, 690: 465, 691: 466, 692: 467, 693: 468, 694: 469, 695: 470, 696: 471, 700: 472, 701: 473, 702: 474, 703: 475, 704: 476, 705: 477, 706: 478, 707: 479, 708: 480, 709: 481, 710: 482, 714: 483, 715: 484, 719: 485, 720: 486, 736: 487, 737: 488, 738: 489, 739: 490, 740: 491, 741: 492, 742: 493, 743: 494, 744: 495, 748: 496, 749: 497, 750: 498, 751: 499, 752: 500, 753: 501, 754: 502, 755: 503, 756: 504, 760: 505, 761: 506, 762: 507, 763: 508, 764: 509, 765: 510, 766: 511, 767: 512, 768: 513, 772: 514, 773: 515, 774: 516, 775: 517, 776: 518, 777: 519, 781: 520, 782: 521, 783: 522, 784: 523, 785: 524, 786: 525, 796: 526, 797: 527, 798: 528, 799: 529, 800: 530, 801: 531, 805: 532, 806: 533, 807: 534, 808: 535, 809: 536, 810: 537, 811: 538, 812: 539, 813: 540, 814: 541, 815: 542, 816: 543, 817: 544, 818: 545, 819: 546, 823: 547, 824: 548, 825: 549, 826: 550, 827: 551, 828: 552, 829: 553, 830: 554, 831: 555, 832: 556, 833: 557, 834: 558, 835: 559, 836: 560, 837: 561, 841: 562, 842: 563, 843: 564, 844: 565, 845: 566, 846: 567, 847: 568, 848: 569, 849: 570, 850: 571, 851: 572, 852: 573, 853: 574, 854: 575, 855: 576, 859: 577, 860: 578, 861: 579, 862: 580, 863: 581, 864: 582, 865: 583, 866: 584, 867: 585, 868: 586, 869: 587, 873: 588, 874: 589, 878: 590, 879: 591, 895: 592, 896: 593, 897: 594, 898: 595, 899: 596, 900: 597, 901: 598, 902: 599, 903: 600, 907: 601, 
908: 602, 909: 603, 910: 604, 911: 605, 912: 606, 913: 607, 914: 608, 915: 609, 919: 610, 920: 611, 921: 612, 922: 613, 923: 614, 924: 615, 925: 616, 926: 617, 927: 618, 931: 619, 932: 620, 933: 621, 934: 622, 935: 623, 936: 624, 940: 625, 941: 626, 942: 627, 943: 628, 944: 629, 945: 630, 955: 631, 956: 632, 957: 633, 958: 634, 959: 635, 960: 636, 961: 637, 962: 638, 963: 639, 964: 640, 965: 641, 966: 642, 967: 643, 968: 644, 969: 645, 970: 646, 971: 647, 972: 648, 973: 649, 974: 650, 975: 651, 976: 652, 977: 653, 978: 654, 979: 655, 980: 656, 981: 657, 982: 658, 983: 659, 984: 660, 985: 661, 986: 662, 987: 663, 991: 664, 992: 665, 993: 666, 994: 667, 995: 668, 996: 669, 1000: 670, 1001: 671, 1002: 672, 1003: 673, 1004: 674, 1005: 675, 1015: 676, 1016: 677, 1017: 678, 1018: 679, 1019: 680, 1020: 681, 1021: 682, 1022: 683, 1023: 684, 1024: 685, 1025: 686, 1026: 687, 1027: 688, 1028: 689, 1029: 690, 1030: 691, 1031: 692, 1032: 693, 1033: 694, 1034: 695, 1035: 696, 1036: 697, 1037: 698, 1038: 699, 1039: 700, 1040: 701, 1041: 702, 1042: 703, 1043: 704, 1044: 705, 1045: 706, 1046: 707, 1047: 708, 1051: 709, 1052: 710, 1053: 711, 1054: 712, 1055: 713, 1056: 714, 1060: 715, 1061: 716, 1062: 717, 1063: 718, 1064: 719, 1065: 720, 1075: 721, 1076: 722, 1080: 723, 1081: 724, 1085: 725, 1086: 726, 1102: 727, 1103: 728, 1104: 729, 1105: 730, 1106: 731, 1107: 732, 1108: 733, 1109: 734, 1110: 735, 1114: 736, 1115: 737, 1116: 738, 1117: 739, 1118: 740, 1119: 741, 1120: 742, 1121: 743, 1122: 744, 1126: 745, 1127: 746, 1128: 747, 1129: 748, 1130: 749, 1131: 750, 1132: 751, 1133: 752, 1134: 753, 1138: 754, 1139: 755, 1140: 756, 1141: 757, 1142: 758, 1143: 759, 1147: 760, 1148: 761, 1149: 762, 1150: 763, 1151: 764, 1152: 765, 1153: 766, 1154: 767, 1158: 768, 1159: 769, 1163: 770, 1164: 771, 1180: 772, 1181: 773, 1182: 774, 1183: 775, 1184: 776, 1185: 777, 1186: 778, 1187: 779, 1188: 780, 1192: 781, 1193: 782, 1194: 783, 1195: 784, 1196: 785, 1197: 786, 1198: 787, 1199: 788, 1200: 789, 1204: 790, 1205: 791, 1206: 792, 1207: 793, 1208: 794, 1209: 795, 1210: 796, 1211: 797, 1212: 798, 1216: 799, 1217: 800, 1218: 801, 1219: 802, 1220: 803, 1221: 804, 1225: 805, 1226: 806, 1227: 807, 1228: 808, 1229: 809, 1230: 810, 1231: 811, 1232: 812, 1236: 813, 1237: 814, 1241: 815, 1242: 816, 1258: 817, 1259: 818, 1260: 819, 1261: 820, 1262: 821, 1263: 822, 1264: 823, 1265: 824, 1266: 825, 1270: 826, 1271: 827, 1272: 828, 1273: 829, 1274: 830, 1275: 831, 1276: 832, 1277: 833, 1278: 834, 1282: 835, 1283: 836, 1284: 837, 1285: 838, 1286: 839, 1287: 840, 1288: 841, 1289: 842, 1290: 843, 1294: 844, 1295: 845, 1296: 846, 1297: 847, 1298: 848, 1299: 849, 1303: 850, 1304: 851, 1305: 852, 1306: 853, 1307: 854, 1308: 855, 1309: 856, 1310: 857, 1314: 858, 1315: 859, 1319: 860, 1320: 861, 1333: 862, 1334: 863, 1338: 864, 1339: 865, 1343: 866, 1344: 867, 1357: 868, 1358: 869, 1362: 870, 1363: 871, 1367: 872, 1368: 873, 1396: 874, 1397: 875, 1398: 876, 1399: 877, 1400: 878, 1401: 879, 1402: 880, 1403: 881, 1404: 882, 1405: 883, 1406: 884, 1407: 885, 1408: 886, 1409: 887, 1410: 888, 1411: 889, 1412: 890, 1413: 891, 1417: 892, 1418: 893, 1419: 894, 1420: 895, 1421: 896, 1422: 897, 1423: 898, 1424: 899, 1425: 900, 1426: 901, 1427: 902, 1428: 903, 1429: 904, 1430: 905, 1431: 906, 1432: 907, 1433: 908, 1434: 909, 1438: 910, 1439: 911, 1440: 912, 1441: 913, 1442: 914, 1443: 915, 1444: 916, 1445: 917, 1446: 918, 1447: 919, 1448: 920, 1449: 921, 1450: 922, 1451: 923, 1452: 924, 1453: 925, 1454: 926, 1455: 927, 1459: 928, 1460: 929, 1461: 930, 
1462: 931, 1463: 932, 1464: 933, 1468: 934, 1469: 935, 1470: 936, 1471: 937, 1472: 938, 1473: 939, 1477: 940, 1478: 941, 1479: 942, 1480: 943, 1481: 944, 1482: 945} [model_handling.py at line 1700]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 6, 1, 6, 6) [model_handling.py at line 1813]  +DEBUG: diag_to_config =  {1: 1, 2: 2, 6: 3, 7: 4, 11: 5, 12: 6, 28: 7, 29: 8, 30: 9, 34: 10, 35: 11, 36: 12, 40: 13, 41: 14, 42: 15, 46: 16, 47: 17, 48: 18, 49: 19, 50: 20, 51: 21, 55: 22, 56: 23, 57: 24, 58: 25, 59: 26, 60: 27, 61: 28, 62: 29, 63: 30, 64: 31, 65: 32, 66: 33, 67: 34, 68: 35, 69: 36, 73: 37, 74: 38, 75: 39, 76: 40, 77: 41, 78: 42, 79: 43, 80: 44, 81: 45, 82: 46, 83: 47, 84: 48, 85: 49, 86: 50, 87: 51, 91: 52, 92: 53, 93: 54, 94: 55, 95: 56, 96: 57, 97: 58, 98: 59, 99: 60, 100: 61, 101: 62, 102: 63, 103: 64, 104: 65, 105: 66, 109: 67, 110: 68, 111: 69, 112: 70, 113: 71, 114: 72, 115: 73, 116: 74, 117: 75, 121: 76, 122: 77, 123: 78, 124: 79, 125: 80, 126: 81, 127: 82, 128: 83, 129: 84, 133: 85, 134: 86, 135: 87, 136: 88, 137: 89, 138: 90, 139: 91, 140: 92, 141: 93, 142: 94, 143: 95, 144: 96, 145: 97, 146: 98, 147: 99, 148: 100, 149: 101, 150: 102, 151: 103, 152: 104, 153: 105, 160: 106, 161: 107, 162: 108, 163: 109, 164: 110, 165: 111, 166: 112, 167: 113, 168: 114, 169: 115, 170: 116, 171: 117, 172: 118, 173: 119, 174: 120, 178: 121, 179: 122, 183: 123, 184: 124, 185: 125, 186: 126, 187: 127, 188: 128, 189: 129, 190: 130, 191: 131, 192: 132, 193: 133, 194: 134, 195: 135, 196: 136, 197: 137, 201: 138, 202: 139, 203: 140, 204: 141, 205: 142, 206: 143, 207: 144, 208: 145, 209: 146, 210: 147, 211: 148, 212: 149, 213: 150, 214: 151, 215: 152, 219: 153, 220: 154, 221: 155, 222: 156, 223: 157, 224: 158, 225: 159, 226: 160, 227: 161, 228: 162, 229: 163, 230: 164, 231: 165, 232: 166, 233: 167, 234: 168, 235: 169, 236: 170, 237: 171, 238: 172, 239: 173, 240: 174, 241: 175, 242: 176, 243: 177, 244: 178, 245: 179, 246: 180, 247: 181, 248: 182, 249: 183, 250: 184, 251: 185, 252: 186, 253: 187, 254: 188, 255: 189, 256: 190, 257: 191, 258: 192, 259: 193, 260: 194, 261: 195, 262: 196, 266: 197, 267: 198, 268: 199, 269: 200, 270: 201, 271: 202, 275: 203, 276: 204, 277: 205, 278: 206, 279: 207, 280: 208, 284: 209, 285: 210, 319: 211, 320: 212, 321: 213, 322: 214, 323: 215, 324: 216, 325: 217, 326: 218, 327: 219, 328: 220, 329: 221, 330: 222, 331: 223, 332: 224, 333: 225, 337: 226, 338: 227, 342: 228, 343: 229, 344: 230, 345: 231, 346: 232, 347: 233, 348: 234, 349: 235, 350: 236, 351: 237, 352: 238, 353: 239, 354: 240, 355: 241, 356: 242, 360: 243, 361: 244, 362: 245, 363: 246, 364: 247, 365: 248, 366: 249, 367: 250, 368: 251, 369: 252, 370: 253, 371: 254, 372: 255, 373: 256, 374: 257, 378: 258, 379: 259, 380: 260, 381: 261, 
382: 262, 383: 263, 384: 264, 385: 265, 386: 266, 387: 267, 388: 268, 389: 269, 390: 270, 391: 271, 392: 272, 393: 273, 394: 274, 395: 275, 396: 276, 397: 277, 398: 278, 399: 279, 400: 280, 401: 281, 402: 282, 403: 283, 404: 284, 405: 285, 406: 286, 407: 287, 408: 288, 409: 289, 410: 290, 411: 291, 412: 292, 413: 293, 414: 294, 415: 295, 416: 296, 417: 297, 418: 298, 419: 299, 420: 300, 421: 301, 425: 302, 426: 303, 427: 304, 428: 305, 429: 306, 430: 307, 434: 308, 435: 309, 436: 310, 437: 311, 438: 312, 439: 313, 443: 314, 444: 315, 478: 316, 479: 317, 480: 318, 481: 319, 482: 320, 483: 321, 487: 322, 488: 323, 489: 324, 490: 325, 491: 326, 492: 327, 493: 328, 494: 329, 495: 330, 496: 331, 497: 332, 498: 333, 499: 334, 500: 335, 501: 336, 505: 337, 506: 338, 507: 339, 508: 340, 509: 341, 510: 342, 511: 343, 512: 344, 513: 345, 514: 346, 515: 347, 516: 348, 517: 349, 518: 350, 519: 351, 523: 352, 524: 353, 525: 354, 526: 355, 527: 356, 528: 357, 529: 358, 530: 359, 531: 360, 532: 361, 533: 362, 534: 363, 535: 364, 536: 365, 537: 366, 541: 367, 542: 368, 543: 369, 544: 370, 545: 371, 546: 372, 547: 373, 548: 374, 549: 375, 550: 376, 551: 377, 555: 378, 556: 379, 560: 380, 561: 381, 577: 382, 578: 383, 579: 384, 580: 385, 581: 386, 582: 387, 583: 388, 584: 389, 585: 390, 589: 391, 590: 392, 591: 393, 592: 394, 593: 395, 594: 396, 595: 397, 596: 398, 597: 399, 601: 400, 602: 401, 603: 402, 604: 403, 605: 404, 606: 405, 607: 406, 608: 407, 609: 408, 613: 409, 614: 410, 615: 411, 616: 412, 617: 413, 618: 414, 622: 415, 623: 416, 624: 417, 625: 418, 626: 419, 627: 420, 637: 421, 638: 422, 639: 423, 640: 424, 641: 425, 642: 426, 646: 427, 647: 428, 648: 429, 649: 430, 650: 431, 651: 432, 652: 433, 653: 434, 654: 435, 655: 436, 656: 437, 657: 438, 658: 439, 659: 440, 660: 441, 664: 442, 665: 443, 666: 444, 667: 445, 668: 446, 669: 447, 670: 448, 671: 449, 672: 450, 673: 451, 674: 452, 675: 453, 676: 454, 677: 455, 678: 456, 682: 457, 683: 458, 684: 459, 685: 460, 686: 461, 687: 462, 688: 463, 689: 464, 690: 465, 691: 466, 692: 467, 693: 468, 694: 469, 695: 470, 696: 471, 700: 472, 701: 473, 702: 474, 703: 475, 704: 476, 705: 477, 706: 478, 707: 479, 708: 480, 709: 481, 710: 482, 714: 483, 715: 484, 719: 485, 720: 486, 736: 487, 737: 488, 738: 489, 739: 490, 740: 491, 741: 492, 742: 493, 743: 494, 744: 495, 748: 496, 749: 497, 750: 498, 751: 499, 752: 500, 753: 501, 754: 502, 755: 503, 756: 504, 760: 505, 761: 506, 762: 507, 763: 508, 764: 509, 765: 510, 766: 511, 767: 512, 768: 513, 772: 514, 773: 515, 774: 516, 775: 517, 776: 518, 777: 519, 781: 520, 782: 521, 783: 522, 784: 523, 785: 524, 786: 525, 796: 526, 797: 527, 798: 528, 799: 529, 800: 530, 801: 531, 805: 532, 806: 533, 807: 534, 808: 535, 809: 536, 810: 537, 811: 538, 812: 539, 813: 540, 814: 541, 815: 542, 816: 543, 817: 544, 818: 545, 819: 546, 823: 547, 824: 548, 825: 549, 826: 550, 827: 551, 828: 552, 829: 553, 830: 554, 831: 555, 832: 556, 833: 557, 834: 558, 835: 559, 836: 560, 837: 561, 841: 562, 842: 563, 843: 564, 844: 565, 845: 566, 846: 567, 847: 568, 848: 569, 849: 570, 850: 571, 851: 572, 852: 573, 853: 574, 854: 575, 855: 576, 859: 577, 860: 578, 861: 579, 862: 580, 863: 581, 864: 582, 865: 583, 866: 584, 867: 585, 868: 586, 869: 587, 873: 588, 874: 589, 878: 590, 879: 591, 895: 592, 896: 593, 897: 594, 898: 595, 899: 596, 900: 597, 901: 598, 902: 599, 903: 600, 907: 601, 908: 602, 909: 603, 910: 604, 911: 605, 912: 606, 913: 607, 914: 608, 915: 609, 919: 610, 920: 611, 921: 612, 922: 613, 923: 614, 924: 615, 925: 616, 926: 
617, 927: 618, 931: 619, 932: 620, 933: 621, 934: 622, 935: 623, 936: 624, 940: 625, 941: 626, 942: 627, 943: 628, 944: 629, 945: 630, 955: 631, 956: 632, 957: 633, 958: 634, 959: 635, 960: 636, 961: 637, 962: 638, 963: 639, 964: 640, 965: 641, 966: 642, 967: 643, 968: 644, 969: 645, 970: 646, 971: 647, 972: 648, 973: 649, 974: 650, 975: 651, 976: 652, 977: 653, 978: 654, 979: 655, 980: 656, 981: 657, 982: 658, 983: 659, 984: 660, 985: 661, 986: 662, 987: 663, 991: 664, 992: 665, 993: 666, 994: 667, 995: 668, 996: 669, 1000: 670, 1001: 671, 1002: 672, 1003: 673, 1004: 674, 1005: 675, 1015: 676, 1016: 677, 1017: 678, 1018: 679, 1019: 680, 1020: 681, 1021: 682, 1022: 683, 1023: 684, 1024: 685, 1025: 686, 1026: 687, 1027: 688, 1028: 689, 1029: 690, 1030: 691, 1031: 692, 1032: 693, 1033: 694, 1034: 695, 1035: 696, 1036: 697, 1037: 698, 1038: 699, 1039: 700, 1040: 701, 1041: 702, 1042: 703, 1043: 704, 1044: 705, 1045: 706, 1046: 707, 1047: 708, 1051: 709, 1052: 710, 1053: 711, 1054: 712, 1055: 713, 1056: 714, 1060: 715, 1061: 716, 1062: 717, 1063: 718, 1064: 719, 1065: 720, 1075: 721, 1076: 722, 1080: 723, 1081: 724, 1085: 725, 1086: 726, 1102: 727, 1103: 728, 1104: 729, 1105: 730, 1106: 731, 1107: 732, 1108: 733, 1109: 734, 1110: 735, 1114: 736, 1115: 737, 1116: 738, 1117: 739, 1118: 740, 1119: 741, 1120: 742, 1121: 743, 1122: 744, 1126: 745, 1127: 746, 1128: 747, 1129: 748, 1130: 749, 1131: 750, 1132: 751, 1133: 752, 1134: 753, 1138: 754, 1139: 755, 1140: 756, 1141: 757, 1142: 758, 1143: 759, 1147: 760, 1148: 761, 1149: 762, 1150: 763, 1151: 764, 1152: 765, 1153: 766, 1154: 767, 1158: 768, 1159: 769, 1163: 770, 1164: 771, 1180: 772, 1181: 773, 1182: 774, 1183: 775, 1184: 776, 1185: 777, 1186: 778, 1187: 779, 1188: 780, 1192: 781, 1193: 782, 1194: 783, 1195: 784, 1196: 785, 1197: 786, 1198: 787, 1199: 788, 1200: 789, 1204: 790, 1205: 791, 1206: 792, 1207: 793, 1208: 794, 1209: 795, 1210: 796, 1211: 797, 1212: 798, 1216: 799, 1217: 800, 1218: 801, 1219: 802, 1220: 803, 1221: 804, 1225: 805, 1226: 806, 1227: 807, 1228: 808, 1229: 809, 1230: 810, 1231: 811, 1232: 812, 1236: 813, 1237: 814, 1241: 815, 1242: 816, 1258: 817, 1259: 818, 1260: 819, 1261: 820, 1262: 821, 1263: 822, 1264: 823, 1265: 824, 1266: 825, 1270: 826, 1271: 827, 1272: 828, 1273: 829, 1274: 830, 1275: 831, 1276: 832, 1277: 833, 1278: 834, 1282: 835, 1283: 836, 1284: 837, 1285: 838, 1286: 839, 1287: 840, 1288: 841, 1289: 842, 1290: 843, 1294: 844, 1295: 845, 1296: 846, 1297: 847, 1298: 848, 1299: 849, 1303: 850, 1304: 851, 1305: 852, 1306: 853, 1307: 854, 1308: 855, 1309: 856, 1310: 857, 1314: 858, 1315: 859, 1319: 860, 1320: 861, 1333: 862, 1334: 863, 1338: 864, 1339: 865, 1343: 866, 1344: 867, 1357: 868, 1358: 869, 1362: 870, 1363: 871, 1367: 872, 1368: 873, 1396: 874, 1397: 875, 1398: 876, 1399: 877, 1400: 878, 1401: 879, 1402: 880, 1403: 881, 1404: 882, 1405: 883, 1406: 884, 1407: 885, 1408: 886, 1409: 887, 1410: 888, 1411: 889, 1412: 890, 1413: 891, 1417: 892, 1418: 893, 1419: 894, 1420: 895, 1421: 896, 1422: 897, 1423: 898, 1424: 899, 1425: 900, 1426: 901, 1427: 902, 1428: 903, 1429: 904, 1430: 905, 1431: 906, 1432: 907, 1433: 908, 1434: 909, 1438: 910, 1439: 911, 1440: 912, 1441: 913, 1442: 914, 1443: 915, 1444: 916, 1445: 917, 1446: 918, 1447: 919, 1448: 920, 1449: 921, 1450: 922, 1451: 923, 1452: 924, 1453: 925, 1454: 926, 1455: 927, 1459: 928, 1460: 929, 1461: 930, 1462: 931, 1463: 932, 1464: 933, 1468: 934, 1469: 935, 1470: 936, 1471: 937, 1472: 938, 1473: 939, 1477: 940, 1478: 941, 1479: 942, 1480: 943, 1481: 944, 
1482: 945} [model_handling.py at line 1698]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 6, 1, 6, 6) [model_handling.py at line 1811]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -221,15 +221,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.293 s -Wrote files for 2281 helas calls in 43.698 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 5.868 s +Wrote files for 2281 helas calls in 39.490 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.302 s +ALOHA: aloha creates 5 routines in 0.272 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -237,7 +237,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.296 s +ALOHA: aloha creates 10 routines in 0.268 s VVV1 VVV1 FFV1 @@ -288,6 +288,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m54.095s -user 0m53.047s -sys 0m0.756s +real 0m49.958s +user 0m48.085s +sys 0m0.851s diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index abe8c1ab15..4105134487 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005353450775146484  +DEBUG: model prefixing takes 0.004633426666259766  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.788 s +1 processes with 1240 diagrams generated in 1.703 s Total: 1 processes with 1240 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -186,17 +186,17 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1151]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1152]  DEBUG: multi_channel_map =  None [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1700]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 6, 1, 6, 6) [model_handling.py at line 1813]  +DEBUG: diag_to_config =  {} [model_handling.py at line 1698]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 6, 1, 6, 6) [model_handling.py at line 1811]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -206,7 +206,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1430]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxggg.txt [model_handling.py at line 1324]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.320 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 5.972 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -214,7 +214,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.331 s +ALOHA: aloha creates 5 routines in 0.296 s VVV1 VVV1 FFV1 @@ -256,6 +256,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/ DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m12.606s -user 0m12.392s -sys 0m0.114s +real 0m11.811s +user 0m11.639s +sys 0m0.116s diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index bab8b8f779..1505c1a4b4 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~; generate g q > t t~ q INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00509190559387207  +DEBUG: model prefixing takes 0.004947662353515625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -169,7 +169,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.075 s +8 processes with 40 diagrams generated in 0.071 s Total: 8 processes with 40 diagrams output madevent CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -199,7 +199,7 @@ INFO: Creating files in directory P1_gu_ttxu DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1028]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1029]  DEBUG: proc_id =  1 [model_handling.py at line 1034]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1286]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1288]  @@ -214,9 +214,9 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 3, 4, 5] [model_handling.py at line 1152]  DEBUG: multi_channel =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4]} [model_handling.py at line 1158]  DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4]} [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1700]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  +DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1698]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -237,7 +237,7 @@ INFO: Creating files in directory P1_gux_ttxux DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1028]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1029]  DEBUG: proc_id =  1 [model_handling.py at line 1034]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1286]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1288]  @@ -252,9 +252,9 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 3, 4, 5] [model_handling.py at line 1152]  DEBUG: multi_channel =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4]} [model_handling.py at line 1158]  DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4]} [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1700]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  +DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1698]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -271,17 +271,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -Generated helas calls for 2 subprocesses (10 diagrams) in 0.029 s -Wrote files for 32 helas calls in 0.223 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.028 s +Wrote files for 32 helas calls in 0.213 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.138 s +ALOHA: aloha creates 2 routines in 0.126 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.124 s +ALOHA: aloha creates 4 routines in 0.112 s FFV1 FFV1 FFV1 @@ -325,6 +325,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.313s -user 0m1.757s -sys 0m0.177s +real 0m1.861s +user 0m1.623s +sys 0m0.202s diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index bfaddc5def..b973f7da7e 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~; generate g q > t t~ q INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005072832107543945  +DEBUG: model prefixing takes 0.0045931339263916016  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -169,7 +169,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.075 s +8 processes with 40 diagrams generated in 0.070 s Total: 8 processes with 40 diagrams output standalone_cudacpp CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_SA_OUTPUT @@ -208,9 +208,9 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1151]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1152]  DEBUG: multi_channel_map =  None [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1700]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  +DEBUG: diag_to_config =  {} [model_handling.py at line 1698]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -239,9 +239,9 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1151]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1152]  DEBUG: multi_channel_map =  None [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1700]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  +DEBUG: diag_to_config =  {} [model_handling.py at line 1698]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -251,12 +251,12 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1430]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gux_ttxux.txt [model_handling.py at line 1324]  -Generated helas calls for 2 subprocesses (10 diagrams) in 0.029 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.027 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.137 s +ALOHA: aloha creates 2 routines in 0.124 s FFV1 FFV1 FFV1 @@ -291,6 +291,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gq_ttq/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.791s -user 0m0.673s -sys 0m0.063s +real 0m0.705s +user 0m0.639s +sys 0m0.059s diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 2b0816a6e3..33f009f727 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -127,7 +127,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1 INFO: Process has 1 diagrams -1 processes with 1 diagrams generated in 0.004 s +1 processes with 1 diagrams generated in 0.003 s Total: 1 processes with 1 diagrams output standalone_cudacpp CODEGEN_cudacpp_heft_gg_h Load PLUGIN.CUDACPP_SA_OUTPUT @@ -158,11 +158,11 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1151]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1152]  DEBUG: multi_channel_map =  None [model_handling.py at line 1643]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1700]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1813]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1812]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1813]  +DEBUG: diag_to_config =  {} [model_handling.py at line 1698]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1811]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1810]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1811]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1332]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1341]  @@ -176,7 +176,7 @@ Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines -ALOHA: aloha creates 1 routines in 0.058 s +ALOHA: aloha creates 1 routines in 0.053 s VVS3 FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h/src/. 
@@ -207,6 +207,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h/src DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.612s -user 0m0.439s -sys 0m0.057s +real 0m0.806s +user 0m0.424s +sys 0m0.052s From d2ac31071f8adc0cc5cbcb078f480fe3de7f1b08 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 20 Jul 2023 12:13:08 +0200 Subject: [PATCH 390/509] Moved hip_runtime.h include to mgOnGpuConfig.h --- epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h | 2 -- epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h index 390766116b..f9b5faee54 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif From 35913a385f9961f4ca8e67aabaa37940149c5aa5 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 20 Jul 2023 14:41:55 +0200 Subject: [PATCH 391/509] [CODEGEN] Added HIP runtime include in mgOnGpuConfig.h in codegen --- .../madgraph/iolibs/template_files/gpu/GpuAbstraction.h | 2 -- .../madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 0884c88d37..333572a6ba 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif From ee8875b9f79881f4f379d1f916fbd742c6157aa0 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 25 Jul 2023 16:04:32 +0200 Subject: [PATCH 392/509] [jthip] go back to pre-HIP changes in all 6 mad (except ggttgg) and 7 sa for easier merging from upstream/master git checkout origin/f2py $(gitls *sa *.mad | grep -v 
gg_ttgg.mad) NB: this does NOT remove the newly added GpuAbstraction.h and GpuRuntime.h files, it is better like this... --- epochX/cudacpp/ee_mumu.mad/COPYRIGHT | 1 - .../ee_mumu.mad/Source/DHELAS/aloha_file.inc | 2 +- .../cudacpp/ee_mumu.mad/SubProcesses/Bridge.h | 32 +- .../ee_mumu.mad/SubProcesses/BridgeKernels.cc | 9 +- .../ee_mumu.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../ee_mumu.mad/SubProcesses/CudaRuntime.h | 85 +++++ .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../SubProcesses/EventStatistics.h | 4 +- .../ee_mumu.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../ee_mumu.mad/SubProcesses/MemoryBuffers.h | 64 ++-- .../SubProcesses/P1_epem_mupmum/CPPProcess.cc | 64 ++-- .../SubProcesses/P1_epem_mupmum/CPPProcess.h | 10 +- .../SubProcesses/P1_epem_mupmum/CudaRuntime.h | 1 + .../SubProcesses/P1_epem_mupmum/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../ee_mumu.mad/SubProcesses/cudacpp.mk | 297 +++++++----------- .../ee_mumu.mad/SubProcesses/fbridge.cc | 16 +- .../ee_mumu.mad/SubProcesses/fsampler.cc | 8 +- .../ee_mumu.mad/SubProcesses/runTest.cc | 12 +- .../ee_mumu.mad/SubProcesses/testmisc.cc | 4 +- .../ee_mumu.mad/SubProcesses/testxxx.cc | 8 +- epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/ee_mumu.mad/src/Parameters_sm.cc | 2 +- .../cudacpp/ee_mumu.mad/src/Parameters_sm.h | 6 +- .../cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 70 ++--- .../cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h | 18 +- .../cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h | 10 +- .../cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h | 14 +- epochX/cudacpp/ee_mumu.mad/src/rambo.h | 8 +- epochX/cudacpp/ee_mumu.sa/COPYRIGHT | 1 - .../cudacpp/ee_mumu.sa/SubProcesses/Bridge.h | 32 +- .../ee_mumu.sa/SubProcesses/BridgeKernels.cc | 9 +- .../ee_mumu.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../ee_mumu.sa/SubProcesses/CudaRuntime.h | 85 +++++ .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../ee_mumu.sa/SubProcesses/EventStatistics.h | 4 +- .../ee_mumu.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../ee_mumu.sa/SubProcesses/MemoryBuffers.h | 64 ++-- .../P1_Sigma_sm_epem_mupmum/CPPProcess.cc | 64 ++-- .../P1_Sigma_sm_epem_mupmum/CPPProcess.h | 10 +- .../P1_Sigma_sm_epem_mupmum/CudaRuntime.h | 1 + .../P1_Sigma_sm_epem_mupmum/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../ee_mumu.sa/SubProcesses/cudacpp.mk | 297 +++++++----------- .../ee_mumu.sa/SubProcesses/fbridge.cc | 16 +- .../ee_mumu.sa/SubProcesses/fsampler.cc | 8 +- 
.../ee_mumu.sa/SubProcesses/runTest.cc | 12 +- .../ee_mumu.sa/SubProcesses/testmisc.cc | 4 +- .../ee_mumu.sa/SubProcesses/testxxx.cc | 8 +- epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h | 4 +- .../cudacpp/ee_mumu.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h | 6 +- epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h | 70 ++--- .../cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h | 18 +- .../cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h | 10 +- .../cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h | 14 +- epochX/cudacpp/ee_mumu.sa/src/rambo.h | 8 +- epochX/cudacpp/gg_tt.mad/COPYRIGHT | 1 - .../gg_tt.mad/Source/DHELAS/aloha_file.inc | 2 +- .../cudacpp/gg_tt.mad/SubProcesses/Bridge.h | 32 +- .../gg_tt.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gg_tt.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_tt.mad/SubProcesses/CudaRuntime.h | 85 +++++ .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../gg_tt.mad/SubProcesses/EventStatistics.h | 4 +- .../gg_tt.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../gg_tt.mad/SubProcesses/MemoryBuffers.h | 64 ++-- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 62 ++-- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttx/CudaRuntime.h | 1 + .../SubProcesses/P1_gg_ttx/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 297 +++++++----------- .../cudacpp/gg_tt.mad/SubProcesses/fbridge.cc | 16 +- .../gg_tt.mad/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gg_tt.mad/SubProcesses/runTest.cc | 12 +- .../gg_tt.mad/SubProcesses/testmisc.cc | 4 +- .../cudacpp/gg_tt.mad/SubProcesses/testxxx.cc | 8 +- epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h | 6 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h | 70 ++--- epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h | 18 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h | 10 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h | 14 +- epochX/cudacpp/gg_tt.mad/src/rambo.h | 8 +- epochX/cudacpp/gg_tt.sa/COPYRIGHT | 1 - epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h | 32 +- .../gg_tt.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gg_tt.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_tt.sa/SubProcesses/CudaRuntime.h | 85 +++++ .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../gg_tt.sa/SubProcesses/EventStatistics.h | 4 +- .../gg_tt.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../gg_tt.sa/SubProcesses/MemoryBuffers.h | 64 ++-- .../P1_Sigma_sm_gg_ttx/CPPProcess.cc | 62 ++-- .../P1_Sigma_sm_gg_ttx/CPPProcess.h | 10 +- 
.../P1_Sigma_sm_gg_ttx/CudaRuntime.h | 1 + .../P1_Sigma_sm_gg_ttx/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 297 +++++++----------- .../cudacpp/gg_tt.sa/SubProcesses/fbridge.cc | 16 +- .../cudacpp/gg_tt.sa/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gg_tt.sa/SubProcesses/runTest.cc | 12 +- .../cudacpp/gg_tt.sa/SubProcesses/testmisc.cc | 4 +- .../cudacpp/gg_tt.sa/SubProcesses/testxxx.cc | 8 +- epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h | 6 +- epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h | 70 ++--- epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h | 18 +- epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h | 10 +- epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h | 14 +- epochX/cudacpp/gg_tt.sa/src/rambo.h | 8 +- epochX/cudacpp/gg_tt01g.mad/COPYRIGHT | 1 - .../gg_tt01g.mad/Source/DHELAS/aloha_file.inc | 2 +- .../gg_tt01g.mad/SubProcesses/Bridge.h | 32 +- .../SubProcesses/BridgeKernels.cc | 9 +- .../gg_tt01g.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_tt01g.mad/SubProcesses/CudaRuntime.h | 85 +++++ .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../SubProcesses/EventStatistics.h | 4 +- .../gg_tt01g.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../gg_tt01g.mad/SubProcesses/MemoryBuffers.h | 64 ++-- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 62 ++-- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttx/CudaRuntime.h | 1 + .../SubProcesses/P1_gg_ttx/check_sa.cc | 103 +++--- .../SubProcesses/P2_gg_ttxg/CPPProcess.cc | 62 ++-- .../SubProcesses/P2_gg_ttxg/CPPProcess.h | 10 +- .../SubProcesses/P2_gg_ttxg/CudaRuntime.h | 1 + .../SubProcesses/P2_gg_ttxg/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_tt01g.mad/SubProcesses/cudacpp.mk | 297 +++++++----------- .../gg_tt01g.mad/SubProcesses/fbridge.cc | 16 +- .../gg_tt01g.mad/SubProcesses/fsampler.cc | 8 +- .../gg_tt01g.mad/SubProcesses/runTest.cc | 12 +- .../gg_tt01g.mad/SubProcesses/testmisc.cc | 4 +- .../gg_tt01g.mad/SubProcesses/testxxx.cc | 8 +- epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_tt01g.mad/src/Parameters_sm.cc | 2 +- .../cudacpp/gg_tt01g.mad/src/Parameters_sm.h | 6 +- .../cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h | 70 ++--- .../cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h | 18 +- .../cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h | 10 +- .../cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h | 14 +- epochX/cudacpp/gg_tt01g.mad/src/rambo.h | 8 +- epochX/cudacpp/gg_ttg.mad/COPYRIGHT | 1 - .../gg_ttg.mad/Source/DHELAS/aloha_file.inc | 2 +- .../cudacpp/gg_ttg.mad/SubProcesses/Bridge.h | 32 +- .../gg_ttg.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttg.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- 
.../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttg.mad/SubProcesses/CudaRuntime.h | 85 +++++ .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../gg_ttg.mad/SubProcesses/EventStatistics.h | 4 +- .../gg_ttg.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../gg_ttg.mad/SubProcesses/MemoryBuffers.h | 64 ++-- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 62 ++-- .../SubProcesses/P1_gg_ttxg/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttxg/CudaRuntime.h | 1 + .../SubProcesses/P1_gg_ttxg/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttg.mad/SubProcesses/cudacpp.mk | 297 +++++++----------- .../gg_ttg.mad/SubProcesses/fbridge.cc | 16 +- .../gg_ttg.mad/SubProcesses/fsampler.cc | 8 +- .../gg_ttg.mad/SubProcesses/runTest.cc | 12 +- .../gg_ttg.mad/SubProcesses/testmisc.cc | 4 +- .../gg_ttg.mad/SubProcesses/testxxx.cc | 8 +- epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_ttg.mad/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h | 6 +- epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h | 70 ++--- .../cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h | 18 +- .../cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h | 10 +- .../cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h | 14 +- epochX/cudacpp/gg_ttg.mad/src/rambo.h | 8 +- epochX/cudacpp/gg_ttg.sa/COPYRIGHT | 1 - .../cudacpp/gg_ttg.sa/SubProcesses/Bridge.h | 32 +- .../gg_ttg.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttg.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttg.sa/SubProcesses/CudaRuntime.h | 85 +++++ .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../gg_ttg.sa/SubProcesses/EventStatistics.h | 4 +- .../gg_ttg.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../gg_ttg.sa/SubProcesses/MemoryBuffers.h | 64 ++-- .../P1_Sigma_sm_gg_ttxg/CPPProcess.cc | 62 ++-- .../P1_Sigma_sm_gg_ttxg/CPPProcess.h | 10 +- .../P1_Sigma_sm_gg_ttxg/CudaRuntime.h | 1 + .../P1_Sigma_sm_gg_ttxg/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 297 +++++++----------- .../cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc | 16 +- .../gg_ttg.sa/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gg_ttg.sa/SubProcesses/runTest.cc | 12 +- .../gg_ttg.sa/SubProcesses/testmisc.cc | 4 +- .../cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc | 8 +- epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h | 6 +- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h | 70 ++--- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h | 18 +- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h | 10 +- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h | 14 +- 
epochX/cudacpp/gg_ttg.sa/src/rambo.h | 8 +- epochX/cudacpp/gg_ttgg.sa/COPYRIGHT | 1 - .../cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h | 32 +- .../gg_ttgg.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttgg.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttgg.sa/SubProcesses/CudaRuntime.h | 85 +++++ .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../gg_ttgg.sa/SubProcesses/EventStatistics.h | 4 +- .../gg_ttgg.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../gg_ttgg.sa/SubProcesses/MemoryBuffers.h | 64 ++-- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.cc | 62 ++-- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.h | 10 +- .../P1_Sigma_sm_gg_ttxgg/CudaRuntime.h | 1 + .../P1_Sigma_sm_gg_ttxgg/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttgg.sa/SubProcesses/cudacpp.mk | 297 +++++++----------- .../gg_ttgg.sa/SubProcesses/fbridge.cc | 16 +- .../gg_ttgg.sa/SubProcesses/fsampler.cc | 8 +- .../gg_ttgg.sa/SubProcesses/runTest.cc | 12 +- .../gg_ttgg.sa/SubProcesses/testmisc.cc | 4 +- .../gg_ttgg.sa/SubProcesses/testxxx.cc | 8 +- epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_ttgg.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h | 6 +- epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h | 70 ++--- .../cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h | 18 +- .../cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h | 10 +- .../cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h | 14 +- epochX/cudacpp/gg_ttgg.sa/src/rambo.h | 8 +- epochX/cudacpp/gg_ttggg.mad/COPYRIGHT | 1 - .../gg_ttggg.mad/Source/DHELAS/aloha_file.inc | 2 +- .../gg_ttggg.mad/SubProcesses/Bridge.h | 32 +- .../SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttggg.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttggg.mad/SubProcesses/CudaRuntime.h | 85 +++++ .../SubProcesses/CurandRandomNumberKernel.cc | 8 +- .../SubProcesses/EventStatistics.h | 4 +- .../gg_ttggg.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../gg_ttggg.mad/SubProcesses/MemoryBuffers.h | 64 ++-- .../SubProcesses/P1_gg_ttxggg/CPPProcess.cc | 62 ++-- .../SubProcesses/P1_gg_ttxggg/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttxggg/CudaRuntime.h | 1 + .../SubProcesses/P1_gg_ttxggg/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttggg.mad/SubProcesses/cudacpp.mk | 297 +++++++----------- .../gg_ttggg.mad/SubProcesses/fbridge.cc | 16 +- .../gg_ttggg.mad/SubProcesses/fsampler.cc | 8 +- .../gg_ttggg.mad/SubProcesses/runTest.cc | 12 +- .../gg_ttggg.mad/SubProcesses/testmisc.cc | 4 +- 
 .../gg_ttggg.mad/SubProcesses/testxxx.cc | 8 +-
 epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h | 4 +-
 .../cudacpp/gg_ttggg.mad/src/Parameters_sm.cc | 2 +-
 .../cudacpp/gg_ttggg.mad/src/Parameters_sm.h | 6 +-
 .../cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h | 70 ++--
 .../cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h | 18 +-
 .../cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h | 10 +-
 .../cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h | 14 +-
 epochX/cudacpp/gg_ttggg.mad/src/rambo.h | 8 +-
 epochX/cudacpp/gg_ttggg.sa/COPYRIGHT | 1 -
 .../cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h | 32 +-
 .../gg_ttggg.sa/SubProcesses/BridgeKernels.cc | 9 +-
 .../gg_ttggg.sa/SubProcesses/BridgeKernels.h | 8 +-
 .../SubProcesses/CommonRandomNumberKernel.cc | 5 +-
 .../SubProcesses/CrossSectionKernels.cc | 7 +-
 .../SubProcesses/CrossSectionKernels.h | 6 +-
 .../gg_ttggg.sa/SubProcesses/CudaRuntime.h | 85 +++++
 .../SubProcesses/CurandRandomNumberKernel.cc | 8 +-
 .../SubProcesses/EventStatistics.h | 4 +-
 .../gg_ttggg.sa/SubProcesses/MadgraphTest.h | 8 +-
 .../SubProcesses/MatrixElementKernels.cc | 26 +-
 .../SubProcesses/MatrixElementKernels.h | 8 +-
 .../SubProcesses/MemoryAccessHelpers.h | 4 +-
 .../SubProcesses/MemoryAccessMomenta.h | 6 +-
 .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +-
 .../SubProcesses/MemoryAccessVectors.h | 4 +-
 .../gg_ttggg.sa/SubProcesses/MemoryBuffers.h | 64 ++--
 .../P1_Sigma_sm_gg_ttxggg/CPPProcess.cc | 62 ++--
 .../P1_Sigma_sm_gg_ttxggg/CPPProcess.h | 10 +-
 .../P1_Sigma_sm_gg_ttxggg/CudaRuntime.h | 1 +
 .../P1_Sigma_sm_gg_ttxggg/check_sa.cc | 103 +++---
 .../SubProcesses/RamboSamplingKernels.cc | 20 +-
 .../SubProcesses/RamboSamplingKernels.h | 6 +-
 .../SubProcesses/RandomNumberKernels.h | 6 +-
 .../gg_ttggg.sa/SubProcesses/cudacpp.mk | 297 +++++++-----------
 .../gg_ttggg.sa/SubProcesses/fbridge.cc | 16 +-
 .../gg_ttggg.sa/SubProcesses/fsampler.cc | 8 +-
 .../gg_ttggg.sa/SubProcesses/runTest.cc | 12 +-
 .../gg_ttggg.sa/SubProcesses/testmisc.cc | 4 +-
 .../gg_ttggg.sa/SubProcesses/testxxx.cc | 8 +-
 epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h | 4 +-
 .../cudacpp/gg_ttggg.sa/src/Parameters_sm.cc | 2 +-
 .../cudacpp/gg_ttggg.sa/src/Parameters_sm.h | 6 +-
 .../cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h | 70 ++--
 .../cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h | 18 +-
 .../cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h | 10 +-
 .../cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h | 14 +-
 epochX/cudacpp/gg_ttggg.sa/src/rambo.h | 8 +-
 epochX/cudacpp/gq_ttq.mad/COPYRIGHT | 1 -
 .../gq_ttq.mad/Source/DHELAS/aloha_file.inc | 2 +-
 .../cudacpp/gq_ttq.mad/SubProcesses/Bridge.h | 32 +-
 .../gq_ttq.mad/SubProcesses/BridgeKernels.cc | 9 +-
 .../gq_ttq.mad/SubProcesses/BridgeKernels.h | 8 +-
 .../SubProcesses/CommonRandomNumberKernel.cc | 5 +-
 .../SubProcesses/CrossSectionKernels.cc | 7 +-
 .../SubProcesses/CrossSectionKernels.h | 6 +-
 .../gq_ttq.mad/SubProcesses/CudaRuntime.h | 85 +++++
 .../SubProcesses/CurandRandomNumberKernel.cc | 8 +-
 .../gq_ttq.mad/SubProcesses/EventStatistics.h | 4 +-
 .../gq_ttq.mad/SubProcesses/MadgraphTest.h | 8 +-
 .../SubProcesses/MatrixElementKernels.cc | 26 +-
 .../SubProcesses/MatrixElementKernels.h | 8 +-
 .../SubProcesses/MemoryAccessHelpers.h | 4 +-
 .../SubProcesses/MemoryAccessMomenta.h | 6 +-
 .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +-
 .../SubProcesses/MemoryAccessVectors.h | 4 +-
 .../gq_ttq.mad/SubProcesses/MemoryBuffers.h | 64 ++--
 .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 64 ++--
 .../SubProcesses/P1_gu_ttxu/CPPProcess.h | 10 +-
 .../SubProcesses/P1_gu_ttxu/CudaRuntime.h | 1 +
 .../SubProcesses/P1_gu_ttxu/check_sa.cc | 103 +++---
 .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 62 ++--
 .../SubProcesses/P1_gux_ttxux/CPPProcess.h | 10 +-
 .../SubProcesses/P1_gux_ttxux/CudaRuntime.h | 1 +
 .../SubProcesses/P1_gux_ttxux/check_sa.cc | 103 +++---
 .../SubProcesses/RamboSamplingKernels.cc | 20 +-
 .../SubProcesses/RamboSamplingKernels.h | 6 +-
 .../SubProcesses/RandomNumberKernels.h | 6 +-
 .../gq_ttq.mad/SubProcesses/cudacpp.mk | 297 +++++++-----------
 .../gq_ttq.mad/SubProcesses/fbridge.cc | 16 +-
 .../gq_ttq.mad/SubProcesses/fsampler.cc | 8 +-
 .../gq_ttq.mad/SubProcesses/runTest.cc | 12 +-
 .../gq_ttq.mad/SubProcesses/testmisc.cc | 4 +-
 .../gq_ttq.mad/SubProcesses/testxxx.cc | 8 +-
 epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h | 4 +-
 .../cudacpp/gq_ttq.mad/src/Parameters_sm.cc | 2 +-
 epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h | 6 +-
 epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h | 70 ++--
 .../cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h | 18 +-
 .../cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h | 10 +-
 .../cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h | 14 +-
 epochX/cudacpp/gq_ttq.mad/src/rambo.h | 8 +-
 epochX/cudacpp/gq_ttq.sa/COPYRIGHT | 1 -
 .../cudacpp/gq_ttq.sa/SubProcesses/Bridge.h | 32 +-
 .../gq_ttq.sa/SubProcesses/BridgeKernels.cc | 9 +-
 .../gq_ttq.sa/SubProcesses/BridgeKernels.h | 8 +-
 .../SubProcesses/CommonRandomNumberKernel.cc | 5 +-
 .../SubProcesses/CrossSectionKernels.cc | 7 +-
 .../SubProcesses/CrossSectionKernels.h | 6 +-
 .../gq_ttq.sa/SubProcesses/CudaRuntime.h | 85 +++++
 .../SubProcesses/CurandRandomNumberKernel.cc | 8 +-
 .../gq_ttq.sa/SubProcesses/EventStatistics.h | 4 +-
 .../gq_ttq.sa/SubProcesses/MadgraphTest.h | 8 +-
 .../SubProcesses/MatrixElementKernels.cc | 26 +-
 .../SubProcesses/MatrixElementKernels.h | 8 +-
 .../SubProcesses/MemoryAccessHelpers.h | 4 +-
 .../SubProcesses/MemoryAccessMomenta.h | 6 +-
 .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +-
 .../SubProcesses/MemoryAccessVectors.h | 4 +-
 .../gq_ttq.sa/SubProcesses/MemoryBuffers.h | 64 ++--
 .../P1_Sigma_sm_gu_ttxu/CPPProcess.cc | 64 ++--
 .../P1_Sigma_sm_gu_ttxu/CPPProcess.h | 10 +-
 .../P1_Sigma_sm_gu_ttxu/CudaRuntime.h | 1 +
 .../P1_Sigma_sm_gu_ttxu/check_sa.cc | 103 +++---
 .../P1_Sigma_sm_gux_ttxux/CPPProcess.cc | 62 ++--
 .../P1_Sigma_sm_gux_ttxux/CPPProcess.h | 10 +-
 .../P1_Sigma_sm_gux_ttxux/CudaRuntime.h | 1 +
 .../P1_Sigma_sm_gux_ttxux/check_sa.cc | 103 +++---
 .../SubProcesses/RamboSamplingKernels.cc | 20 +-
 .../SubProcesses/RamboSamplingKernels.h | 6 +-
 .../SubProcesses/RandomNumberKernels.h | 6 +-
 .../cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk | 297 +++++++-----------
 .../cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc | 16 +-
 .../gq_ttq.sa/SubProcesses/fsampler.cc | 8 +-
 .../cudacpp/gq_ttq.sa/SubProcesses/runTest.cc | 12 +-
 .../gq_ttq.sa/SubProcesses/testmisc.cc | 4 +-
 .../cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc | 8 +-
 epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h | 4 +-
 epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc | 2 +-
 epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h | 6 +-
 epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h | 70 ++--
 epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h | 18 +-
 epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h | 10 +-
 epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h | 14 +-
 epochX/cudacpp/gq_ttq.sa/src/rambo.h | 8 +-
 epochX/cudacpp/heft_gg_h.sa/COPYRIGHT | 1 -
 .../heft_gg_h.sa/SubProcesses/Bridge.h | 32 +-
 .../SubProcesses/BridgeKernels.cc | 9 +-
 .../heft_gg_h.sa/SubProcesses/BridgeKernels.h | 8 +-
 .../SubProcesses/CommonRandomNumberKernel.cc | 5 +-
 .../SubProcesses/CrossSectionKernels.cc | 7 +-
 .../SubProcesses/CrossSectionKernels.h | 6 +-
 .../heft_gg_h.sa/SubProcesses/CudaRuntime.h | 85 +++++
 .../SubProcesses/CurandRandomNumberKernel.cc | 8 +-
 .../SubProcesses/EventStatistics.h | 4 +-
 .../heft_gg_h.sa/SubProcesses/MadgraphTest.h | 8 +-
 .../SubProcesses/MatrixElementKernels.cc | 26 +-
 .../SubProcesses/MatrixElementKernels.h | 8 +-
 .../SubProcesses/MemoryAccessHelpers.h | 4 +-
 .../SubProcesses/MemoryAccessMomenta.h | 6 +-
 .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +-
 .../SubProcesses/MemoryAccessVectors.h | 4 +-
 .../heft_gg_h.sa/SubProcesses/MemoryBuffers.h | 64 ++--
 .../P1_Sigma_heft_gg_h/CPPProcess.cc | 62 ++--
 .../P1_Sigma_heft_gg_h/CPPProcess.h | 10 +-
 .../P1_Sigma_heft_gg_h/CudaRuntime.h | 1 +
 .../P1_Sigma_heft_gg_h/check_sa.cc | 103 +++---
 .../SubProcesses/RamboSamplingKernels.cc | 20 +-
 .../SubProcesses/RamboSamplingKernels.h | 6 +-
 .../SubProcesses/RandomNumberKernels.h | 6 +-
 .../heft_gg_h.sa/SubProcesses/cudacpp.mk | 297 +++++++-----------
 .../heft_gg_h.sa/SubProcesses/fbridge.cc | 16 +-
 .../heft_gg_h.sa/SubProcesses/fsampler.cc | 8 +-
 .../heft_gg_h.sa/SubProcesses/runTest.cc | 12 +-
 .../heft_gg_h.sa/SubProcesses/testmisc.cc | 4 +-
 .../heft_gg_h.sa/SubProcesses/testxxx.cc | 8 +-
 .../cudacpp/heft_gg_h.sa/src/HelAmps_heft.h | 4 +-
 .../heft_gg_h.sa/src/Parameters_heft.cc | 2 +-
 .../heft_gg_h.sa/src/Parameters_heft.h | 6 +-
 .../cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h | 70 ++--
 .../cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h | 18 +-
 .../cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h | 10 +-
 .../cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h | 14 +-
 epochX/cudacpp/heft_gg_h.sa/src/rambo.h | 8 +-
 525 files changed, 6360 insertions(+), 6850 deletions(-)
 create mode 100644 epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h
 create mode 120000 epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h
 create mode 100644 epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h
 create mode 120000 epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h
 create mode 100644 epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h
 create mode 120000 epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h
 create mode 100644 epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h
 create mode 120000 epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h
 create mode 100644 epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h
 create mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h
 create mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h
 create mode 100644 epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h
 create mode 120000 epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h
 create mode 100644 epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h
 create mode 120000 epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h
 create mode 100644 epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h
 create mode 120000 epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h
 create mode 100644 epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h
 create mode 120000 epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h
 create mode 100644 epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h
 create mode 120000 epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h
 create mode 100644 epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h
 create mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h
 create mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h
 create mode 100644 epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h
 create mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h
 create mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h
 create mode 100644 epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h
 create mode 120000 epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h

diff --git a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT
index 84a883fbb0..a134b5fef9 100644
--- a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT
+++ b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT
@@ -15,7 +15,6 @@ The full development team currently includes the following authors :
   Stephan Hageboeck (CERN)
   Olivier Mattelaer (Universite Catholique de Louvain, original author)
   Stefan Roiser (CERN, original author)
-  Joergen Teig (CERN)
   Andrea Valassi (CERN, original author)
   Zenny Wettersten (CERN)
 See https://github.com/madgraph5/madgraph4gpu for more details. For the full
diff --git a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc
index 738db319fd..4f385d6435 100644
--- a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc
+++ b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc
@@ -1 +1 @@
-ALOHARoutine = FFV1_0.o FFV4_3.o FFV1P0_3.o FFV2_0.o FFV4_0.o FFV2_3.o
+ALOHARoutine = FFV2_3.o FFV2_0.o FFV4_0.o FFV4_3.o FFV1_0.o FFV1P0_3.o
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h
index c04628dfd1..4cafe0c997 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
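Note on the guard change that runs through every file in this patch: reverting MGONGPUCPP_GPUIMPL to __CUDACC__ means the backend is again selected by the CUDA compiler itself, with one header compiling into the mg5amcGpu namespace under nvcc and into mg5amcCpu in a plain C++ build. A minimal standalone sketch of this dual-namespace pattern (backendName() is a hypothetical helper, not part of the patch):

    #include <iostream>

    #ifdef __CUDACC__
    namespace mg5amcGpu
    #else
    namespace mg5amcCpu
    #endif
    {
      // Hypothetical helper: reports which backend this translation unit was built for
      inline const char* backendName()
      {
    #ifdef __CUDACC__
        return "CUDA (nvcc)";
    #else
        return "C++ host";
    #endif
      }
    }

    int main()
    {
    #ifdef __CUDACC__
      std::cout << mg5amcGpu::backendName() << std::endl;
    #else
      std::cout << mg5amcCpu::backendName() << std::endl;
    #endif
      return 0;
    }

The same translation unit can thus be compiled twice, once per backend, without any runtime dispatch.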
 #ifndef BRIDGE_H
 #define BRIDGE_H 1
@@ -22,7 +22,7 @@
 #include
 #include
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -82,7 +82,7 @@ namespace mg5amcCpu
     Bridge& operator=( const Bridge& ) = delete;
     Bridge& operator=( Bridge&& ) = delete;
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     /**
      * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads
      * (this is needed for BridgeKernel tests rather than for actual production use in Fortran)
@@ -149,7 +149,7 @@ namespace mg5amcCpu
     unsigned int m_nevt; // number of events
     int m_nGoodHel;      // the number of good helicities (-1 initially when they have not yet been calculated)
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     int m_gputhreads; // number of gpu threads (default set from number of events, can be modified)
     int m_gpublocks;  // number of gpu blocks (default set from number of events, can be modified)
     mg5amcGpu::DeviceBuffer m_devMomentaF;
@@ -186,12 +186,12 @@ namespace mg5amcCpu
   // Forward declare transposition methods
   //
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   template
   __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );
-#endif // MGONGPUCPP_GPUIMPL
+#endif // __CUDACC__
   template
   void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );
@@ -208,7 +208,7 @@ namespace mg5amcCpu
   Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F )
     : m_nevt( nevtF )
     , m_nGoodHel( -1 )
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     , m_gputhreads( 256 )                // default number of gpu threads
     , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads
     , m_devMomentaF( m_nevt )
@@ -232,7 +232,7 @@ namespace mg5amcCpu
   {
     if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" );
     if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" );
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) )
       throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) );
     while( m_nevt != m_gpublocks * m_gputhreads )
@@ -250,11 +250,11 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
     mg5amcCpu::CPPProcess process( /*verbose=*/false );
     m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPUCPP_GPUIMPL
+#endif // __CUDACC__
     process.initProc( "../../Cards/param_card.dat" );
   }
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   template
   void Bridge::set_gpugrid( const int gpublocks, const int gputhreads )
   {
@@ -268,7 +268,7 @@ namespace mg5amcCpu
   }
 #endif
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   template
   void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta,
                              const FORTRANFPTYPE* gs,
@@ -283,14 +283,14 @@ namespace mg5amcCpu
     constexpr int neppM = MemoryAccessMomenta::neppM;
     if constexpr( neppM == 1 && std::is_same_v )
     {
-      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
+      checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
     }
     else
     {
-      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
+      checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
       const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
       //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
-      gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+      dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
     }
     if constexpr( std::is_same_v )
     {
@@ -333,7 +333,7 @@ namespace mg5amcCpu
   }
 #endif
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
   template
   void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta,
                              const FORTRANFPTYPE* gs,
@@ -388,7 +388,7 @@ namespace mg5amcCpu
   // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
   //
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   template
   __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
   {
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc
index 90c7f2d3b8..cef4cb3c71 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc
@@ -1,11 +1,10 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
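The gpu_sequence hunks above replace the gpuMemcpy/gpuLaunchKernel abstraction with direct CUDA runtime calls and triple-chevron launches. A self-contained sketch of that pattern, with the same checkCuda-style error handling (the doubleAll kernel and the buffer sizes are illustrative assumptions, not code from the patch):

    #include <cassert>
    #include <cstdio>
    #include <cuda_runtime.h>

    #define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
    inline void assertCuda( cudaError_t code, const char* file, int line )
    {
      if( code != cudaSuccess )
      {
        printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
        assert( code == cudaSuccess );
      }
    }

    __global__ void doubleAll( double* data ) // illustrative kernel, one element per thread
    {
      const int i = blockDim.x * blockIdx.x + threadIdx.x;
      data[i] *= 2;
    }

    int main()
    {
      constexpr int nblocks = 2, nthreads = 32, nevt = nblocks * nthreads;
      double hst[nevt] = { 1 };
      double* dev = nullptr;
      checkCuda( cudaMalloc( &dev, nevt * sizeof( double ) ) );
      checkCuda( cudaMemcpy( dev, hst, nevt * sizeof( double ), cudaMemcpyHostToDevice ) );
      doubleAll<<<nblocks, nthreads>>>( dev ); // same launch syntax as dev_transposeMomentaF2C above
      checkCuda( cudaPeekAtLastError() );
      checkCuda( cudaMemcpy( hst, dev, nevt * sizeof( double ), cudaMemcpyDeviceToHost ) );
      checkCuda( cudaFree( dev ) );
      return 0;
    }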
#include "BridgeKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -15,7 +14,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -46,7 +45,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -97,7 +96,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h index 3efef8ce97..15eb4bff4d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc index 010bc4cbd0..985b39f576 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,16 +1,15 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" -#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc index c15b39844d..0b355a3c8d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 #include "CrossSectionKernels.h"
-#include "GpuAbstraction.h"
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessWeights.h"
 #include "MemoryBuffers.h"
@@ -78,7 +77,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL )
 //============================================================================
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -186,7 +185,7 @@ namespace mg5amcCpu
 //============================================================================
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 {
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h
index 4d9659e04e..7933ca4bbf 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 #ifndef CROSSSECTIONKERNELS_H
 #define CROSSSECTIONKERNELS_H 1
@@ -13,7 +13,7 @@
 //============================================================================
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -96,7 +96,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 /*
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 // A class encapsulating the calculation of event statistics on a GPU device
 class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents
 {
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h
new file mode 100644
index 0000000000..64ce52f4b3
--- /dev/null
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef MG5AMC_CUDARUNTIME_H
+#define MG5AMC_CUDARUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include
+#include
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef __CUDACC__ /* clang-format off */
+#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
+inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != cudaSuccess )
+  {
+    printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == cudaSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+namespace mg5amcGpu
+{
+  // Instantiate a CudaRuntime at the beginning of the application's main to
+  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct CudaRuntime final
+  {
+    CudaRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~CudaRuntime() { tearDown( m_debug ); }
+    CudaRuntime( const CudaRuntime& ) = delete;
+    CudaRuntime( CudaRuntime&& ) = delete;
+    CudaRuntime& operator=( const CudaRuntime& ) = delete;
+    CudaRuntime& operator=( CudaRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl;
+      checkCuda( cudaSetDevice( 0 ) ); // SLOW!
+    }
+
+    // Tear down CUDA application (call cudaDeviceReset)
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck
+    // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
+    static void tearDown( const bool debug = true )
+    {
+      if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl;
+      checkCuda( cudaDeviceReset() );
+    }
+  };
+
+}
+#endif
+
+//--------------------------------------------------------------------------
+
+#endif // MG5AMC_CUDARUNTIME_H
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc
index 38c477c17a..eb56333b03 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc
@@ -1,9 +1,9 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A.
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h index b425a5bade..48b51e0a49 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h index d2ff326e20..fd7734ce42 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..30257195b6 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkGpu( gpuPeekAtLastError() ); + checkCuda( cudaPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkCuda( cudaPeekAtLastError() ); + checkCuda( cudaDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h index 72bd8f195b..23e84757a2 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h index db73e4e064..c82a6c7635 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
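The sigmaKin launches above pass an optional dynamic shared-memory size as the third launch-configuration argument (sharedMemSize, non-zero only under MGONGPU_NSIGHT_DEBUG). A minimal sketch of a kernel that sizes its extern __shared__ buffer this way (sumPerBlock is illustrative, not from the patch):

    #include <cuda_runtime.h>

    __global__ void sumPerBlock( const float* in, float* out )
    {
      extern __shared__ float buf[]; // sized at launch via the third <<<...>>> parameter
      const int i = blockDim.x * blockIdx.x + threadIdx.x;
      buf[threadIdx.x] = in[i]; // grid assumed sized to the input, as in the MG5aMC kernels
      __syncthreads();
      if( threadIdx.x == 0 )
      {
        float s = 0;
        for( unsigned int j = 0; j < blockDim.x; j++ ) s += buf[j];
        out[blockIdx.x] = s;
      }
    }

    // Launch: the third argument is the dynamic shared-memory size in bytes,
    // mirroring how sigmaKin is launched with sharedMemSize above:
    //   sumPerBlock<<<gpublocks, gputhreads, gputhreads * sizeof( float )>>>( devIn, devOut );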
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h index 38fade09fb..0ac4faa3c7 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h index 40cb089135..e2988d39f3 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
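KernelAccessHelper above derives the event index from the CUDA grid as blockDim.x * blockIdx.x + threadIdx.x, i.e. one event per GPU thread. In isolation the indexing looks like this (the bounds guard is an assumption for general grids; the MG5aMC kernels size the grid to exactly nevt):

    #include <cuda_runtime.h>

    // One event per GPU thread: the same indexing used by KernelAccessHelper above.
    __global__ void fillEventIndex( int* out, unsigned int nevt )
    {
      const unsigned int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
      if( ievt < nevt ) out[ievt] = ievt; // guard only needed if the grid overshoots nevt
    }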
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h index 08faccff0f..e9b197368e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h index 7756a71621..3093e6ed18 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
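The neppM constant in MemoryAccessMomenta.h above fixes how many events share one page of the momenta AOSOA, momenta[npagM][npar][np4][neppM] with nevt = npagM * neppM (see the Bridge.h comment earlier); keeping it a compile-time power of two is what makes global-memory loads coalesce on the GPU. A sketch of the flat index arithmetic implied by that layout (aosoaIndex is a hypothetical helper, not the real accessor):

    #include <cstddef>

    // Flat AOSOA index for momenta[npagM][npar][np4][neppM], illustration only.
    constexpr std::size_t aosoaIndex( std::size_t ievt, std::size_t ipar, std::size_t ip4,
                                      std::size_t npar, std::size_t np4, std::size_t neppM )
    {
      const std::size_t ipagM = ievt / neppM; // page (outer event block)
      const std::size_t ieppM = ievt % neppM; // event within the page
      return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
    }

    // Event 5 with npar=np4=neppM=4 lands in page 1, slot 1: base 64 plus 1.
    static_assert( aosoaIndex( 5, 0, 0, 4, 4, 4 ) == 65, "AOSOA index check" );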
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "Parameters_sm.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - gpuMallocHost( &( this->m_data ), this->bytes() ); + checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); } virtual ~PinnedHostBufferBase() { - gpuFreeHost( this->m_data ); + checkCuda( cudaFreeHost( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - gpuMalloc( &( this->m_data ), this->bytes() ); + checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); } virtual ~DeviceBufferBase() { - gpuFree( this->m_data ); + checkCuda( cudaFree( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); } #endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 9a16d0301d..1d0299e4e6 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -45,7 +46,7 @@ // Class member functions for calculating the matrix elements for // Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -79,7 +80,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MZ, (fptype)Parameters_sm::mdl_WZ }; __device__ const fptype cIPC[6] = { (fptype)Parameters_sm::GC_3.real(), (fptype)Parameters_sm::GC_3.imag(), (fptype)Parameters_sm::GC_50.real(), (fptype)Parameters_sm::GC_50.imag(), (fptype)Parameters_sm::GC_59.real(), (fptype)Parameters_sm::GC_59.imag() }; #else -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype cIPC[6]; #else @@ -89,7 +90,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -117,13 +118,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -150,7 +151,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -186,7 +187,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -199,10 +200,8 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop -#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -239,7 +238,7 @@ namespace mg5amcCpu // *** DIAGRAM 1 OF 2 *** // Wavefunction(s) for diagram number 1 -#if not( defined MGONGPUCPP_GPUIMPL and defined MGONGPU_TEST_DIVERGENCE ) +#if not( defined __CUDACC__ and defined MGONGPU_TEST_DIVERGENCE ) opzxxx( momenta, cHel[ihel][0], -1, w_fp[0], 0 ); // NB: opzxxx only uses pz #else if( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 == 0 ) @@ -294,7 +293,7 @@ namespace 
mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -351,7 +350,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -410,7 +409,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -457,8 +456,8 @@ namespace mg5amcCpu { 1, -1, 1, 1 }, { 1, -1, -1, -1 }, { 1, -1, -1, 1 } }; -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -498,9 +497,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; const cxtype tIPC[3] = { cxmake( m_pars->GC_3 ), cxmake( m_pars->GC_50 ), cxmake( m_pars->GC_59 ) }; -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); - gpuMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ) ); #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); memcpy( cIPC, tIPC, 3 * sizeof( cxtype ) ); @@ -537,7 +536,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] + // [Use __NVCC__ instead of __CUDACC__ here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -602,12 +601,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -628,7 +627,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -757,9 +756,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); - gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); + checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -783,7 +782,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -804,7 +803,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -818,12 +817,9 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - -#include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -851,7 +847,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ +#ifdef __CUDACC__ // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1055,7 +1051,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL 
if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index ebbc2800d3..08d6c29e7b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h new file mode 120000 index 0000000000..ce9e1a487a --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h @@ -0,0 +1 @@ +../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc index 1bad694d1c..f1e75b9252 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
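The CPPProcess.cc hunks above restore direct cudaMemcpyToSymbol calls to fill __device__ __constant__ arrays such as cHel, cIPD and cIPC once per run, so every kernel thread can read them through the fast constant cache. A minimal sketch of that one-time upload into constant memory (cHelSketch and uploadHelicities are illustrative names, not from the patch):

    #include <cassert>
    #include <cuda_runtime.h>

    __device__ __constant__ short cHelSketch[4]; // illustrative stand-in for cHel[ncomb][npar]

    // Host-side one-time upload, as in CPPProcess::initProc above.
    void uploadHelicities()
    {
      const short tHel[4] = { -1, -1, -1, -1 };
      const cudaError_t code = cudaMemcpyToSymbol( cHelSketch, tHel, 4 * sizeof( short ) );
      assert( code == cudaSuccess );
    }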
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,7 +12,6 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" -#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -64,7 +63,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -78,7 +77,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -104,11 +103,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -116,7 +115,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -149,7 +148,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs" ); #endif } else if( arg == "--curhst" ) @@ -166,7 +165,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -240,13 +239,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -264,14 +263,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ - // --- 00. Initialise GPU - // Instantiate a GpuRuntime at the beginnining of the application's main. 
- // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
- const std::string cdinKey = "00 GpuInit";
+ // --- 00. Initialise CUDA
+ // Instantiate a CudaRuntime at the beginning of the application's main to
+ // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
+ const std::string cdinKey = "00 CudaInit";
timermap.start( cdinKey );
- GpuRuntime GpuRuntime( debug );
+ CudaRuntime cudaRuntime( debug );
#endif
// --- 0a. Initialise physics process
@@ -293,7 +292,7 @@ main( int argc, char** argv )
timermap.start( alloKey );
// Memory buffers for random numbers for momenta
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
HostBufferRndNumMomenta hstRndmom( nevt );
#else
PinnedHostBufferRndNumMomenta hstRndmom( nevt );
@@ -301,7 +300,7 @@
#endif
// Memory buffers for sampling weights
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
HostBufferWeights hstWeights( nevt );
#else
PinnedHostBufferWeights hstWeights( nevt );
@@ -309,7 +308,7 @@
#endif
// Memory buffers for momenta
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
HostBufferMomenta hstMomenta( nevt );
#else
PinnedHostBufferMomenta hstMomenta( nevt );
@@ -317,7 +316,7 @@
#endif
// Memory buffers for Gs
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
HostBufferGs hstGs( nevt );
#else
PinnedHostBufferGs hstGs( nevt );
@@ -334,7 +333,7 @@
}
// Memory buffers for matrix elements
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
HostBufferMatrixElements hstMatrixElements( nevt );
#else
PinnedHostBufferMatrixElements hstMatrixElements( nevt );
@@ -343,7 +342,7 @@
// Memory buffers for random numbers for helicity selection
// *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) ***
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
HostBufferRndNumHelicity hstRndHel( nevt );
#else
PinnedHostBufferRndNumHelicity hstRndHel( nevt );
@@ -352,7 +351,7 @@
// Memory buffers for random numbers for color selection
// *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) ***
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
HostBufferRndNumColor hstRndCol( nevt );
#else
PinnedHostBufferRndNumColor hstRndCol( nevt );
@@ -360,7 +359,7 @@
#endif
// Memory buffers for helicity selection
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
HostBufferSelectedHelicity hstSelHel( nevt );
#else
PinnedHostBufferSelectedHelicity hstSelHel( nevt );
@@ -368,7 +367,7 @@
#endif
// Memory buffers for color selection
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
HostBufferSelectedColor hstSelCol( nevt );
#else
PinnedHostBufferSelectedColor hstSelCol( nevt );
@@ -404,7 +403,7 @@
#else
else
{
- throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement)
+ throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
}
#endif
#else
@@ -422,7 +421,7 @@
}
else
{
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) );
#else
throw std::logic_error(
"RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -433,7 +432,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -441,7 +440,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -483,7 +482,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -515,7 +514,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -560,7 +559,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -589,7 +588,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( !bridge ) { // --- 3b. CopyDToH MEs @@ -732,19 +731,15 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; -#elif defined __HIPCC__ - rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or HIP or C++? + // -- CUDA or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; -#elif defined __HIPCC__ - wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -769,12 +764,6 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif -#elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL - wrkflwtxt += "CXS:"; -#else - wrkflwtxt += "???:"; // no path to this statement -#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -800,7 +789,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? 
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -856,7 +845,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -877,8 +866,6 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" -#elif defined __HIPCC__ - << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -905,21 +892,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "Complex type = STD::COMPLEX" << std::endl +#endif #else - << "Complex type = ???" << std::endl // no path to this statement... + << "Complex type = STD::COMPLEX" << std::endl #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -950,7 +937,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1046,14 +1033,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "\"STD::COMPLEX\"," << std::endl +#endif #else - << "\"???\"," << std::endl // no path to this statement... + << "\"STD::COMPLEX\"," << std::endl #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1061,7 +1048,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc index 79abbcc4f8..da68aa9255 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
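The HostBuffer/PinnedHostBuffer switches in the check_sa.cc hunks above select page-locked (pinned) host memory in the CUDA build, so host-device copies can go through DMA and, on a stream, overlap with compute. A minimal sketch of the underlying runtime calls; the buffer names are illustrative, not the plugin's.

#include <cuda_runtime.h>

int main()
{
  const int nevt = 16384;
  double* hstMEs = nullptr; // pinned host buffer (illustrative name)
  double* devMEs = nullptr; // device buffer (illustrative name)
  cudaMallocHost( &hstMEs, nevt * sizeof( double ) ); // page-locked, unlike plain new/malloc
  cudaMalloc( &devMEs, nevt * sizeof( double ) );
  for( int i = 0; i < nevt; i++ ) hstMEs[i] = 0;
  // pinned memory is also what makes cudaMemcpyAsync genuinely asynchronous
  cudaMemcpy( devMEs, hstMEs, nevt * sizeof( double ), cudaMemcpyHostToDevice );
  cudaFree( devMEs );
  cudaFreeHost( hstMEs ); // pinned allocations must be released with cudaFreeHost
  return 0;
}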
#include "RamboSamplingKernels.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaInitial() { - gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); + getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaFinal() { - gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h index 7c214cd74b..184089efd7 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h index 21d63beeac..188a72c2c9 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index bcb73d7f01..a0397e9ecc 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,139 +89,69 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If CUDA_HOME is not set, try to set it from the location of NVCC - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif - - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - - CUDATESTFLAGS = -lcuda - - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif - - # Set the host C++ compiler for GPUCC via "-ccbin " - # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) - GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif - -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif - - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 
6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +#=== Configure the CUDA compiler + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) +# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the location of nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + NVCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+ CUOPTFLAGS = -lineinfo + CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) +else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) +else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override NVCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= +endif - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif +# Set the host C++ compiler for nvcc via "-ccbin " +# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) +CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) +# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) +ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) +CUFLAGS += -allow-unsupported-compiler endif - #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -233,9 +163,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) - override GPUCC:=ccache $(GPUCC) +ifneq ($(NVCC),) + ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) + override NVCC:=ccache $(NVCC) endif endif @@ -259,7 +189,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - GPUFLAGS+= -Xcompiler -mno-float128 + CUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -269,10 +199,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -323,10 +253,7 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + ifeq ($(NVCC),) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -401,13 +328,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -416,7 +343,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + CUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -425,7 +352,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + CUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -477,11 +404,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -494,7 +421,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) +ifneq ($(NVCC),) 
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -525,16 +452,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -543,14 +469,11 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) -# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifeq ($(findstring nvcc,$(GPUCC)),nvcc) - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math -else - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math +ifneq ($(NVCC),) +$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math endif endif @@ -566,10 +489,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(GPUCC),) -GPUFLAGS += -Wno-deprecated-builtins +ifneq ($(NVCC),) +CUFLAGS += -Xcompiler -Wno-deprecated-builtins endif endif @@ -577,8 +500,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(NVCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -605,7 +528,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -617,11 +540,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -638,16 +561,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
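The CUARCHFLAGS block above embeds, for each architecture in MADGRAPH_CUDA_ARCHITECTURE, both native SASS (code=sm_70) and forward-compatible PTX (code=compute_70) that newer GPUs can JIT-compile at load time. A small runtime query, assuming only the standard CUDA runtime API, shows which compute capability each installed device actually reports:

#include <cuda_runtime.h>
#include <cstdio>

int main()
{
  int n = 0;
  cudaGetDeviceCount( &n );
  for( int i = 0; i < n; i++ )
  {
    cudaDeviceProp prop;
    cudaGetDeviceProperties( &prop, i );
    // sm_70 <-> major=7, minor=0: matching SASS runs natively, while a newer
    // device (e.g. 8.0) falls back to JIT-compiling the embedded compute_70 PTX
    printf( "device %d: %s, compute capability %d.%d\n", i, prop.name, prop.major, prop.minor );
  }
  return 0;
}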
#------------------------------------------------------------------------------- @@ -673,17 +596,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -695,7 +618,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -708,7 +631,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -720,12 +643,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -749,14 +672,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(GPUCC),) # link only runTest.o +ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -859,9 +782,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo GPUCC=$(GPUCC) -ifneq ($(GPUCC),) - $(GPUCC) --version + @echo NVCC=$(NVCC) +ifneq ($(NVCC),) + $(NVCC) --version endif @echo "" @echo CXX=$(CXX) @@ -880,7 +803,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(GPUCC),) +ifneq ($(NVCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc index 2b956730d4..f93c05b0b3 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::setUp(); +#ifdef __CUDACC__ + CudaRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? 
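The fbridge.cc hunk above follows the usual Fortran-to-C++ bridging pattern: C-linkage entry points with trailing underscores (matching common Fortran name mangling), an opaque handle returned to Fortran, and a dynamic_cast to validate that handle on every call. A minimal sketch of the shape; all names are illustrative, only the pattern is taken from the code above.

#include <stdexcept>

struct CppObjectInFortranSketch // polymorphic base so dynamic_cast can validate handles
{
  virtual ~CppObjectInFortranSketch() {}
};

struct BridgeSketch : public CppObjectInFortranSketch
{
  int nevt;
  BridgeSketch( int n ) : nevt( n ) {}
};

extern "C" // unmangled symbols, callable from Fortran as sketchcreate / sketchdelete
{
  void sketchcreate_( CppObjectInFortranSketch** pp, const int* pnevt )
  {
    *pp = new BridgeSketch( *pnevt ); // Fortran keeps the pointer as an opaque integer
  }
  void sketchdelete_( CppObjectInFortranSketch** pp )
  {
    BridgeSketch* p = dynamic_cast<BridgeSketch*>( *pp );
    if( p == 0 ) throw std::runtime_error( "sketchdelete_: invalid Bridge address" );
    delete p;
  }
}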
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::tearDown(); +#ifdef __CUDACC__ + CudaRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc index 3743934f41..2fb445372d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc index 461ec5c3a5..572e28aaea 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
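Both check_sa.cc and fbridge.cc above lean on a CudaRuntime helper whose constructor selects the device up front and whose destructor books the matching cudaDeviceReset() call. A minimal sketch of that RAII shape, assuming only the two calls documented in the comments above:

#include <cuda_runtime.h>

struct CudaRuntimeSketch // illustrative stand-in for the CudaRuntime described above
{
  CudaRuntimeSketch() { cudaSetDevice( 0 ); }  // select the device before any allocation
  ~CudaRuntimeSketch() { cudaDeviceReset(); }  // release the context on scope exit
};

int main()
{
  CudaRuntimeSketch runtime; // constructed first, destroyed last in main
  // ... allocate buffers, launch kernels ...
  return 0;
}

Tying setup and teardown to a stack object guarantees the reset runs on every exit path, which is exactly what leak checkers such as cuda-memcheck need.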
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc index 2bd7a9fcf9..989aba1fdc 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc index 6e8657edca..4243e9fcec 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h index c2c572778b..fe9cb24d88 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc index 89bbb57a0d..daed91bb80 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h index 5e20ca27b5..852861ced0 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h @@ -210,7 +210,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -234,7 +234,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -252,7 +252,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 390766116b..881353abac 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,26 +10,13 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 -// Is this a GPU (CUDA, HIP) or CPU implementation? -#ifdef __CUDACC__ -#define MGONGPUCPP_GPUIMPL cuda -#elif defined __HIPCC__ -#define MGONGPUCPP_GPUIMPL hip -#else -#undef MGONGPUCPP_GPUIMPL -#endif - // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For CUDA, by default, it is supported -// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND -#elif defined __HIPCC__ -#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -65,28 +52,23 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#ifndef __CUDACC__ +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) +#endif + +// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) -#elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#else -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default in CUDA +#undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 -#else -#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -102,21 +84,17 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (CUDA complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA -#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA -#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA +// SANITY CHECKS (c++ complex number implementation) +#ifndef __CUDACC__ +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL #endif #endif -// SANITY CHECKS (C++ complex number implementation) -#ifndef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ +// SANITY CHECKS (cuda complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif 
#endif @@ -153,7 +131,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -164,7 +142,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD +#ifdef __CUDACC__ // CUDA implementation has no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -194,9 +172,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -208,8 +186,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA/HIP declaration specifiers for C++ -#ifndef MGONGPUCPP_GPUIMPL +// Define empty CUDA declaration specifiers for C++ +#ifndef __CUDACC__ #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h index 4e7ab03fa2..0cb2f1db7e 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
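The empty __global__/__host__/__device__ definitions restored in the mgOnGpuConfig.h hunk above are what let the same source file compile either under nvcc or under a plain C++ compiler. As a standalone illustration of that single-source pattern (the square function and main are made up for this demo, not part of the codebase):

#ifndef __CUDACC__
#define __global__
#define __host__
#define __device__
#endif

// Compiles unchanged as a .cc file (the specifiers vanish) or as a .cu file (they apply).
__host__ __device__ inline double square( const double x ) { return x * x; }

int main() { return ( square( 3. ) == 9. ) ? 0 : 1; }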
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h index 6f6cee64d6..a1cde16a67 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
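The mgOnGpuCxtypes.h hunks above keep one concrete complex implementation per backend (thrust, cuComplex, std::complex or cxsmpl) behind the single alias cxtype, with cxmake as the uniform constructor. A minimal C++-only sketch of that indirection (std::complex branch only; the CUDA branches follow the same shape):

#include <complex>
#include <iostream>

typedef std::complex<double> cxtype; // stand-in for the MGONGPU_CPPCXTYPE_STDCOMPLEX choice

inline cxtype cxmake( const double r, const double i ) { return cxtype( r, i ); }

int main()
{
  const cxtype z = cxmake( 3., 4. );
  std::cout << z.real() << " + " << z.imag() << "i" << std::endl; // prints 3 + 4i
  return 0;
}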
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h index 7904b93c61..9d3e82b1e3 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/ee_mumu.mad/src/rambo.h b/epochX/cudacpp/ee_mumu.mad/src/rambo.h index cd7e1008ea..e02ea52496 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/rambo.h +++ b/epochX/cudacpp/ee_mumu.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
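In the mgOnGpuVectors.h hunk above, the "_sv" (scalar-or-vector) aliases collapse to plain scalars when neppV is 1 (the CUDA case, one event per thread) and to compiler vector extensions otherwise. A minimal sketch with a hard-coded 4-wide double vector; the real code derives the width from the SIMD build flags:

#include <iostream>

#ifdef __CUDACC__
typedef double fptype_sv; // scalar: one event per GPU thread
constexpr int neppV = 1;
#else
typedef double fptype_sv __attribute__( ( vector_size( 32 ) ) ); // 4 doubles per event page
constexpr int neppV = 4;
#endif

int main()
{
  fptype_sv p; // scalar or vector alike; all lanes written below before any read
  double* d = reinterpret_cast<double*>( &p );
  double sum = 0;
  for( int i = 0; i < neppV; i++ ) { d[i] = 1. + i; sum += d[i]; }
  std::cout << "sum over neppV=" << neppV << " lanes: " << sum << std::endl;
  return 0;
}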
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT index 84a883fbb0..a134b5fef9 100644 --- a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT +++ b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT @@ -15,7 +15,6 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h index c04628dfd1..4cafe0c997 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
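The rambo.h hunk above keeps its phase-space weight in log space: for nparf massless final-state particles with nparf != 2, wt = ( 2*nparf - 4 )*log( energy ) + z[nparf-1], and the warning block flags values that would under- or overflow exp(wt) in double precision. A small numeric sketch (all inputs and the upper bound are illustrative guesses, not taken from the hunk):

#include <cmath>
#include <cstdio>

int main()
{
  const int nparf = 3;        // hypothetical: three massless final-state particles
  const double energy = 91.2; // hypothetical: centre-of-mass energy
  const double zLast = -10.;  // hypothetical: z[nparf-1] from the recursion
  const double wt = ( 2. * nparf - 4. ) * std::log( energy ) + zLast; // nparf != 2 branch
  if( wt < -180. )
    printf( "too small weight: wt=%f, exp(wt) underflows a double\n", wt );
  else if( wt > 174. ) // assumed upper bound, mirroring the "too large" warning
    printf( "too large weight: wt=%f, exp(wt) overflows a double\n", wt );
  else
    printf( "log-weight wt=%f, weight exp(wt)=%g\n", wt, std::exp( wt ) );
  return 0;
}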
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ process.initProc( "../../Cards/param_card.dat" ); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> ) { - gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); } else { - gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template<typename Tin, typename Tout> __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc index 90c7f2d3b8..cef4cb3c71 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
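The gpu_sequence hunk above (and the dev_transposeMomentaF2C kernel it launches) converts Fortran-ordered momenta into the C++ AOSOA layout momenta[npagM][npar][np4][neppM] with nevt = npagM*neppM. A host-only sketch of that index mapping, with tiny made-up sizes:

#include <cstdio>
#include <vector>

int main()
{
  constexpr int npar = 4, np4 = 4, neppM = 2, nevt = 4; // demo sizes only
  constexpr int npagM = nevt / neppM;
  std::vector<double> in( nevt * npar * np4 ); // Fortran side, event-major: [ievt][ipar][ip4]
  std::vector<double> out( in.size() );        // C++ side, AOSOA: [ipagM][ipar][ip4][ieppM]
  for( size_t i = 0; i < in.size(); i++ ) in[i] = double( i );
  for( int ievt = 0; ievt < nevt; ievt++ )
  {
    const int ipagM = ievt / neppM; // page holding this event
    const int ieppM = ievt % neppM; // slot of this event within the page
    for( int ipar = 0; ipar < npar; ipar++ )
      for( int ip4 = 0; ip4 < np4; ip4++ )
        out[( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM] =
          in[( ievt * npar + ipar ) * np4 + ip4];
  }
  printf( "event 0, particle 0: E=%g px=%g (px sits neppM doubles after E)\n", out[0], out[neppM] );
  return 0;
}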
#include "BridgeKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -15,7 +14,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -46,7 +45,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -97,7 +96,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h index 3efef8ce97..15eb4bff4d 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc index 010bc4cbd0..985b39f576 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,16 +1,15 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" -#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc index c15b39844d..0b355a3c8d 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -78,7 +77,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -186,7 +185,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h index 4d9659e04e..7933ca4bbf 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h new file mode 100644 index 0000000000..64ce52f4b3 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_CUDARUNTIME_H +#define MG5AMC_CUDARUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include <cassert> +#include <iostream> + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef __CUDACC__ /* clang-format off */ +#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } +inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +{ + if( code != cudaSuccess ) + { + printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); + if( abort ) assert( code == cudaSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ +namespace mg5amcGpu +{ + // Instantiate a CudaRuntime at the beginning of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct CudaRuntime final + { + CudaRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~CudaRuntime() { tearDown( m_debug ); } + CudaRuntime( const CudaRuntime& ) = delete; + CudaRuntime( CudaRuntime&& ) = delete; + CudaRuntime& operator=( const CudaRuntime& ) = delete; + CudaRuntime& operator=( CudaRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; + checkCuda( cudaSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; + checkCuda( cudaDeviceReset() ); + } + }; + +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc index 38c477c17a..eb56333b03 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A.
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h index b425a5bade..48b51e0a49 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h index d2ff326e20..fd7734ce42 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..30257195b6 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkGpu( gpuPeekAtLastError() ); + checkCuda( cudaPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3.
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkCuda( cudaPeekAtLastError() ); + checkCuda( cudaDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h index 72bd8f195b..23e84757a2 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h index db73e4e064..c82a6c7635 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h index 38fade09fb..0ac4faa3c7 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h index 40cb089135..e2988d39f3 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
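KernelAccessHelper above computes the event index from the grid coordinates (one GPU thread per event), and the MemoryAccessMomenta hunk explains why neppM is tuned so that those per-thread loads coalesce. A minimal CUDA sketch combining both ideas (kernel name and sizes are hypothetical):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void readEnergies( const double* momenta, double* out, const int npar, const int np4, const int neppM )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
  const int ipagM = ievt / neppM;
  const int ieppM = ievt % neppM;
  // AOSOA element [ipagM][ipar=0][ip4=0][ieppM]: consecutive threads in a warp
  // read consecutive addresses, so the global-memory access coalesces.
  out[ievt] = momenta[( ( ipagM * npar + 0 ) * np4 + 0 ) * neppM + ieppM];
}

int main()
{
  constexpr int npar = 4, np4 = 4, neppM = 4, nevt = 8;
  double *momenta = nullptr, *out = nullptr;
  cudaMallocManaged( &momenta, nevt * npar * np4 * sizeof( double ) );
  cudaMallocManaged( &out, nevt * sizeof( double ) );
  for( int i = 0; i < nevt * npar * np4; i++ ) momenta[i] = double( i );
  readEnergies<<<2, 4>>>( momenta, out, npar, np4, neppM ); // 2 blocks x 4 threads = nevt
  cudaDeviceSynchronize();
  for( int ievt = 0; ievt < nevt; ievt++ ) printf( "E[%d]=%g\n", ievt, out[ievt] );
  cudaFree( momenta );
  cudaFree( out );
  return 0;
}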
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h index 08faccff0f..e9b197368e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h index 7756a71621..3093e6ed18 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
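MemoryAccessVectors.h above exists only for the CPU SIMD path: it reinterprets a suitably aligned scalar buffer as vectors of neppV values, which is exactly why the earlier mgOnGpuConfig.h hunk insists on the 64-byte cppAlign. A minimal sketch of that reinterpret_cast pattern (the 4-wide vector and buffer contents are made up):

#include <iostream>

typedef double fptype;
typedef fptype fptype_v __attribute__( ( vector_size( 32 ) ) ); // 4 doubles per vector

int main()
{
  alignas( 64 ) fptype buffer[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; // 64-byte aligned, as cppAlign requires
  const fptype_v* vecs = reinterpret_cast<const fptype_v*>( buffer ); // safe only because aligned
  std::cout << vecs[1][0] << " " << vecs[1][3] << std::endl; // second vector: prints 4 7
  return 0;
}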
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "Parameters_sm.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - gpuMallocHost( &( this->m_data ), this->bytes() ); + checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); } virtual ~PinnedHostBufferBase() { - gpuFreeHost( this->m_data ); + checkCuda( cudaFreeHost( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - gpuMalloc( &( this->m_data ), this->bytes() ); + checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); } virtual ~DeviceBufferBase() { - gpuFree( this->m_data ); + checkCuda( cudaFree( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for couplings constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template<class Tdst, class Tsrc> void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); } #endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 709a3d6cdf..8bbc9ba493 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v.
3.5.0_lo_vect, 2023-06-09 @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -45,7 +46,7 @@ // Class member functions for calculating the matrix elements for // Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -79,7 +80,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MZ, (fptype)Parameters_sm::mdl_WZ }; __device__ const fptype cIPC[6] = { (fptype)Parameters_sm::GC_3.real(), (fptype)Parameters_sm::GC_3.imag(), (fptype)Parameters_sm::GC_50.real(), (fptype)Parameters_sm::GC_50.imag(), (fptype)Parameters_sm::GC_59.real(), (fptype)Parameters_sm::GC_59.imag() }; #else -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype cIPC[6]; #else @@ -89,7 +90,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -117,13 +118,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -150,7 +151,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -186,7 +187,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -199,10 +200,8 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop -#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -239,7 +238,7 @@ namespace mg5amcCpu // *** DIAGRAM 1 OF 2 *** // Wavefunction(s) for diagram number 1 -#if not( defined MGONGPUCPP_GPUIMPL and defined MGONGPU_TEST_DIVERGENCE ) +#if not( defined __CUDACC__ and defined MGONGPU_TEST_DIVERGENCE ) opzxxx( momenta, cHel[ihel][0], -1, w_fp[0], 0 ); // NB: opzxxx only uses pz #else if( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 == 0 ) @@ -292,7 +291,7 @@ namespace 
@@ -117,13 +118,13 @@ namespace mg5amcCpu
     fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
 #endif
     fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
     , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
 #endif
     ) //ALWAYS_INLINE // attributes are not permitted in a function definition
   {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     using namespace mg5amcGpu;
     using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
     using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events
@@ -150,7 +151,7 @@ namespace mg5amcCpu
 #endif /* clang-format on */
     mgDebug( 0, __FUNCTION__ );
     //printf( "calculate_wavefunctions: ihel=%2d\n", ihel );
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
     //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 );
 #endif
@@ -186,7 +187,7 @@ namespace mg5amcCpu
 #endif
     for( int iParity = 0; iParity < nParity; ++iParity )
     { // START LOOP ON IPARITY
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
       const int ievt0 = ievt00 + iParity * neppV;
 #endif
       constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings
@@ -199,10 +200,8 @@ namespace mg5amcCpu
         allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event
       for( size_t iicoup = 0; iicoup < nicoup; iicoup++ )
         allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events
-#ifdef MGONGPUCPP_GPUIMPL
 #ifdef __CUDACC__
 #pragma nv_diagnostic pop
-#endif
       // CUDA kernels take input/output buffers with momenta/MEs for all events
       const fptype* momenta = allmomenta;
       const fptype* COUPs[nxcoup];
@@ -239,7 +238,7 @@ namespace mg5amcCpu
       // *** DIAGRAM 1 OF 2 ***
       // Wavefunction(s) for diagram number 1
-#if not( defined MGONGPUCPP_GPUIMPL and defined MGONGPU_TEST_DIVERGENCE )
+#if not( defined __CUDACC__ and defined MGONGPU_TEST_DIVERGENCE )
       opzxxx( momenta, cHel[ihel][0], -1, w_fp[0], 0 ); // NB: opzxxx only uses pz
 #else
       if( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 == 0 )
@@ -292,7 +291,7 @@ namespace mg5amcCpu
   // [NB do keep 'static' for these constexpr arrays, see issue #283]
   static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1]

-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
     // Pre-compute a constexpr triangular color matrix properly normalized #475
     struct TriangularNormalizedColorMatrix
     {
@@ -349,7 +348,7 @@ namespace mg5amcCpu
 #endif
     for( int icol = 0; icol < ncolor; icol++ )
     {
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
       // === C++ START ===
       // Diagonal terms
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
@@ -408,7 +407,7 @@ namespace mg5amcCpu
     MEs_sv_previous += deltaMEs_previous;
 #endif
     /*
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv );
 #else
 #ifdef MGONGPU_CPPSIMD
@@ -455,8 +454,8 @@ namespace mg5amcCpu
       { 1, -1, 1, 1 },
       { 1, -1, -1, -1 },
       { 1, -1, -1, 1 } };
-#ifdef MGONGPUCPP_GPUIMPL
-    gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) );
+#ifdef __CUDACC__
+    checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) );
 #else
     memcpy( cHel, tHel, ncomb * npar * sizeof( short ) );
 #endif
@@ -496,9 +495,9 @@ namespace mg5amcCpu
     // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory
     const fptype tIPD[2] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ };
     const cxtype tIPC[3] = { cxmake( m_pars->GC_3 ), cxmake( m_pars->GC_50 ), cxmake( m_pars->GC_59 ) };
-#ifdef MGONGPUCPP_GPUIMPL
-    gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) );
-    gpuMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) );
+#ifdef __CUDACC__
+    checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) );
+    checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ) );
 #else
     memcpy( cIPD, tIPD, 2 * sizeof( fptype ) );
     memcpy( cIPC, tIPC, 3 * sizeof( cxtype ) );
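The hunks above copy the helicity table and the independent parameters/couplings into CUDA constant memory with cudaMemcpyToSymbol. A self-contained sketch of that idiom, with illustrative names (error checking dropped for brevity; a plain cudaMemcpy cannot target a __constant__ symbol):

#include <cuda_runtime.h>

__device__ __constant__ short cTable[4]; // device-side constant-memory symbol

__global__ void readTable( short* out )
{
  out[threadIdx.x] = cTable[threadIdx.x]; // all threads read through the constant cache
}

int main()
{
  const short hTable[4] = { 1, -1, 1, -1 }; // host-side initial values
  cudaMemcpyToSymbol( cTable, hTable, sizeof( hTable ) ); // host-to-symbol copy
  short* dOut = nullptr;
  cudaMalloc( (void**)&dOut, sizeof( hTable ) );
  readTable<<<1, 4>>>( dOut );
  cudaDeviceSynchronize();
  cudaFree( dOut );
  return 0;
}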
@@ -535,7 +534,7 @@ namespace mg5amcCpu
   {
     std::stringstream out;
     // CUDA version (NVCC)
-    // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!]
+    // [Use __NVCC__ instead of __CUDACC__ here!]
     // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file]
     // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712]
 #ifdef __NVCC__
@@ -600,12 +599,12 @@ namespace mg5amcCpu
   __global__ void /* clang-format off */
   computeDependentCouplings( const fptype* allgs, // input: Gs[nevt]
                              fptype* allcouplings // output: couplings[nevt*ndcoup*2]
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
                              , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
                              ) /* clang-format on */
   {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     using namespace mg5amcGpu;
     using G_ACCESS = DeviceAccessGs;
     using C_ACCESS = DeviceAccessCouplings;
@@ -626,7 +625,7 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifdef __CUDACC__ /* clang-format off */
   __global__ void
   sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
@@ -755,9 +754,9 @@ namespace mg5amcCpu
         nGoodHel++;
       }
     }
-#ifdef MGONGPUCPP_GPUIMPL
-    gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) );
-    gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) );
+#ifdef __CUDACC__
+    checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) );
+    checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) );
 #else
     cNGoodHel = nGoodHel;
     for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel];
@@ -781,7 +780,7 @@ namespace mg5amcCpu
 #endif
             int* allselhel, // output: helicity selection[nevt]
             int* allselcol // output: color selection[nevt]
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
             , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
             ) /* clang-format on */
@@ -802,7 +801,7 @@ namespace mg5amcCpu
     // Denominators: spins, colors and identical particles
     constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343)

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     // Remember: in CUDA this is a kernel for one event, in c++ this processes n events
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
 #else
@@ -816,12 +815,9 @@ namespace mg5amcCpu
 #endif

     // Start sigmaKin_lines
-
-#include "GpuAbstraction.h"
-
     // === PART 0 - INITIALISATION (before calculate_wavefunctions) ===
     // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     allMEs[ievt] = 0;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     allNumerators[ievt] = 0;
@@ -849,7 +845,7 @@ namespace mg5amcCpu
     // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS ===
     // (in both CUDA and C++, using precomputed good helicities)

-#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
+#ifdef __CUDACC__ // CUDA OR C++

     // *** START OF PART 1a - CUDA (one event per CPU thread) ***
     // Running sum of partial amplitudes squared for event by event color selection (#402)
@@ -1053,7 +1049,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
     // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     allMEs[ievt] /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt];
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h
index ebbc2800d3..08d6c29e7b 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h
@@ -4,7 +4,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
 // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09
@@ -25,7 +25,7 @@

 //--------------------------------------------------------------------------

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -107,7 +107,7 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   __global__ void
   computeDependentCouplings( const fptype* allgs, // input: Gs[nevt]
                              fptype* allcouplings ); // output: couplings[nevt*ndcoup*2]
@@ -120,7 +120,7 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifdef __CUDACC__ /* clang-format off */
   __global__ void
   sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
@@ -150,7 +150,7 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifdef __CUDACC__ /* clang-format off */
   __global__ void
   sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4]
             const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h
new file mode 120000
index 0000000000..ce9e1a487a
--- /dev/null
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h
@@ -0,0 +1 @@
+../CudaRuntime.h
\ No newline at end of file
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc
index 1bad694d1c..f1e75b9252 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc
@@ -4,7 +4,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================

 #include "mgOnGpuConfig.h"
@@ -12,7 +12,6 @@
 #include "BridgeKernels.h"
 #include "CPPProcess.h"
 #include "CrossSectionKernels.h"
-#include "GpuRuntime.h"
 #include "MatrixElementKernels.h"
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
@@ -64,7 +63,7 @@ usage( char* argv0, int ret = 1 )
   std::cout << std::endl;
   std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl;
   std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl;
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 #ifdef _OPENMP
   std::cout << std::endl;
   std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl;
@@ -78,7 +77,7 @@
 int
 main( int argc, char** argv )
 {
   // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...)
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   using namespace mg5amcGpu;
 #else
   using namespace mg5amcCpu;
@@ -104,11 +103,11 @@ main( int argc, char** argv )
     CurandDevice = 2
   };
 #ifdef __CUDACC__
-  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU
+  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU
 #elif not defined MGONGPU_HAS_NO_CURAND
   RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
 #else
-  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand
 #endif
   // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
   enum class RamboSamplingMode
   {
@@ -116,7 +115,7 @@ main( int argc, char** argv )
     RamboHost = 1,
     RamboDevice = 2
   };
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
 #else
   RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
@@ -149,7 +148,7 @@ main( int argc, char** argv )
 #ifdef __CUDACC__
       rndgen = RandomNumberMode::CurandDevice;
 #else
-      throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
 #endif
     }
     else if( arg == "--curhst" )
@@ -166,7 +165,7 @@ main( int argc, char** argv )
     }
     else if( arg == "--rmbdev" )
     {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
       rmbsmp = RamboSamplingMode::RamboDevice;
 #else
       throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -240,13 +239,13 @@ main( int argc, char** argv )
     return usage( argv[0] );
   }

-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 #ifdef _OPENMP
   ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif

-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
   // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
   // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
   // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -264,14 +263,14 @@ main( int argc, char** argv )

   // === STEP 0 - INITIALISE

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__

-  // --- 00. Initialise GPU
-  // Instantiate a GpuRuntime at the beginnining of the application's main.
-  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
-  const std::string cdinKey = "00 GpuInit";
+  // --- 00. Initialise cuda
+  // Instantiate a CudaRuntime at the beginning of the application's main to
+  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
+  const std::string cdinKey = "00 CudaInit";
   timermap.start( cdinKey );
-  GpuRuntime GpuRuntime( debug );
+  CudaRuntime cudaRuntime( debug );
 #endif

   // --- 0a. Initialise physics process
@@ -293,7 +292,7 @@ main( int argc, char** argv )
   timermap.start( alloKey );

   // Memory buffers for random numbers for momenta
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
   HostBufferRndNumMomenta hstRndmom( nevt );
 #else
   PinnedHostBufferRndNumMomenta hstRndmom( nevt );
@@ -301,7 +300,7 @@ main( int argc, char** argv )
 #endif

   // Memory buffers for sampling weights
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
   HostBufferWeights hstWeights( nevt );
 #else
   PinnedHostBufferWeights hstWeights( nevt );
@@ -309,7 +308,7 @@ main( int argc, char** argv )
 #endif

   // Memory buffers for momenta
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
   HostBufferMomenta hstMomenta( nevt );
 #else
   PinnedHostBufferMomenta hstMomenta( nevt );
@@ -317,7 +316,7 @@ main( int argc, char** argv )
 #endif

   // Memory buffers for Gs
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
   HostBufferGs hstGs( nevt );
 #else
   PinnedHostBufferGs hstGs( nevt );
@@ -334,7 +333,7 @@ main( int argc, char** argv )
   }

   // Memory buffers for matrix elements
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
   HostBufferMatrixElements hstMatrixElements( nevt );
 #else
   PinnedHostBufferMatrixElements hstMatrixElements( nevt );
@@ -343,7 +342,7 @@ main( int argc, char** argv )

   // Memory buffers for random numbers for helicity selection
   // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) ***
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
   HostBufferRndNumHelicity hstRndHel( nevt );
 #else
   PinnedHostBufferRndNumHelicity hstRndHel( nevt );
@@ -352,7 +351,7 @@ main( int argc, char** argv )

   // Memory buffers for random numbers for color selection
   // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) ***
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
   HostBufferRndNumColor hstRndCol( nevt );
 #else
   PinnedHostBufferRndNumColor hstRndCol( nevt );
@@ -360,7 +359,7 @@ main( int argc, char** argv )
 #endif

   // Memory buffers for helicity selection
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
   HostBufferSelectedHelicity hstSelHel( nevt );
 #else
   PinnedHostBufferSelectedHelicity hstSelHel( nevt );
@@ -368,7 +367,7 @@ main( int argc, char** argv )
 #endif

   // Memory buffers for color selection
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
   HostBufferSelectedColor hstSelCol( nevt );
 #else
   PinnedHostBufferSelectedColor hstSelCol( nevt );
@@ -404,7 +403,7 @@ main( int argc, char** argv )
 #else
   else
   {
-    throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement)
+    throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
   }
 #endif
 #else
@@ -422,7 +421,7 @@ main( int argc, char** argv )
   }
   else
   {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) );
 #else
     throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
@@ -433,7 +432,7 @@ main( int argc, char** argv )
   std::unique_ptr pmek;
   if( !bridge )
   {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) );
 #else
     pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -441,7 +440,7 @@ main( int argc, char** argv )
   }
   else
   {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) );
 #else
     pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -483,7 +482,7 @@ main( int argc, char** argv )
     prnk->generateRnarray();
     //std::cout << "Got random numbers" << std::endl;

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
     {
       // --- 1c. Copy rndmom from host to device
@@ -515,7 +514,7 @@ main( int argc, char** argv )
     prsk->getMomentaFinal();
     //std::cout << "Got final momenta" << std::endl;

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     if( rmbsmp == RamboSamplingMode::RamboDevice )
     {
       // --- 2c. CopyDToH Weights
@@ -560,7 +559,7 @@ main( int argc, char** argv )
       dynamic_cast( pmek.get() )->transposeInputMomentaC2F();
     }

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     // --- 2d. CopyHToD Momenta
     const std::string gKey = "0.. CpHTDg";
     rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER!
@@ -589,7 +588,7 @@ main( int argc, char** argv )
     wv3atime += timermap.stop(); // calc only
     wavetime += wv3atime; // calc plus copy

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     if( !bridge )
     {
       // --- 3b. CopyDToH MEs
@@ -732,19 +731,15 @@ main( int argc, char** argv )
     rndgentxt = "CURAND DEVICE";
 #ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
-#elif defined __HIPCC__
-  rndgentxt += " (HIP code)";
 #else
   rndgentxt += " (C++ code)";
 #endif

   // Workflow description summary
   std::string wrkflwtxt;
-  // -- CUDA or HIP or C++?
+  // -- CUDA or C++?
 #ifdef __CUDACC__
   wrkflwtxt += "CUD:";
-#elif defined __HIPCC__
-  wrkflwtxt += "HIP:";
 #else
   wrkflwtxt += "CPP:";
 #endif
@@ -769,12 +764,6 @@ main( int argc, char** argv )
 #else
   wrkflwtxt += "???:"; // no path to this statement
 #endif
-#elif defined __HIPCC__
-#if defined MGONGPU_CUCXTYPE_CXSMPL
-  wrkflwtxt += "CXS:";
-#else
-  wrkflwtxt += "???:"; // no path to this statement
-#endif
 #else
 #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX
   wrkflwtxt += "STX:";
@@ -800,7 +789,7 @@ main( int argc, char** argv )
     wrkflwtxt += "RMBDEV+";
   else
     wrkflwtxt += "??????+"; // no path to this statement
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE?
   if( !bridge )
     wrkflwtxt += "MESDEV";
@@ -856,7 +845,7 @@ main( int argc, char** argv )

   if( perf )
   {
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 #ifdef _OPENMP
     // Get the output of "nproc --all" (https://stackoverflow.com/a/478960)
     std::string nprocall;
@@ -877,8 +866,6 @@ main( int argc, char** argv )
     std::cout << std::string( SEP79, '*' ) << std::endl
 #ifdef __CUDACC__
               << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA"
-#elif defined __HIPCC__
-              << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP"
 #else
               << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP"
 #endif
@@ -905,21 +892,21 @@ main( int argc, char** argv )
 #elif defined MGONGPU_FPTYPE_FLOAT
               << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl
 #endif
+#ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
               << "Complex type = CUCOMPLEX" << std::endl
 #elif defined MGONGPU_CUCXTYPE_THRUST
               << "Complex type = THRUST::COMPLEX" << std::endl
-#elif defined MGONGPU_CUCXTYPE_CXSMPL
-              << "Complex type = STD::COMPLEX" << std::endl
+#endif
 #else
-              << "Complex type = ???" << std::endl // no path to this statement...
+              << "Complex type = STD::COMPLEX" << std::endl
 #endif
               << "RanNumb memory layout = AOSOA[" << neppR << "]"
               << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl
               << "Momenta memory layout = AOSOA[" << neppM << "]"
               << ( neppM == 1 ? " == AOS" : "" ) << std::endl
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
               //<< "Wavefunction GPU memory = LOCAL" << std::endl
 #else
 #if !defined MGONGPU_CPPSIMD
@@ -950,7 +937,7 @@ main( int argc, char** argv )
 #endif
 #endif
               << "Random number generation = " << rndgentxt << std::endl
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 #ifdef _OPENMP
               << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline
 #endif
@@ -1046,14 +1033,14 @@ main( int argc, char** argv )
               << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl
 #endif
               << "\"Complex type\": "
+#ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
               << "\"CUCOMPLEX\"," << std::endl
 #elif defined MGONGPU_CUCXTYPE_THRUST
               << "\"THRUST::COMPLEX\"," << std::endl
-#elif defined MGONGPU_CUCXTYPE_CXSMPL
-              << "\"STD::COMPLEX\"," << std::endl
+#endif
 #else
-              << "\"???\"," << std::endl // no path to this statement...
+              << "\"STD::COMPLEX\"," << std::endl
 #endif
               << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\""
               << ( neppR == 1 ? " == AOS" : "" ) << ", " << std::endl
               << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\""
               << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
               //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl
 #endif
               << "\"Curand generation\": "
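The "AOSOA[...]" labels in the printouts above describe the array-of-structs-of-arrays layout used for random numbers (neppR events per page) and momenta (neppM events per page). A rough sketch of the indexing, with illustrative constants and helper name (the real MemoryAccessMomenta classes encapsulate this arithmetic):

#include <cstddef>

constexpr int npar = 4; // particles per event (e+ e- > mu+ mu-)
constexpr int np4 = 4;  // E, px, py, pz
constexpr int nepp = 8; // events per "page" (illustrative SIMD/coalescing width)

inline std::size_t ixMomenta( int ievt, int ipar, int ip4 )
{
  const int ipag = ievt / nepp; // page index
  const int iepp = ievt % nepp; // event index within the page
  return ( (std::size_t)( ipag * npar + ipar ) * np4 + ip4 ) * nepp + iepp;
}
// Keeping each momentum component contiguous across nepp consecutive events
// gives coalesced GPU loads and vectorisable CPU loops.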
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc
index 79abbcc4f8..da68aa9255 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

 #include "RamboSamplingKernels.h"

-#include "GpuRuntime.h"
+#include "CudaRuntime.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessRandomNumbers.h"
 #include "MemoryAccessWeights.h"
@@ -14,7 +14,7 @@
 #include

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -92,7 +92,7 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy
                                                         const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1]
                                                         BufferMomenta& momenta, // output: momenta
@@ -135,7 +135,7 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   __global__ void
   getMomentaInitialDevice( const fptype energy,
                            fptype* momenta )
@@ -147,17 +147,17 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   void
   RamboSamplingKernelDevice::getMomentaInitial()
   {
-    gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() );
+    getMomentaInitialDevice<<<m_gpublocks, m_gputhreads>>>( m_energy, m_momenta.data() );
   }
 #endif

   //--------------------------------------------------------------------------

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   __global__ void
   getMomentaFinalDevice( const fptype energy,
                          const fptype* rndmom,
@@ -171,11 +171,11 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   void
   RamboSamplingKernelDevice::getMomentaFinal()
   {
-    gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() );
+    getMomentaFinalDevice<<<m_gpublocks, m_gputhreads>>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() );
   }
 #endif
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h
index 7c214cd74b..184089efd7 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

 #ifndef RAMBOSAMPLINGKERNELS_H
 #define RAMBOSAMPLINGKERNELS_H 1
@@ -10,7 +10,7 @@

 #include "MemoryBuffers.h"

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -93,7 +93,7 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   // A class encapsulating RAMBO phase space sampling on a GPU device
   class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents
   {
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h
index 21d63beeac..188a72c2c9 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h
@@ -1,14 +1,14 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

 #ifndef RANDOMNUMBERKERNELS_H
 #define RANDOMNUMBERKERNELS_H 1

 #include "mgOnGpuConfig.h"

-// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined
+// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined
 #ifndef MGONGPU_HAS_NO_CURAND
 //#include "curand.h"
 struct curandGenerator_st; // forward definition from curand.h
@@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h

 #include "MemoryBuffers.h"

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk
index bcb73d7f01..a0397e9ecc 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk
@@ -1,7 +1,7 @@
 # Copyright (C) 2020-2023 CERN and UCLouvain.
 # Licensed under the GNU Lesser General Public License (version 3 or later).
 # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.

 #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
 #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories
@@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p)
 #=== Configure common compiler flags for C++ and CUDA

 INCFLAGS = -I.
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

 # Dependency on src directory
 MG5AMC_COMMONLIB = mg5amc_common
@@ -89,139 +89,69 @@ endif

 #-------------------------------------------------------------------------------

-CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler")
-HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler")
-
-ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc)
-  #=== Configure the CUDA compiler
-
-  # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505)
-  # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below
-  ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
-    $(warning CUDA builds are not supported for multi-word CXX "$(CXX)")
-    override CUDA_HOME=disabled
-  endif
-
-  # If CUDA_HOME is not set, try to set it from the location of NVCC
-  ifndef CUDA_HOME
-    CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
-    $(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
-  endif
-
-  # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists
-  ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
-    GPUCC = $(CUDA_HOME)/bin/nvcc
-    USE_NVTX ?=-DUSE_NVTX
-    # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
-    # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
-    # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster).
-    # Embed device code for 70, and PTX for 70+.
-    # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533).
-    # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity).
-    MADGRAPH_CUDA_ARCHITECTURE ?= 70
-    ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533
-    ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533
-    comma:=,
-    CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch))
-    CUINC = -I$(CUDA_HOME)/include/
-    CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
-    CUOPTFLAGS = -lineinfo
-    GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
-    ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
-    ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
-    GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h
-    # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
-    ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
-    ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
-    ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
-    ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
-
-    CUBUILDRULEFLAGS = -Xcompiler -fPIC -c
-    CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu
-
-    CUDATESTFLAGS = -lcuda
-
-  else ifneq ($(origin REQUIRE_CUDA),undefined)
-    # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443)
-    $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH))
-  else
-    # No cuda. Switch cuda compilation off and go to common random numbers in C++
-    $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
-    override GPUCC=
-    override USE_NVTX=
-    override CUINC=
-    override CURANDLIBFLAGS=
-  endif
-
-  # Set the host C++ compiler for GPUCC via "-ccbin "
-  # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
-  GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
-
-  # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-  ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-    GPUFLAGS += -allow-unsupported-compiler
-  endif
-
-else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc)
-  #=== Configure the HIP compiler
-
-  # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505)
-  # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below
-  ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
-    $(warning CUDA builds are not supported for multi-word CXX "$(CXX)")
-    override CUDA_HOME=disabled
-  endif
-
-  # If HIP_HOME is not set, try to set it from the location of GPUCC
-  ifndef HIP_HOME
-    HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH))
-    $(warning HIP_HOME was not set: using "$(HIP_HOME)")
-  endif
-
-  # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists
-  ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
-    GPUCC = $(HIP_HOME)/bin/hipcc
-
-    # Should maybe find something equivelant to this in HIP
-    #USE_NVTX ?=-DUSE_NVTX
-
-    HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
-    HIPINC = -I$(HIP_HOME)/include/
-
-    # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP
-    # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
-    GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
-    ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
-    GPUFLAGS += -std=c++17
-    # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
-    ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
-    ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
-    ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
-    ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
-
-    CUBUILDRULEFLAGS = -fPIC -c
-    CCBUILDRULEFLAGS = -fPIC -c
-
-  else ifneq ($(origin REQUIRE_HIP),undefined)
-    # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443)
-    $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
-  else
-    # No hip. Switch hip compilation off and go to common random numbers in C++
-    $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
-    override GPUCC=
-    override USE_NVTX=
-    override CUINC=
-    override CURANDLIBFLAGS=
-  endif
+#=== Configure the CUDA compiler
+
+# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505)
+# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below
+ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
+  $(warning CUDA builds are not supported for multi-word CXX "$(CXX)")
+  override CUDA_HOME=disabled
+endif
+
+# If CUDA_HOME is not set, try to set it from the location of nvcc
+ifndef CUDA_HOME
+  CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
+  $(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
+endif
+
+# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists
+ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
+  NVCC = $(CUDA_HOME)/bin/nvcc
+  USE_NVTX ?=-DUSE_NVTX
+  # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
+  # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+  # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster).
+  # Embed device code for 70, and PTX for 70+.
+  # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533).
+  # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity).
+  MADGRAPH_CUDA_ARCHITECTURE ?= 70
+  ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533
+  ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533
+  comma:=,
+  CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch))
+  CUINC = -I$(CUDA_HOME)/include/
+  CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
+  CUOPTFLAGS = -lineinfo
+  CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
+  ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+  ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
+  CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h
+  # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
+  ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+  ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+  ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+  ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+else ifneq ($(origin REQUIRE_CUDA),undefined)
+  # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443)
+  $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH))
+else
+  # No cuda. Switch cuda compilation off and go to common random numbers in C++
+  $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
+  override NVCC=
+  override USE_NVTX=
+  override CUINC=
+  override CURANDLIBFLAGS=
+endif
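As a worked example of the $(foreach ...) expansion above (an illustrative note, not part of the patch): exporting MADGRAPH_CUDA_ARCHITECTURE=70,80 makes CUARCHFLAGS expand to '-gencode arch=compute_70,code=compute_70 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=compute_80 -gencode arch=compute_80,code=sm_80', i.e. embedded SASS (code=sm_X) for each listed compute capability plus PTX (code=compute_X) that newer GPUs can JIT-compile at load time.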
-  # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-  ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-    GPUFLAGS += -allow-unsupported-compiler
-  endif
+# Set the host C++ compiler for nvcc via "-ccbin "
+# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
+CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))

+# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+CUFLAGS += -allow-unsupported-compiler
 endif
-
 #-------------------------------------------------------------------------------

 #=== Configure ccache for C++ and CUDA builds
 #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
 #  override AR:=ccache $(AR)
 #endif
-ifneq ($(GPUCC),)
-  ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
-    override GPUCC:=ccache $(GPUCC)
+ifneq ($(NVCC),)
+  ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
+    override NVCC:=ccache $(NVCC)
   endif
 endif

@@ -259,7 +189,7 @@
 # PowerPC-specific CUDA compiler flags (to be reviewed!)
 ifeq ($(UNAME_P),ppc64le)
-  GPUFLAGS+= -Xcompiler -mno-float128
+  CUFLAGS+= -Xcompiler -mno-float128
 endif

 #-------------------------------------------------------------------------------

@@ -269,10 +199,10 @@ endif
 # Set the default OMPFLAGS choice
 ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
 override OMPFLAGS = -fopenmp
-###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578)
+###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578)
 else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),)
 override OMPFLAGS = -fopenmp
-###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578)
+###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578)
 else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),)
 override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578)
 else
@@ -323,10 +253,7 @@ endif
 # Set the default RNDGEN (random number generator) choice
 ifeq ($(RNDGEN),)
-  ifeq ($(GPUCC),)
-    override RNDGEN = hasNoCurand
-  # Edgecase for HIP compilation
-  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  ifeq ($(NVCC),)
     override RNDGEN = hasNoCurand
   else ifeq ($(RNDGEN),)
     override RNDGEN = hasCurand
@@ -401,13 +328,13 @@ CXXFLAGS+= $(AVXFLAGS)
 $(info FPTYPE=$(FPTYPE))
 ifeq ($(FPTYPE),d)
   CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
-  GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
+  CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
 else ifeq ($(FPTYPE),f)
   CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
-  GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
+  CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
 else ifeq ($(FPTYPE),m)
   CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
-  GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
+  CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
 else
   $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported)
 endif
@@ -416,7 +343,7 @@ endif
 $(info HELINL=$(HELINL))
 ifeq ($(HELINL),1)
   CXXFLAGS += -DMGONGPU_INLINE_HELAMPS
-  GPUFLAGS += -DMGONGPU_INLINE_HELAMPS
+  CUFLAGS += -DMGONGPU_INLINE_HELAMPS
 else ifneq ($(HELINL),0)
   $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported)
 endif
@@ -425,7 +352,7 @@ endif
 $(info HRDCOD=$(HRDCOD))
 ifeq ($(HRDCOD),1)
   CXXFLAGS += -DMGONGPU_HARDCODE_PARAM
-  GPUFLAGS += -DMGONGPU_HARDCODE_PARAM
+  CUFLAGS += -DMGONGPU_HARDCODE_PARAM
 else ifneq ($(HRDCOD),0)
   $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported)
 endif
@@ -477,11 +404,11 @@
 ifeq ($(UNAME_S),Darwin)
   override CULIBFLAGSRPATH2 =
 else
   # RPATH to cuda/cpp libs when linking executables
-  override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH)
-  override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH)
+  override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH)
+  override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH)
   # RPATH to common lib when linking cuda/cpp libs
-  override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN'
-  override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN'
+  override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN'
+  override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN'
 endif

 # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac)
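A note on the rpath hunk above: at load time the dynamic linker replaces $ORIGIN with the directory containing the object being loaded, so the executables and shared libraries installed side by side find each other without LD_LIBRARY_PATH; the $$ doubling escapes the dollar sign from make, and the single quotes protect it from the shell. The comma spelling '-Wl,-rpath,<dir>' (and '-Xlinker -rpath,<dir>' through nvcc) hands '-rpath <dir>' to the host linker as two arguments; our understanding is that the '-rpath=<dir>' form being replaced here is accepted by GNU ld but not by the macOS linker, which matches the Darwin-related context of this hunk.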
@@ -494,7 +421,7 @@
 override RUNTIME =

 cxx_main=$(BUILDDIR)/check.exe
 fcxx_main=$(BUILDDIR)/fcheck.exe

-ifneq ($(GPUCC),)
+ifneq ($(NVCC),)
 cu_main=$(BUILDDIR)/gcheck.exe
 fcu_main=$(BUILDDIR)/fgcheck.exe
 else
@@ -525,16 +452,15 @@
 $(BUILDDIR)/.build.$(TAG):
	@touch $(BUILDDIR)/.build.$(TAG)

 # Generic target and build rules: objects from CUDA compilation
-ifneq ($(GPUCC),)
+ifneq ($(NVCC),)
 $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@
+	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@

 $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
+	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
 endif
-# -x cu in line above

 # Generic target and build rules: objects from C++ compilation
 # (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -543,14 +469,11 @@
 $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@

 # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117)
-# Added edgecase for HIP compilation
 ifeq ($(shell $(CXX) --version | grep ^nvc++),)
 $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
 $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
-  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
-else
-  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
+ifneq ($(NVCC),)
+$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
 endif
 endif

@@ -566,10 +489,10 @@
 ifeq ($(RNDGEN),hasCurand)
 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
 endif

-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592)
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592)
 ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),)
-ifneq ($(GPUCC),)
-GPUFLAGS += -Wno-deprecated-builtins
+ifneq ($(NVCC),)
+CUFLAGS += -Xcompiler -Wno-deprecated-builtins
 endif
 endif

@@ -577,8 +500,8 @@
 # This patch does remove the warning, but I prefer to keep it disabled for the moment...
 ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),)
 ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option
-###ifneq ($(GPUCC),)
-###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option
+###ifneq ($(NVCC),)
+###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option
 ###endif
 ###endif

@@ -605,7 +528,7 @@
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
 cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o
-ifneq ($(GPUCC),)
+ifneq ($(NVCC),)
 MG5AMC_CULIB = mg5amc_$(processid_short)_cuda
 cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o
 cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o
@@ -617,11 +540,11 @@
 $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o
 $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib)
	$(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)

-ifneq ($(GPUCC),)
+ifneq ($(NVCC),)
 $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o
 $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o
 $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib)
-	$(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+	$(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 endif

#-------------------------------------------------------------------------------

@@ -638,16 +561,16 @@
 $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o
	$(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS)

-ifneq ($(GPUCC),)
+ifneq ($(NVCC),)
 ifneq ($(shell $(CXX) --version | grep ^Intel),)
-$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy')
-$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9')
+$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy')
+$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9')
 else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc
 endif
 $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o
-	$(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS)
+	$(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS)
 endif
#-------------------------------------------------------------------------------

@@ -673,17 +596,17 @@
 $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe)
	$(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe)

-ifneq ($(GPUCC),)
+ifneq ($(NVCC),)
 ifneq ($(shell $(CXX) --version | grep ^Intel),)
-$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy')
-$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9')
+$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy')
+$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9')
 endif
 ifeq ($(UNAME_S),Darwin)
 $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375
 endif
 $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe)
-	$(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe)
+	$(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe)
 endif

#-------------------------------------------------------------------------------

@@ -695,7 +618,7 @@
 $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt
 $(testmain): $(BUILDDIR)/testxxx.o
 $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions
-ifneq ($(GPUCC),)
+ifneq ($(NVCC),)
 $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS)
 $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC)
 $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt
@@ -708,7 +631,7 @@
 $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC)
 $(testmain): $(BUILDDIR)/testmisc.o
 $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests
-ifneq ($(GPUCC),)
+ifneq ($(NVCC),)
 $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS)
 $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC)
 $(testmain): $(BUILDDIR)/testmisc_cu.o
@@ -720,12 +643,12 @@
 $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC)
 $(testmain): $(BUILDDIR)/runTest.o
 $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o
-ifneq ($(GPUCC),)
+ifneq ($(NVCC),)
 $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS)
 $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC)
 ifneq ($(shell $(CXX) --version | grep ^Intel),)
-$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy')
-$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9')
+$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy')
+$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9')
 else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc
 endif
@@ -749,14 +672,14 @@
 $(testmain): LIBFLAGS += -lgomp
 endif
 endif

-ifeq ($(GPUCC),) # link only runTest.o
+ifeq ($(NVCC),) # link only runTest.o
 $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
 else # link both runTest.o and runTest_cu.o
 $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
+	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
 endif

 # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215
@@ -859,9 +782,9 @@
 ifeq ($(USECCACHE),1)
	ccache --version | head -1
 endif
	@echo ""
-	@echo GPUCC=$(GPUCC)
-ifneq ($(GPUCC),)
-	$(GPUCC) --version
+	@echo NVCC=$(NVCC)
+ifneq ($(NVCC),)
+	$(NVCC) --version
 endif
	@echo ""
	@echo CXX=$(CXX)
@@ -880,7 +803,7 @@

 # Target: check (run the C++ test executable)
 # [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(GPUCC),)
+ifneq ($(NVCC),)
 check: runTest cmpFcheck cmpFGcheck
 else
 check: runTest cmpFcheck
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc
index 2b956730d4..f93c05b0b3 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "GpuRuntime.h"
+#include "CudaRuntime.h"

 extern "C"
 {
@@ -22,7 +22,7 @@
   * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
   * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
   */
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   using namespace mg5amcGpu;
 #else
   using namespace mg5amcCpu;
@@ -46,8 +46,8 @@
   */
   void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
   {
-#ifdef MGONGPUCPP_GPUIMPL
-    GpuRuntime::setUp();
+#ifdef __CUDACC__
+    CudaRuntime::setUp();
 #endif
     // Create a process object, read parm card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
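Because the Bridge lifetime here is driven by explicit Fortran calls rather than by C++ scope, fbridge pairs CudaRuntime::setUp() in fbridgecreate_ with CudaRuntime::tearDown() in fbridgedelete_ (below), instead of the RAII instance that check_sa.cc creates in main. A minimal sketch of that split, assuming the same cudaSetDevice/cudaDeviceReset pairing (the class name is illustrative, not the actual CudaRuntime.h implementation):

#include <cuda_runtime.h>

// Illustrative sketch: explicit set-up/tear-down entry points whose timing is
// controlled by the Fortran caller, not by a C++ constructor/destructor pair.
struct MyGpuRuntime
{
  static void setUp() { cudaSetDevice( 0 ); }   // called when the Fortran side creates the bridge
  static void tearDown() { cudaDeviceReset(); } // called when the Fortran side deletes the bridge
};

Static methods fit this use case because construction and destruction order is dictated by the Fortran program, so there is no C++ object whose scope could safely own the device.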
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::tearDown(); +#ifdef __CUDACC__ + CudaRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc index 3743934f41..2fb445372d 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc index 461ec5c3a5..572e28aaea 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc index 2bd7a9fcf9..989aba1fdc 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc index 6e8657edca..4243e9fcec 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h index c2c572778b..fe9cb24d88 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc index 89bbb57a0d..daed91bb80 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h index 5e20ca27b5..852861ced0 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h @@ -210,7 +210,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -234,7 +234,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -252,7 +252,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h index d4e37d19b3..6c0c4919e9 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,26 +10,13 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL -// Is this a GPU (CUDA, HIP) or CPU implementation? -#ifdef __CUDACC__ -#define MGONGPUCPP_GPUIMPL cuda -#elif defined __HIPCC__ -#define MGONGPUCPP_GPUIMPL hip -#else -#undef MGONGPUCPP_GPUIMPL -#endif - // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For CUDA, by default, it is supported -// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND -#elif defined __HIPCC__ -#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -65,28 +52,23 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#ifndef __CUDACC__ +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) +#endif + +// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) -#elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#else -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default in CUDA +#undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 -#else -#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -102,21 +84,17 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (CUDA complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA -#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA -#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA +// SANITY CHECKS (c++ complex number implementation) +#ifndef __CUDACC__ +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL #endif #endif -// SANITY CHECKS (C++ complex number implementation) -#ifndef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ +// SANITY CHECKS (cuda complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif 
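// A sketch (an assumption here; see mgOnGpuCxtypes.h later in this patch) of
// what the backend choices above resolve to; fptype is double or float
// according to MGONGPU_FPTYPE_DOUBLE/MGONGPU_FPTYPE_FLOAT.
//   #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST
//   typedef thrust::complex<fptype> cxtype; // CUDA default backend
//   #elif not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_CXSMPL
//   typedef mgOnGpu::cxsmpl<fptype> cxtype; // C++ default backend
//   #endif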
#endif @@ -153,7 +131,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -164,7 +142,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD +#ifdef __CUDACC__ // CUDA implementation has no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -194,9 +172,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -208,8 +186,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA/HIP declaration specifiers for C++ -#ifndef MGONGPUCPP_GPUIMPL +// Define empty CUDA declaration specifiers for C++ +#ifndef __CUDACC__ #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h index 4e7ab03fa2..0cb2f1db7e 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
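// A minimal sketch of the cppAlign requirement above (buf and nevt are
// hypothetical): an fptype array must be 64-byte aligned before it may be
// reinterpret_cast to the SIMD vector type fptype_v of mgOnGpuVectors.h.
//   #include <new>
//   fptype* buf = new( std::align_val_t( mgOnGpu::cppAlign ) ) fptype[nevt]; // C++17 aligned new
//   fptype_v& v0 = *reinterpret_cast<fptype_v*>( buf ); // safe only because buf honours cppAlign
//   // with a misaligned buffer the same cast may segfault, as warned above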
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h index 6f6cee64d6..a1cde16a67 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
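// An illustrative usage sketch (cI and amp are hypothetical): the
// cxmake( re, im ) helpers above give physics code one backend-neutral way to
// build complex values, whatever concrete type cxtype resolves to.
//   const cxtype cI  = cxmake( 0., 1. );       // the imaginary unit on any backend
//   const cxtype amp = cxmake( 2., -3. ) + cI; // backend operators resolved at compile time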
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h index 7904b93c61..9d3e82b1e3 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/ee_mumu.sa/src/rambo.h b/epochX/cudacpp/ee_mumu.sa/src/rambo.h index cd7e1008ea..e02ea52496 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/rambo.h +++ b/epochX/cudacpp/ee_mumu.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
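// A sketch of the scalar-or-vector idiom above (mass2_sv is a hypothetical
// helper): the same arithmetic processes one event per GPU thread when
// fptype_sv is fptype, and neppV events per iteration when it is fptype_v.
//   inline fptype_sv mass2_sv( const fptype_sv& e, const fptype_sv& px,
//                              const fptype_sv& py, const fptype_sv& pz )
//   {
//     return e * e - px * px - py * py - pz * pz; // 1 event (CUDA) or neppV events (SIMD C++)
//   }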
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_tt.mad/COPYRIGHT b/epochX/cudacpp/gg_tt.mad/COPYRIGHT index 84a883fbb0..a134b5fef9 100644 --- a/epochX/cudacpp/gg_tt.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.mad/COPYRIGHT @@ -15,7 +15,6 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc index 5597c614b0..59e590217d 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV1_1.o FFV1_0.o FFV1_2.o VVV1P0_1.o +ALOHARoutine = FFV1_1.o FFV1_2.o VVV1P0_1.o FFV1_0.o diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index c04628dfd1..4cafe0c997 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ process.initProc( "../../Cards/param_card.dat" ); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); } else { - gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc index 90c7f2d3b8..cef4cb3c71 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
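// An illustrative sketch of the F2C momenta transposition above, assuming the
// Fortran-side layout momenta(ip4,ipar,ievt) and the AOSOA target layout
// out[ipagM][ipar][ip4][ieppM] documented in Bridge.h; index names are local
// to this sketch, with ievt = ipagM * neppM + ieppM.
//   const int ipagM = ievt / neppM;
//   const int ieppM = ievt % neppM;
//   out[( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM] =
//     in[( ievt * npar + ipar ) * np4 + ip4];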
#include "BridgeKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -15,7 +14,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -46,7 +45,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -97,7 +96,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h index 3efef8ce97..15eb4bff4d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc index 010bc4cbd0..985b39f576 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,16 +1,15 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" -#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc index c15b39844d..0b355a3c8d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -78,7 +77,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -186,7 +185,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h index 4d9659e04e..7933ca4bbf 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h new file mode 100644 index 0000000000..64ce52f4b3 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_CUDARUNTIME_H +#define MG5AMC_CUDARUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef __CUDACC__ /* clang-format off */ +#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } +inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +{ + if( code != cudaSuccess ) + { + printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); + if( abort ) assert( code == cudaSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ +namespace mg5amcGpu +{ + // Instantiate a CudaRuntime at the beginning of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct CudaRuntime final + { + CudaRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~CudaRuntime() { tearDown( m_debug ); } + CudaRuntime( const CudaRuntime& ) = delete; + CudaRuntime( CudaRuntime&& ) = delete; + CudaRuntime& operator=( const CudaRuntime& ) = delete; + CudaRuntime& operator=( CudaRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; + checkCuda( cudaSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; + checkCuda( cudaDeviceReset() ); + } + }; + +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc index 38c477c17a..eb56333b03 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A.
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h index b425a5bade..48b51e0a49 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h index d2ff326e20..fd7734ce42 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..30257195b6 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. 
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkGpu( gpuPeekAtLastError() ); + checkCuda( cudaPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkCuda( cudaPeekAtLastError() ); + checkCuda( cudaDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h index 72bd8f195b..23e84757a2 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h index db73e4e064..c82a6c7635 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h index 38fade09fb..0ac4faa3c7 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as compile-time constants: see issue #23) -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h index 40cb089135..e2988d39f3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
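// An illustrative sketch of why neppM drives coalescing (momenta, ipar and ip4
// are hypothetical here): with one event per GPU thread, neighbouring threads
// ievt and ievt+1 read neighbouring ieppM slots of the same AOSOA page, so a
// warp's loads collapse into a few wide memory transactions.
//   const int ievt  = blockDim.x * blockIdx.x + threadIdx.x; // event index, as in KernelAccessHelper
//   const int ipagM = ievt / neppM;
//   const int ieppM = ievt % neppM;
//   const fptype e  = momenta[( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM];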
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h index 08faccff0f..e9b197368e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h index 7756a71621..3093e6ed18 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "Parameters_sm.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - gpuMallocHost( &( this->m_data ), this->bytes() ); + checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); } virtual ~PinnedHostBufferBase() { - gpuFreeHost( this->m_data ); + checkCuda( cudaFreeHost( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - gpuMalloc( &( this->m_data ), this->bytes() ); + checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); } virtual ~DeviceBufferBase() { - gpuFree( this->m_data ); + checkCuda( cudaFree( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 62fa7f0088..a4cc98e6b1 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -45,7 +46,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -79,7 +80,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -89,7 +90,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -117,13 +118,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -150,7 +151,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -186,7 +187,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -199,10 +200,8 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop -#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -303,7 +302,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -360,7 +359,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -419,7 +418,7 @@ namespace mg5amcCpu MEs_sv_previous += 
deltaMEs_previous; #endif /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -466,8 +465,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -507,9 +506,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); - //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -545,7 +544,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] + // [Use __NVCC__ instead of __CUDACC__ here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -610,12 +609,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -636,7 +635,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -765,9 +764,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); - gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); + checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -791,7 +790,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -812,7 +811,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // 
assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -826,12 +825,9 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - -#include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -859,7 +855,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ +#ifdef __CUDACC__ // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1063,7 +1059,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 5a6e96d9e8..51f966d10f 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h new file mode 120000 index 0000000000..ce9e1a487a --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h @@ -0,0 +1 @@ +../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc index 1bad694d1c..f1e75b9252 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,7 +12,6 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" -#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -64,7 +63,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -78,7 +77,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
-#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -104,11 +103,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -116,7 +115,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -149,7 +148,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs" ); #endif } else if( arg == "--curhst" ) @@ -166,7 +165,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -240,13 +239,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -264,14 +263,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ - // --- 00. Initialise GPU - // Instantiate a GpuRuntime at the beginnining of the application's main. - // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. - const std::string cdinKey = "00 GpuInit"; + // --- 00. Initialise cuda + // Instantiate a CudaRuntime at the beginnining of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + const std::string cdinKey = "00 CudaInit"; timermap.start( cdinKey ); - GpuRuntime GpuRuntime( debug ); + CudaRuntime cudaRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -293,7 +292,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -301,7 +300,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -309,7 +308,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -317,7 +316,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -334,7 +333,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -343,7 +342,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -352,7 +351,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -360,7 +359,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -368,7 +367,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -404,7 +403,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -422,7 +421,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -433,7 +432,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -441,7 +440,7 
@@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -483,7 +482,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -515,7 +514,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -560,7 +559,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -589,7 +588,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( !bridge ) { // --- 3b. CopyDToH MEs @@ -732,19 +731,15 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; -#elif defined __HIPCC__ - rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or HIP or C++? + // -- CUDA or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; -#elif defined __HIPCC__ - wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -769,12 +764,6 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif -#elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL - wrkflwtxt += "CXS:"; -#else - wrkflwtxt += "???:"; // no path to this statement -#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -800,7 +789,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -856,7 +845,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -877,8 +866,6 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" -#elif defined __HIPCC__ - << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -905,21 +892,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "Complex type = STD::COMPLEX" << std::endl +#endif #else - << "Complex type = ???" 
<< std::endl // no path to this statement... + << "Complex type = STD::COMPLEX" << std::endl #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -950,7 +937,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1046,14 +1033,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "\"STD::COMPLEX\"," << std::endl +#endif #else - << "\"???\"," << std::endl // no path to this statement... + << "\"STD::COMPLEX\"," << std::endl #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1061,7 +1048,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc index 79abbcc4f8..da68aa9255 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
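NB (editorial annotation, not part of the patch): the RamboSamplingKernels.cc hunks below drop the generic gpuLaunchKernel wrapper in favour of CUDA's native triple-chevron launch syntax. A minimal sketch of the equivalence in CUDA C++, assuming the removed wrapper was a thin variadic macro (the actual GpuRuntime.h definition may differ):

// Assumed wrapper, for illustration only (not the repository's code):
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )

__global__ void demoKernel( double* buf ) { buf[threadIdx.x] = 1.; }

void launchBothWays( int gpublocks, int gputhreads, double* devBuf )
{
  gpuLaunchKernel( demoKernel, gpublocks, gputhreads, devBuf ); // abstracted launch (removed by this patch)
  demoKernel<<<gpublocks, gputhreads>>>( devBuf );              // native CUDA launch (restored by this patch)
}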
#include "RamboSamplingKernels.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaInitial() { - gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); + getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaFinal() { - gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h index 7c214cd74b..184089efd7 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h index 21d63beeac..188a72c2c9 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index bcb73d7f01..a0397e9ecc 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,139 +89,69 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If CUDA_HOME is not set, try to set it from the location of NVCC - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif - - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - - CUDATESTFLAGS = -lcuda - - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif - - # Set the host C++ compiler for GPUCC via "-ccbin " - # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) - GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif - -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif - - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 
6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +#=== Configure the CUDA compiler + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) +# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the location of nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + NVCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
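NB (editorial annotation, not part of the patch): MADGRAPH_CUDA_ARCHITECTURE above must match the compute capability of the target GPU (70 for V100, 60 for P100, 80 for A100, as the comments note). A self-contained CUDA C++ helper to query that value at runtime, illustrative only (all names are ours):

#include <cstdio>
#include <cuda_runtime.h>

// Print the compute capability of device 0, e.g. "sm_70" on a V100:
// the digits are the value to export as MADGRAPH_CUDA_ARCHITECTURE.
int main()
{
  cudaDeviceProp prop;
  if( cudaGetDeviceProperties( &prop, 0 ) != cudaSuccess ) return 1;
  std::printf( "sm_%d%d\n", prop.major, prop.minor );
  return 0;
}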
+ CUOPTFLAGS = -lineinfo + CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) +else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) +else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override NVCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= +endif - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif +# Set the host C++ compiler for nvcc via "-ccbin " +# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) +CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) +# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) +ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) +CUFLAGS += -allow-unsupported-compiler endif - #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -233,9 +163,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) - override GPUCC:=ccache $(GPUCC) +ifneq ($(NVCC),) + ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) + override NVCC:=ccache $(NVCC) endif endif @@ -259,7 +189,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - GPUFLAGS+= -Xcompiler -mno-float128 + CUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -269,10 +199,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -323,10 +253,7 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + ifeq ($(NVCC),) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -401,13 +328,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -416,7 +343,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + CUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -425,7 +352,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + CUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -477,11 +404,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -494,7 +421,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) +ifneq ($(NVCC),) 
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -525,16 +452,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -543,14 +469,11 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) -# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifeq ($(findstring nvcc,$(GPUCC)),nvcc) - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math -else - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math +ifneq ($(NVCC),) +$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math endif endif @@ -566,10 +489,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(GPUCC),) -GPUFLAGS += -Wno-deprecated-builtins +ifneq ($(NVCC),) +CUFLAGS += -Xcompiler -Wno-deprecated-builtins endif endif @@ -577,8 +500,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(NVCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -605,7 +528,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -617,11 +540,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -638,16 +561,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
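NB (editorial annotation, not part of the patch): the makefile above deliberately keeps CrossSectionKernels out of fast-math builds (-fno-fast-math, see #117) even though device code is compiled with -use_fast_math. One classic reason for such a carve-out is compensated summation, which depends on floating-point non-associativity that fast-math options are free to optimise away; the C++ sketch below is illustrative only and does not claim to be the actual code behind #117:

#include <cstdio>

// Kahan-compensated accumulation: 'comp' recovers the low-order bits lost
// in each addition. Under -ffast-math the compiler may reassociate these
// expressions and fold 'comp' to zero, silently reverting to naive accuracy.
double kahanSum( const double* x, int n )
{
  double sum = 0., comp = 0.;
  for( int i = 0; i < n; i++ )
  {
    const double y = x[i] - comp; // subtract the error carried from the previous step
    const double t = sum + y;     // the low-order bits of y can be lost here
    comp = ( t - sum ) - y;       // capture exactly what was lost
    sum = t;
  }
  return sum;
}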
#------------------------------------------------------------------------------- @@ -673,17 +596,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -695,7 +618,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -708,7 +631,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -720,12 +643,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -749,14 +672,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
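NB (editorial annotation, not part of the patch): CudaRuntime.h, restored throughout this commit, provides both the CudaRuntime::setUp()/tearDown() pair called from fbridgecreate_/fbridgedelete_ and the checkCuda guard wrapped around every CUDA API call in the buffer and kernel code above. A minimal sketch of such a guard in CUDA C++ (the real header may differ in details):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort with a readable message if a CUDA runtime call did not succeed.
inline void assertCuda( cudaError_t code, const char* file, int line )
{
  if( code != cudaSuccess )
  {
    std::fprintf( stderr, "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), (int)code, file, line );
    std::exit( EXIT_FAILURE );
  }
}
#define checkCuda( code ) assertCuda( code, __FILE__, __LINE__ )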
($(GPUCC),) # link only runTest.o +ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -859,9 +782,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo GPUCC=$(GPUCC) -ifneq ($(GPUCC),) - $(GPUCC) --version + @echo NVCC=$(NVCC) +ifneq ($(NVCC),) + $(NVCC) --version endif @echo "" @echo CXX=$(CXX) @@ -880,7 +803,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(GPUCC),) +ifneq ($(NVCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc index 2b956730d4..f93c05b0b3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::setUp(); +#ifdef __CUDACC__ + CudaRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
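//--------------------------------------------------------------------------
// A minimal sketch of this extern "C" lifecycle as seen from a C++ caller
// (in production the caller is Fortran MadEvent). The event count below is
// hypothetical; npar/np4 must match CPPProcess::npar and CPPProcess::np4,
// and the include is assumed to expose CppObjectInFortran and the fbridge
// prototypes declared in this file.
//
//   #include "Bridge.h"
//   int main()
//   {
//     CppObjectInFortran* bridge = nullptr;
//     const int nevt = 16384, npar = 4, np4 = 4;
//     fbridgecreate_( &bridge, &nevt, &npar, &np4 ); // sets up the CUDA runtime on GPU builds
//     // ... fill momenta/gs/random-number buffers, call fbridgesequence_ once per batch ...
//     fbridgedelete_( &bridge ); // deletes the Bridge and tears down the CUDA runtime
//     return 0;
//   }
//--------------------------------------------------------------------------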
@@ -69,8 +69,8 @@ extern "C" Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::tearDown(); +#ifdef __CUDACC__ + CudaRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc index 3743934f41..2fb445372d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc index 461ec5c3a5..572e28aaea 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc index 2bd7a9fcf9..989aba1fdc 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc index 6e8657edca..4243e9fcec 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h index f7ecb29537..bc2adb6258 100644 --- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc index 459dae9e99..7255e49119 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index db5520aa96..c935779eb3 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -211,7 +211,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -238,7 +238,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -254,7 +254,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index 390766116b..881353abac 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,26 +10,13 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 -// Is this a GPU (CUDA, HIP) or CPU implementation? -#ifdef __CUDACC__ -#define MGONGPUCPP_GPUIMPL cuda -#elif defined __HIPCC__ -#define MGONGPUCPP_GPUIMPL hip -#else -#undef MGONGPUCPP_GPUIMPL -#endif - // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For CUDA, by default, it is supported -// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND -#elif defined __HIPCC__ -#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -65,28 +52,23 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#ifndef __CUDACC__ +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) +#endif + +// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) -#elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#else -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default in CUDA +#undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 -#else -#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -102,21 +84,17 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (CUDA complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA -#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA -#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA +// SANITY CHECKS (c++ complex number implementation) +#ifndef __CUDACC__ +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL #endif #endif -// SANITY CHECKS (C++ complex number implementation) -#ifndef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ +// SANITY CHECKS (cuda complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif 
#endif @@ -153,7 +131,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -164,7 +142,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD +#ifdef __CUDACC__ // CUDA implementation has no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -194,9 +172,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -208,8 +186,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA/HIP declaration specifiers for C++ -#ifndef MGONGPUCPP_GPUIMPL +// Define empty CUDA declaration specifiers for C++ +#ifndef __CUDACC__ #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h index 4e7ab03fa2..0cb2f1db7e 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
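//--------------------------------------------------------------------------
// A short sketch of why the cppAlign=64 requirement above matters in the
// C++ SIMD branch: reinterpret_cast into the vector type is only safe on
// aligned memory. Hypothetical helper, assuming MGONGPU_CPPSIMD is defined
// so that fptype_v and neppV are available from mgOnGpuVectors.h:
//
//   #include "mgOnGpuVectors.h"
//   void doubleAll( fptype* buffer, int nevt ) // buffer must be 64-byte (cppAlign) aligned
//   {
//     fptype_v* vbuffer = reinterpret_cast<fptype_v*>( buffer ); // may segfault if misaligned!
//     for( int i = 0; i < nevt / neppV; i++ ) vbuffer[i] = vbuffer[i] + vbuffer[i];
//   }
//--------------------------------------------------------------------------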
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h index 6f6cee64d6..a1cde16a67 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
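//--------------------------------------------------------------------------
// A brief sketch of the cxmake abstraction above: building complex values
// only through cxmake lets the same source compile against thrust::complex,
// cuComplex, std::complex or cxsmpl, whichever the build selected.
// Hypothetical helper (cxmake( re, im ) as used in the overloads above):
//
//   #include "mgOnGpuCxtypes.h"
//   __host__ __device__ inline cxtype timesI( const cxtype& z )
//   {
//     return cxmake( 0, 1 ) * z; // multiply by the imaginary unit in any backend
//   }
//--------------------------------------------------------------------------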
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h index 7904b93c61..9d3e82b1e3 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt.mad/src/rambo.h b/epochX/cudacpp/gg_tt.mad/src/rambo.h index cd7e1008ea..e02ea52496 100644 --- a/epochX/cudacpp/gg_tt.mad/src/rambo.h +++ b/epochX/cudacpp/gg_tt.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
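//--------------------------------------------------------------------------
// A small sketch of the scalar-or-vector ("_sv") types above: the same
// source line processes one event per GPU thread in CUDA (scalar cxtype)
// and neppV events per loop iteration in C++ (SIMD cxtype_v). Hypothetical
// helper, assuming a vector overload of cxternary alongside the scalar one
// shown above:
//
//   inline cxtype_sv selectAmp( const bool_sv& mask, const cxtype_sv& a, const cxtype_sv& b )
//   {
//     return cxternary( mask, a, b ); // a '?:' that works for scalars and SIMD vectors alike
//   }
//--------------------------------------------------------------------------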
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_tt.sa/COPYRIGHT b/epochX/cudacpp/gg_tt.sa/COPYRIGHT index 84a883fbb0..a134b5fef9 100644 --- a/epochX/cudacpp/gg_tt.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.sa/COPYRIGHT @@ -15,7 +15,6 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h index c04628dfd1..4cafe0c997 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template<typename Tin, typename Tout> __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ template<typename Tin, typename Tout> void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge<FORTRANFPTYPE>::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ process.initProc( "../../Cards/param_card.dat" ); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> ) { - gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); } else { - gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template<typename Tin, typename Tout> __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc index 90c7f2d3b8..cef4cb3c71 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
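//--------------------------------------------------------------------------
// A condensed sketch of how a host caller drives the templated Bridge
// above, assuming double precision, flat caller-side arrays, and the same
// argument order as the fbridgesequence_ implementation earlier in this
// patch (channelId=0 is a hypothetical choice):
//
//   #include "Bridge.h"
//   void runBatch( const double* momenta, const double* gs, const double* rndhel,
//                  const double* rndcol, double* mes, int* selhel, int* selcol,
//                  unsigned int nevt )
//   {
//     mg5amcCpu::Bridge<double> bridge( nevt, 4, 4 ); // npar=4, np4=4 must match CPPProcess
//     bridge.cpu_sequence( momenta, gs, rndhel, rndcol, 0, mes, selhel, selcol );
//   }
//--------------------------------------------------------------------------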
#include "BridgeKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -15,7 +14,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -46,7 +45,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -97,7 +96,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h index 3efef8ce97..15eb4bff4d 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc index 010bc4cbd0..985b39f576 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,16 +1,15 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" -#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc index c15b39844d..0b355a3c8d 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -78,7 +77,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -186,7 +185,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h index 4d9659e04e..7933ca4bbf 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h new file mode 100644 index 0000000000..64ce52f4b3 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_CUDARUNTIME_H +#define MG5AMC_CUDARUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef __CUDACC__ /* clang-format off */ +#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } +inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +{ + if( code != cudaSuccess ) + { + printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); + if( abort ) assert( code == cudaSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ +namespace mg5amcGpu +{ + // Instantiate a CudaRuntime at the beginning of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct CudaRuntime final + { + CudaRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~CudaRuntime() { tearDown( m_debug ); } + CudaRuntime( const CudaRuntime& ) = delete; + CudaRuntime( CudaRuntime&& ) = delete; + CudaRuntime& operator=( const CudaRuntime& ) = delete; + CudaRuntime& operator=( CudaRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; + checkCuda( cudaSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; + checkCuda( cudaDeviceReset() ); + } + }; + +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc index 38c477c17a..eb56333b03 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h index b425a5bade..48b51e0a49 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h index d2ff326e20..fd7734ce42 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..30257195b6 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. 
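//--------------------------------------------------------------------------
// A minimal sketch of the CudaRuntime/checkCuda pattern introduced above,
// for a hypothetical standalone CUDA main:
//
//   #include "CudaRuntime.h"
//   int main()
//   {
//     mg5amcGpu::CudaRuntime cudaRuntime; // ctor calls cudaSetDevice(0), dtor cudaDeviceReset()
//     double* devBuf = nullptr;
//     checkCuda( cudaMalloc( &devBuf, 1024 * sizeof( double ) ) ); // reports file:line on error
//     checkCuda( cudaFree( devBuf ) );
//     return 0;
//   }
//--------------------------------------------------------------------------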
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkGpu( gpuPeekAtLastError() ); + checkCuda( cudaPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkCuda( cudaPeekAtLastError() ); + checkCuda( cudaDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h index 72bd8f195b..23e84757a2 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h index db73e4e064..c82a6c7635 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h index 38fade09fb..0ac4faa3c7 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h index 40cb089135..e2988d39f3 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
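//--------------------------------------------------------------------------
// A worked sketch of the AOSOA[npagM][npar][np4][neppM] momenta layout used
// by the access classes above: for event ievt, particle ipar and momentum
// component ip4 (npar=4 and np4=4 as in CPPProcess, neppM as defined above),
// the flat index decomposes as follows (hypothetical helper):
//
//   inline int aosoaIndex( int ievt, int ipar, int ip4, int neppM, int npar = 4, int np4 = 4 )
//   {
//     const int ipagM = ievt / neppM; // "page" index
//     const int ieppM = ievt % neppM; // event index within the page
//     return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
//   }
//--------------------------------------------------------------------------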
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h index 08faccff0f..e9b197368e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h index 7756a71621..3093e6ed18 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
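//--------------------------------------------------------------------------
// A short sketch contrasting the three buffer families defined below:
// pageable host memory (HostBuffer), page-locked host memory allocated via
// cudaMallocHost (PinnedHostBuffer, faster DMA to the device), and device
// memory allocated via cudaMalloc (DeviceBuffer). A standalone analogue:
//
//   #include <cuda_runtime.h>
//   void copyDemo( size_t bytes )
//   {
//     double *pinned = nullptr, *device = nullptr;
//     cudaMallocHost( &pinned, bytes ); // page-locked: fast host<->device copies
//     cudaMalloc( &device, bytes );
//     cudaMemcpy( device, pinned, bytes, cudaMemcpyHostToDevice );
//     cudaFree( device );
//     cudaFreeHost( pinned );
//   }
//--------------------------------------------------------------------------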
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "Parameters_sm.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - gpuMallocHost( &( this->m_data ), this->bytes() ); + checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); } virtual ~PinnedHostBufferBase() { - gpuFreeHost( this->m_data ); + checkCuda( cudaFreeHost( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - gpuMalloc( &( this->m_data ), this->bytes() ); + checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); } virtual ~DeviceBufferBase() { - gpuFree( this->m_data ); + checkCuda( cudaFree( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); } #endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index b0d93e9401..327b69d008 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -45,7 +46,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -79,7 +80,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -89,7 +90,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -117,13 +118,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -150,7 +151,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -186,7 +187,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -199,10 +200,8 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop -#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -300,7 +299,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -357,7 +356,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -416,7 +415,7 @@ namespace mg5amcCpu MEs_sv_previous += 
deltaMEs_previous; #endif /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -463,8 +462,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -504,9 +503,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); - //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -542,7 +541,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] + // [Use __NVCC__ instead of __CUDACC__ here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -607,12 +606,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -633,7 +632,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -762,9 +761,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); - gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); + checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -788,7 +787,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -809,7 +808,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // 
assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -823,12 +822,9 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - -#include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -856,7 +852,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ +#ifdef __CUDACC__ // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1060,7 +1056,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index 5a6e96d9e8..51f966d10f 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h new file mode 120000 index 0000000000..ce9e1a487a --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h @@ -0,0 +1 @@ +../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc index 1bad694d1c..f1e75b9252 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,7 +12,6 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" -#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -64,7 +63,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -78,7 +77,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
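The MemoryBuffers.h and CPPProcess.cc hunks above reinstate explicit checkCuda( ... ) guards around every CUDA runtime call (cudaMallocHost, cudaMemcpy, cudaMemcpyToSymbol, ...) in place of the generic gpu* wrappers. Below is a minimal sketch of such a guard and of a guarded copy to constant memory, to be compiled with nvcc; it assumes a checkCuda macro along the lines of the one in CudaRuntime.h, whose exact implementation may differ.

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define checkCuda( code ) assertCuda( code, __FILE__, __LINE__ )
inline void assertCuda( cudaError_t code, const char* file, int line )
{
  if( code != cudaSuccess )
  {
    printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), (int)code, file, line );
    exit( (int)code );
  }
}

__device__ __constant__ int cExample; // illustrative constant-memory variable (not from the plugin)

void copyToConstantMemory( const int value )
{
  // Fail fast, with file and line information, if the copy to constant memory fails
  checkCuda( cudaMemcpyToSymbol( cExample, &value, sizeof( int ) ) );
}

Wrapping every call this way trades a little verbosity for an immediate, localized error report instead of a silent failure that surfaces much later.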
-#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -104,11 +103,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -116,7 +115,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -149,7 +148,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs" ); #endif } else if( arg == "--curhst" ) @@ -166,7 +165,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -240,13 +239,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -264,14 +263,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ - // --- 00. Initialise GPU - // Instantiate a GpuRuntime at the beginnining of the application's main. - // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. - const std::string cdinKey = "00 GpuInit"; + // --- 00. Initialise cuda + // Instantiate a CudaRuntime at the beginnining of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + const std::string cdinKey = "00 CudaInit"; timermap.start( cdinKey ); - GpuRuntime GpuRuntime( debug ); + CudaRuntime cudaRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -293,7 +292,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -301,7 +300,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -309,7 +308,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -317,7 +316,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -334,7 +333,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -343,7 +342,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -352,7 +351,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -360,7 +359,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -368,7 +367,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -404,7 +403,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -422,7 +421,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -433,7 +432,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -441,7 +440,7 
@@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -483,7 +482,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -515,7 +514,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -560,7 +559,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -589,7 +588,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( !bridge ) { // --- 3b. CopyDToH MEs @@ -732,19 +731,15 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; -#elif defined __HIPCC__ - rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or HIP or C++? + // -- CUDA or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; -#elif defined __HIPCC__ - wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -769,12 +764,6 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif -#elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL - wrkflwtxt += "CXS:"; -#else - wrkflwtxt += "???:"; // no path to this statement -#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -800,7 +789,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -856,7 +845,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -877,8 +866,6 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" -#elif defined __HIPCC__ - << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -905,21 +892,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "Complex type = STD::COMPLEX" << std::endl +#endif #else - << "Complex type = ???" 
<< std::endl // no path to this statement... + << "Complex type = STD::COMPLEX" << std::endl #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -950,7 +937,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1046,14 +1033,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "\"STD::COMPLEX\"," << std::endl +#endif #else - << "\"???\"," << std::endl // no path to this statement... + << "\"STD::COMPLEX\"," << std::endl #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1061,7 +1048,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc index 79abbcc4f8..da68aa9255 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
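The "Complex type" printouts above report which complex-number backend was chosen at compile time. For orientation, a hedged illustration of the selection chain they describe: the macro names match those tested in the printouts, but the typedefs here are a sketch rather than the plugin's actual mgOnGpuCxtypes.h.

#ifdef __CUDACC__
#if defined MGONGPU_CUCXTYPE_CUCOMPLEX
#include <cuComplex.h>
typedef cuDoubleComplex cxtype; // CUDA build: cuComplex backend
#elif defined MGONGPU_CUCXTYPE_THRUST
#include <thrust/complex.h>
typedef thrust::complex<double> cxtype; // CUDA build: thrust::complex backend
#endif
#else
#include <complex>
typedef std::complex<double> cxtype; // C++ build: std::complex backend
#endif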
#include "RamboSamplingKernels.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaInitial() { - gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); + getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaFinal() { - gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h index 7c214cd74b..184089efd7 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h index 21d63beeac..188a72c2c9 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index bcb73d7f01..a0397e9ecc 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,139 +89,69 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If CUDA_HOME is not set, try to set it from the location of NVCC - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif - - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - - CUDATESTFLAGS = -lcuda - - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif - - # Set the host C++ compiler for GPUCC via "-ccbin " - # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) - GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif - -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif - - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 
6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +#=== Configure the CUDA compiler + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) +# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the location of nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + NVCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+ CUOPTFLAGS = -lineinfo + CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) +else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) +else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override NVCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= +endif - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif +# Set the host C++ compiler for nvcc via "-ccbin " +# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) +CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) +# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) +ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) +CUFLAGS += -allow-unsupported-compiler endif - #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -233,9 +163,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) - override GPUCC:=ccache $(GPUCC) +ifneq ($(NVCC),) + ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) + override NVCC:=ccache $(NVCC) endif endif @@ -259,7 +189,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - GPUFLAGS+= -Xcompiler -mno-float128 + CUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -269,10 +199,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -323,10 +253,7 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + ifeq ($(NVCC),) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -401,13 +328,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -416,7 +343,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + CUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -425,7 +352,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + CUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -477,11 +404,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -494,7 +421,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) +ifneq ($(NVCC),) 
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -525,16 +452,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -543,14 +469,11 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) -# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifeq ($(findstring nvcc,$(GPUCC)),nvcc) - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math -else - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math +ifneq ($(NVCC),) +$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math endif endif @@ -566,10 +489,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(GPUCC),) -GPUFLAGS += -Wno-deprecated-builtins +ifneq ($(NVCC),) +CUFLAGS += -Xcompiler -Wno-deprecated-builtins endif endif @@ -577,8 +500,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(NVCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -605,7 +528,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -617,11 +540,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -638,16 +561,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
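For context on the `%_cu.o : %.cc` build rule restored above: the same .cc translation unit is compiled twice, once by $(CXX) as plain C++ and once by $(NVCC) with `-x cu` as CUDA. A minimal sketch of the source-level pattern this relies on (the namespace names are the ones used by the plugin sources in the hunks below; everything else is illustrative):

```cpp
// One source file, two objects: compiled by $(CXX) this lands in mg5amcCpu,
// while $(NVCC) -x cu defines __CUDACC__ and selects mg5amcGpu instead.
#include "mgOnGpuConfig.h"
#ifdef __CUDACC__
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  // identical source code compiled for both the CUDA and the C++ backends
}
```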
#------------------------------------------------------------------------------- @@ -673,17 +596,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -695,7 +618,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -708,7 +631,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -720,12 +643,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -749,14 +672,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(GPUCC),) # link only runTest.o
+ifeq ($(NVCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
+	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
endif

# Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215
@@ -859,9 +782,9 @@ ifeq ($(USECCACHE),1)
	ccache --version | head -1
endif
	@echo ""
-	@echo GPUCC=$(GPUCC)
-ifneq ($(GPUCC),)
-	$(GPUCC) --version
+	@echo NVCC=$(NVCC)
+ifneq ($(NVCC),)
+	$(NVCC) --version
endif
	@echo ""
	@echo CXX=$(CXX)
@@ -880,7 +803,7 @@ endif

# Target: check (run the C++ test executable)
# [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(GPUCC),)
+ifneq ($(NVCC),)
check: runTest cmpFcheck cmpFGcheck
else
check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc
index 2b956730d4..f93c05b0b3 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "GpuRuntime.h"
+#include "CudaRuntime.h"

 extern "C"
 {
@@ -22,7 +22,7 @@ extern "C"
  * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
  * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
  */
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  using namespace mg5amcGpu;
#else
  using namespace mg5amcCpu;
@@ -46,8 +46,8 @@ extern "C"
  */
  void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
  {
-#ifdef MGONGPUCPP_GPUIMPL
-    GpuRuntime::setUp();
+#ifdef __CUDACC__
+    CudaRuntime::setUp();
#endif
    // Create a process object, read param card and set parameters
    // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
@@ -69,8 +69,8 @@ extern "C"
    Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
    if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" );
    delete pbridge;
-#ifdef MGONGPUCPP_GPUIMPL
-    GpuRuntime::tearDown();
+#ifdef __CUDACC__
+    CudaRuntime::tearDown();
#endif
  }

@@ -100,7 +100,7 @@ extern "C"
  {
    Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
    if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
    // Use the device/GPU implementation in the CUDA library
    // (there is also a host implementation in this library)
    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol );
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc
index 3743934f41..2fb445372d 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #include "mgOnGpuConfig.h"

@@ -13,7 +13,7 @@

 //--------------------------------------------------------------------------

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -40,7 +40,7 @@ namespace mg5amcCpu
  private:
    const int m_nevt; // The number of events in each iteration
    int m_iiter; // The iteration counter (for random number seeding)
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
    HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
    HostBufferMomenta m_hstMomenta; // Memory buffers for momenta
    HostBufferWeights m_hstWeights; // Memory buffers for sampling weights
@@ -105,7 +105,7 @@ namespace mg5amcCpu

extern "C"
{
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  using namespace mg5amcGpu;
#else
  using namespace mg5amcCpu;
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc
index 461ec5c3a5..572e28aaea 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
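To make the Fortran-facing interface above concrete, here is a hypothetical minimal C++ driver calling the fbridge entry points (the create/delete signatures are the ones shown in the fbridge.cc hunks above; the include, the argument values, and the elided fbridgesequence_ call are illustrative assumptions):

```cpp
#include "Bridge.h" // assumption: provides CppObjectInFortran, as in fbridge.cc

extern "C" void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F );
extern "C" void fbridgedelete_( CppObjectInFortran** ppbridge );

void exampleDriver()
{
  CppObjectInFortran* bridge = nullptr;
  const int nevt = 16384, npar = 4, np4 = 4;     // illustrative values
  fbridgecreate_( &bridge, &nevt, &npar, &np4 ); // creates the Bridge, calls CudaRuntime::setUp()
  // ... fill momenta and call fbridgesequence_ to compute matrix elements ...
  fbridgedelete_( &bridge );                     // deletes the Bridge, calls CudaRuntime::tearDown()
}
```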
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc index 2bd7a9fcf9..989aba1fdc 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc index 6e8657edca..4243e9fcec 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
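The TESTID macros in the testmisc.cc hunk above (and the testxxx.cc hunks below) rely on token pasting so that the CPU and CUDA compilation phases, which are linked into a single runTest.exe, register googletest suites under different names. A condensed sketch (the process tag value is hypothetical; the real one comes from MG_EPOCH_PROCESS_ID):

```cpp
#ifdef __CUDACC__
#define TESTID( s ) s##_GPU_MISC
#else
#define TESTID( s ) s##_CPU_MISC
#endif
// With a hypothetical process id SIGMA_SM_GG_TTX:
//   CUDA phase: TESTID( SIGMA_SM_GG_TTX ) -> SIGMA_SM_GG_TTX_GPU_MISC
//   C++ phase:  TESTID( SIGMA_SM_GG_TTX ) -> SIGMA_SM_GG_TTX_CPU_MISC
```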
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h index f7ecb29537..bc2adb6258 100644 --- a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc index 459dae9e99..7255e49119 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h index db5520aa96..c935779eb3 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h @@ -211,7 +211,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -238,7 +238,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -254,7 +254,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h index d4e37d19b3..6c0c4919e9 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,26 +10,13 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL -// Is this a GPU (CUDA, HIP) or CPU implementation? -#ifdef __CUDACC__ -#define MGONGPUCPP_GPUIMPL cuda -#elif defined __HIPCC__ -#define MGONGPUCPP_GPUIMPL hip -#else -#undef MGONGPUCPP_GPUIMPL -#endif - // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For CUDA, by default, it is supported -// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND -#elif defined __HIPCC__ -#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -65,28 +52,23 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#ifndef __CUDACC__ +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) +#endif + +// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) -#elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#else -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default in CUDA +#undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 -#else -#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -102,21 +84,17 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (CUDA complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA -#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA -#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA +// SANITY CHECKS (c++ complex number implementation) +#ifndef __CUDACC__ +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL #endif #endif -// SANITY CHECKS (C++ complex number implementation) -#ifndef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ +// SANITY CHECKS (cuda complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif 
#endif @@ -153,7 +131,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -164,7 +142,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD +#ifdef __CUDACC__ // CUDA implementation has no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -194,9 +172,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -208,8 +186,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA/HIP declaration specifiers for C++ -#ifndef MGONGPUCPP_GPUIMPL +// Define empty CUDA declaration specifiers for C++ +#ifndef __CUDACC__ #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h index 4e7ab03fa2..0cb2f1db7e 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
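As a reading aid for the mgOnGpuCxtypes.h hunks below, this is roughly how the cxtype alias resolves once the macros above are restored (a condensed sketch: the thrust and cxsmpl lines follow the typedef chain shown below, while the cuComplex double-precision case is an assumption):

```cpp
#ifdef __CUDACC__ // CUDA
#if defined MGONGPU_CUCXTYPE_THRUST
typedef thrust::complex<fptype> cxtype; // default
#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX
typedef cuDoubleComplex cxtype; // assumption: the double-precision case
#else
typedef mgOnGpu::cxsmpl<fptype> cxtype;
#endif
#else // C++
#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX
typedef std::complex<fptype> cxtype;
#else
typedef mgOnGpu::cxsmpl<fptype> cxtype; // new default
#endif
#endif
```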
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h index 6f6cee64d6..a1cde16a67 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h index 7904b93c61..9d3e82b1e3 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt.sa/src/rambo.h b/epochX/cudacpp/gg_tt.sa/src/rambo.h index cd7e1008ea..e02ea52496 100644 --- a/epochX/cudacpp/gg_tt.sa/src/rambo.h +++ b/epochX/cudacpp/gg_tt.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
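The mgOnGpuFptypes.h hunks above follow a single pattern: one function name, a __host__ __device__ implementation for CUDA and a <cmath> implementation for C++. A condensed sketch of fpsqrt (the CUDA body is elided in the diff, so the sqrtf/sqrt dispatch on MGONGPU_FPTYPE_FLOAT is an assumption based on the surrounding macros):

```cpp
#ifdef __CUDACC__
inline __host__ __device__ fptype
fpsqrt( const fptype& f )
{
#if defined MGONGPU_FPTYPE_FLOAT
  return sqrtf( f ); // single-precision CUDA math (assumption)
#else
  return sqrt( f );  // double-precision CUDA math
#endif
}
#else
inline fptype
fpsqrt( const fptype& f )
{
  return std::sqrt( f ); // C++ <cmath>
}
#endif
```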
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT index 84a883fbb0..a134b5fef9 100644 --- a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT @@ -15,7 +15,6 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_file.inc index 50c12b0804..4f2ef3d0d8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV1_1.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o +ALOHARoutine = VVVV4P0_1.o VVVV3P0_1.o VVVV1P0_1.o FFV1_1.o FFV1_2.o VVV1P0_1.o VVV1_0.o FFV1_0.o FFV1P0_3.o diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h index c04628dfd1..4cafe0c997 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
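Before the Bridge.h hunks below: the Bridge constructor sizes the GPU grid from the number of events, starting at 256 threads per block and shrinking until the grid covers nevt exactly. A sketch of that logic (the loop body is elided in the diff, so the halving step and the error message are assumptions for illustration):

```cpp
int gputhreads = 256;              // default number of gpu threads
int gpublocks = nevt / gputhreads; // integer division, fixed up by the loop below
while( nevt != gpublocks * gputhreads )
{
  gputhreads /= 2; // assumption: shrink by halving
  if( gputhreads < s_gputhreadsmin )
    throw std::runtime_error( "nevt is not a multiple of gputhreadsmin" ); // illustrative message
  gpublocks = nevt / gputhreads;
}
```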
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
    mg5amcCpu::CPPProcess process( /*verbose=*/false );
    m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPUCPP_GPUIMPL
+#endif // __CUDACC__
    process.initProc( "../../Cards/param_card.dat" );
  }

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  template<typename FORTRANFPTYPE>
  void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads )
  {
@@ -268,7 +268,7 @@ namespace mg5amcCpu
  }
#endif

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  template<typename FORTRANFPTYPE>
  void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta,
                                            const FORTRANFPTYPE* gs,
@@ -283,14 +283,14 @@ namespace mg5amcCpu
    constexpr int neppM = MemoryAccessMomenta::neppM;
    if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
    {
-      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
+      checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
    }
    else
    {
-      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
+      checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
      const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
      //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
-      gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+      dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
    }
    if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
    {
@@ -333,7 +333,7 @@ namespace mg5amcCpu
  }
#endif

-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
  template<typename FORTRANFPTYPE>
  void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta,
                                            const FORTRANFPTYPE* gs,
@@ -388,7 +388,7 @@ namespace mg5amcCpu
  // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
  //

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  template<typename Tin, typename Tout>
  __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
  {
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc
index 90c7f2d3b8..cef4cb3c71 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc
@@ -1,11 +1,10 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
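The launch changes in the Bridge::gpu_sequence hunk above (and in the MatrixElementKernels.cc hunks below) all follow the same mechanical translation from the removed gpuLaunchKernel helpers to CUDA's native triple-chevron syntax, shown here on a self-contained toy kernel (toyKernel and launchToy are hypothetical names for illustration):

```cpp
__global__ void toyKernel( int* out ) { out[blockIdx.x * blockDim.x + threadIdx.x] = 1; }

void launchToy( int* devOut, int blocks, int threads, size_t bytes )
{
  // old: gpuLaunchKernel( toyKernel, blocks, threads, devOut );
  toyKernel<<<blocks, threads>>>( devOut ); // no dynamic shared memory
  // old: gpuLaunchKernelSharedMem( toyKernel, blocks, threads, bytes, devOut );
  toyKernel<<<blocks, threads, bytes>>>( devOut ); // with dynamic shared memory, cf. sigmaKin
}
```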
#include "BridgeKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -15,7 +14,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -46,7 +45,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -97,7 +96,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h index 3efef8ce97..15eb4bff4d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc index 010bc4cbd0..985b39f576 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,16 +1,15 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" -#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc index c15b39844d..0b355a3c8d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #include "CrossSectionKernels.h"

-#include "GpuAbstraction.h"
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessWeights.h"
 #include "MemoryBuffers.h"
@@ -78,7 +77,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL )

 //============================================================================

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -186,7 +185,7 @@ namespace mg5amcCpu

 //============================================================================

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
namespace mg5amcGpu
{
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h
index 4d9659e04e..7933ca4bbf 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #ifndef CROSSSECTIONKERNELS_H
 #define CROSSSECTIONKERNELS_H 1
@@ -13,7 +13,7 @@

 //============================================================================

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -96,7 +96,7 @@ namespace mg5amcCpu

 //--------------------------------------------------------------------------
/*
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  // A class encapsulating the calculation of event statistics on a GPU device
  class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents
  {
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h
new file mode 100644
index 0000000000..64ce52f4b3
--- /dev/null
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef MG5AMC_CUDARUNTIME_H
+#define MG5AMC_CUDARUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include <cassert>
+#include <iostream>
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef __CUDACC__ /* clang-format off */
+#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
+inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != cudaSuccess )
+  {
+    printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == cudaSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+namespace mg5amcGpu
+{
+  // Instantiate a CudaRuntime at the beginning of the application's main to
+  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct CudaRuntime final
+  {
+    CudaRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~CudaRuntime() { tearDown( m_debug ); }
+    CudaRuntime( const CudaRuntime& ) = delete;
+    CudaRuntime( CudaRuntime&& ) = delete;
+    CudaRuntime& operator=( const CudaRuntime& ) = delete;
+    CudaRuntime& operator=( CudaRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl;
+      checkCuda( cudaSetDevice( 0 ) ); // SLOW!
+    }
+
+    // Tear down CUDA application (call cudaDeviceReset)
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck
+    // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
+    static void tearDown( const bool debug = true )
+    {
+      if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl;
+      checkCuda( cudaDeviceReset() );
+    }
+  };
+
+}
+#endif
+
+//--------------------------------------------------------------------------
+
+#endif // MG5AMC_CUDARUNTIME_H
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc
index 38c477c17a..eb56333b03 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc
@@ -1,9 +1,9 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h index b425a5bade..48b51e0a49 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h index d2ff326e20..fd7734ce42 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..30257195b6 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #include "MatrixElementKernels.h"

 #include "CPPProcess.h"
-#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation
+#include "CudaRuntime.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryBuffers.h"

@@ -14,7 +14,7 @@

 //============================================================================

-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
namespace mg5amcCpu
{
@@ -143,7 +143,7 @@ namespace mg5amcCpu

 //============================================================================

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
namespace mg5amcGpu
{
@@ -202,13 +202,13 @@ namespace mg5amcGpu
    PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
    DeviceBufferHelicityMask devIsGoodHel( ncomb );
    // ... 0d1. Compute good helicity mask on the device
-    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
+    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
#else
-    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
#endif
-    checkGpu( gpuPeekAtLastError() );
+    checkCuda( cudaPeekAtLastError() );
    // ... 0d2. Copy back good helicity mask to the host
    copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
    // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkCuda( cudaPeekAtLastError() ); + checkCuda( cudaDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h index 72bd8f195b..23e84757a2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h index db73e4e064..c82a6c7635 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h index 38fade09fb..0ac4faa3c7 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h index 40cb089135..e2988d39f3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
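// ------------------------------------------------------------------------
// Note on the AOSOA layout: the neppM comment in the MemoryAccessMomenta.h
// hunk above is the key to coalesced GPU reads. Momenta are stored as
// AOSOA[ipagM][ipar][ip4][ieppM], so the event index runs fastest and
// consecutive threads of a warp touch consecutive fptype slots of one
// cacheline. A minimal indexing sketch (hypothetical ieventAccessSketch
// helper, not the plugin's actual MemoryAccessMomenta code):
#include <cstddef>
template<int np4, int npar, int neppM>
inline double& ieventAccessSketch( double* buffer, const int ievt, const int ip4, const int ipar )
{
  const int ipagM = ievt / neppM; // page (event block) index
  const int ieppM = ievt % neppM; // event index within the page
  // Flattened AOSOA[ipagM][ipar][ip4][ieppM]: ieppM is the innermost index,
  // so threads ievt, ievt+1, ... read contiguous memory
  return buffer[( ( static_cast<std::size_t>( ipagM ) * npar + ipar ) * np4 + ip4 ) * neppM + ieppM];
}
// ------------------------------------------------------------------------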
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h index 08faccff0f..e9b197368e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h index 7756a71621..3093e6ed18 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
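// ------------------------------------------------------------------------
// Note on the buffer classes restored in the MemoryBuffers.h hunks below:
// each cudaMallocHost/cudaMalloc in a constructor is paired with the
// matching free in the destructor, so buffer lifetime is RAII-managed, and
// host buffers are pinned (page-locked) so that cudaMemcpy can DMA directly
// without an intermediate pageable copy (see the PR #45 comments further
// down). A self-contained sketch of the same idea, without the plugin's
// BufferBase/NumberOfEvents hierarchy:
#include <cuda_runtime.h>
#include <cstddef>
#include <cstdlib>
template<typename T>
class PinnedHostBufferSketch
{
public:
  explicit PinnedHostBufferSketch( const std::size_t size )
    : m_size( size ), m_data( nullptr )
  {
    // Allocate page-locked host memory (aborting on failure, for brevity)
    if( cudaMallocHost( reinterpret_cast<void**>( &m_data ), bytes() ) != cudaSuccess ) std::abort();
  }
  ~PinnedHostBufferSketch() { cudaFreeHost( m_data ); } // RAII release
  T* data() { return m_data; }
  std::size_t bytes() const { return m_size * sizeof( T ); }
private:
  std::size_t m_size;
  T* m_data;
};
// ------------------------------------------------------------------------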
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "Parameters_sm.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - gpuMallocHost( &( this->m_data ), this->bytes() ); + checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); } virtual ~PinnedHostBufferBase() { - gpuFreeHost( this->m_data ); + checkCuda( cudaFreeHost( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - gpuMalloc( &( this->m_data ), this->bytes() ); + checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); } virtual ~DeviceBufferBase() { - gpuFree( this->m_data ); + checkCuda( cudaFree( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); } #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 62fa7f0088..a4cc98e6b1 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -45,7 +46,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -79,7 +80,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -89,7 +90,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -117,13 +118,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -150,7 +151,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -186,7 +187,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -199,10 +200,8 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop -#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -303,7 +302,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -360,7 +359,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -419,7 +418,7 @@ namespace mg5amcCpu MEs_sv_previous += 
deltaMEs_previous; #endif /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -466,8 +465,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -507,9 +506,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); - //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -545,7 +544,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] + // [Use __NVCC__ instead of __CUDACC__ here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -610,12 +609,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -636,7 +635,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -765,9 +764,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); - gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); + checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -791,7 +790,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -812,7 +811,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // 
assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -826,12 +825,9 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - -#include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -859,7 +855,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ +#ifdef __CUDACC__ // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1063,7 +1059,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 5a6e96d9e8..51f966d10f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h new file mode 120000 index 0000000000..ce9e1a487a --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h @@ -0,0 +1 @@ +../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc index 1bad694d1c..f1e75b9252 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,7 +12,6 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" -#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -64,7 +63,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -78,7 +77,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
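// ------------------------------------------------------------------------
// Note on error checking: throughout this patch every cuda* runtime call is
// wrapped in checkCuda(...), defined in the CudaRuntime.h restored above. A
// generic sketch of that fail-fast pattern (illustrative names, not the
// exact plugin implementation):
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
inline void assertCudaSketch( const cudaError_t code, const char* file, const int line )
{
  if( code != cudaSuccess )
  {
    // Decode the error enum and stop at once: an unchecked launch or memcpy
    // failure would otherwise silently corrupt downstream results
    std::fprintf( stderr, "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), (int)code, file, line );
    std::abort();
  }
}
#define CHECK_CUDA_SKETCH( code ) assertCudaSketch( ( code ), __FILE__, __LINE__ )
// Typical usage after a kernel launch, mirroring the calls in this patch:
//   CHECK_CUDA_SKETCH( cudaPeekAtLastError() );   // catch launch-time errors
//   CHECK_CUDA_SKETCH( cudaDeviceSynchronize() ); // surface async execution errors
// ------------------------------------------------------------------------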
-#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -104,11 +103,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -116,7 +115,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -149,7 +148,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs" ); #endif } else if( arg == "--curhst" ) @@ -166,7 +165,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -240,13 +239,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -264,14 +263,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ - // --- 00. Initialise GPU - // Instantiate a GpuRuntime at the beginning of the application's main. - // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. - const std::string cdinKey = "00 GpuInit"; + // --- 00. Initialise cuda + // Instantiate a CudaRuntime at the beginning of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + const std::string cdinKey = "00 CudaInit"; timermap.start( cdinKey ); - GpuRuntime GpuRuntime( debug ); + CudaRuntime cudaRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -293,7 +292,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -301,7 +300,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -309,7 +308,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -317,7 +316,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -334,7 +333,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -343,7 +342,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -352,7 +351,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -360,7 +359,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -368,7 +367,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -404,7 +403,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -422,7 +421,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -433,7 +432,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -441,7 +440,7 
@@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -483,7 +482,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -515,7 +514,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -560,7 +559,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -589,7 +588,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( !bridge ) { // --- 3b. CopyDToH MEs @@ -732,19 +731,15 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; -#elif defined __HIPCC__ - rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or HIP or C++? + // -- CUDA or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; -#elif defined __HIPCC__ - wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -769,12 +764,6 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif -#elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL - wrkflwtxt += "CXS:"; -#else - wrkflwtxt += "???:"; // no path to this statement -#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -800,7 +789,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -856,7 +845,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -877,8 +866,6 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" -#elif defined __HIPCC__ - << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -905,21 +892,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "Complex type = STD::COMPLEX" << std::endl +#endif #else - << "Complex type = ???" 
<< std::endl // no path to this statement... + << "Complex type = STD::COMPLEX" << std::endl #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -950,7 +937,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1046,14 +1033,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "\"STD::COMPLEX\"," << std::endl +#endif #else - << "\"???\"," << std::endl // no path to this statement... + << "\"STD::COMPLEX\"," << std::endl #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1061,7 +1048,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index b26c54fe3c..2afd9a2b1b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -45,7 +46,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @2 -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -79,7 +80,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -89,7 +90,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -117,13 +118,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -150,7 +151,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -186,7 +187,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -199,10 +200,8 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop -#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -506,7 +505,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -563,7 +562,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -622,7 +621,7 @@ namespace mg5amcCpu 
MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -685,8 +684,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -727,9 +726,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); - //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -766,7 +765,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] + // [Use __NVCC__ instead of __CUDACC__ here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -831,12 +830,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -857,7 +856,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -986,9 +985,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); - gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); + checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1012,7 +1011,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1033,7 +1032,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int 
helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1047,12 +1046,9 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - -#include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1080,7 +1076,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ +#ifdef __CUDACC__ // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1284,7 +1280,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h index d8179c5c94..5cba84f97c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h new file mode 120000 index 0000000000..ce9e1a487a --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h @@ -0,0 +1 @@ +../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc index 1bad694d1c..f1e75b9252 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,7 +12,6 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" -#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -64,7 +63,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -78,7 +77,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
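// ------------------------------------------------------------------------
// Note on kernel launches: the MatrixElementKernels.cc hunks in this patch
// replace portable calls like gpuLaunchKernel( sigmaKin, blocks, threads, ... )
// with CUDA's native sigmaKin<<<blocks, threads>>>( ... ). For a CUDA-only
// build, the removed wrapper can be as thin as a variadic macro; a sketch of
// that assumed form (not the exact GpuAbstraction.h definition):
#include <cuda_runtime.h>
#define gpuLaunchKernelSketch( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
__global__ void fillOnesSketch( int* out )
{
  out[blockDim.x * blockIdx.x + threadIdx.x] = 1;
}
void launchFillOnesSketch( int* devOut, const int gpublocks, const int gputhreads )
{
  // Expands to: fillOnesSketch<<<gpublocks, gputhreads>>>( devOut );
  gpuLaunchKernelSketch( fillOnesSketch, gpublocks, gputhreads, devOut );
}
// ------------------------------------------------------------------------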
-#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; #endif @@ -104,11 +103,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode { @@ -116,7 +115,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -149,7 +148,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs" ); #endif } else if( arg == "--curhst" ) { @@ -166,7 +165,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -240,13 +239,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -264,14 +263,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ - // --- 00. Initialise GPU - // Instantiate a GpuRuntime at the beginning of the application's main. - // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. - const std::string cdinKey = "00 GpuInit"; + // --- 00. Initialise cuda + // Instantiate a CudaRuntime at the beginning of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + const std::string cdinKey = "00 CudaInit"; timermap.start( cdinKey ); - GpuRuntime GpuRuntime( debug ); + CudaRuntime cudaRuntime( debug ); #endif
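Note on the "00 CudaInit" step above: device initialisation is tied to object lifetime (RAII), so a single stack object in main() selects the device on entry and books the reset on exit. A minimal sketch of that idiom follows; the names CudaRuntimeSketch and checkCudaSketch are illustrative stand-ins, not the real CudaRuntime.h definitions.

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    // Abort with a message on any CUDA API failure (illustrative helper)
    #define checkCudaSketch( code ) \
      do { cudaError_t err = ( code ); if( err != cudaSuccess ) { \
        std::fprintf( stderr, "CUDA error: %s\n", cudaGetErrorString( err ) ); std::exit( 1 ); } } while( 0 )

    struct CudaRuntimeSketch
    {
      CudaRuntimeSketch() { checkCudaSketch( cudaSetDevice( 0 ) ); } // once, at the start of main
      ~CudaRuntimeSketch() { cudaDeviceReset(); } // booked for the end of main (leak checkers need it)
    };

    int main()
    {
      CudaRuntimeSketch cudaRuntime; // device initialised here, reset when main returns
      // ... allocate buffers, launch kernels ...
      return 0;
    }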
// --- 0a. Initialise physics process @@ -293,7 +292,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -301,7 +300,7 @@ #endif // Memory buffers for sampling weights -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -309,7 +308,7 @@ #endif // Memory buffers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -317,7 +316,7 @@ #endif // Memory buffers for Gs -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -334,7 +333,7 @@ } // Memory buffers for matrix elements -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -343,7 +342,7 @@ // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -352,7 +351,7 @@ // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -360,7 +359,7 @@ #endif // Memory buffers for helicity selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -368,7 +367,7 @@ #endif // Memory buffers for color selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -404,7 +403,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -422,7 +421,7 @@ } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -433,7 +432,7 @@ std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
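On the buffer pattern above: CPU-only builds use plain pageable host memory (the HostBuffer* classes), while CUDA builds pair a pinned, page-locked host buffer (PinnedHostBuffer*) with a device buffer, because the driver can DMA directly out of pinned pages and host-device copies run faster. A self-contained sketch of the difference, with a plain fptype array standing in for the real MemoryBuffers.h classes:

    #include <cuda_runtime.h>

    typedef double fptype; // as in FPTYPE=d builds

    int main()
    {
      const int nevt = 16384;
      fptype* hstPageable = new fptype[nevt];                        // HostBuffer* case (CPU-only builds)
      fptype* hstPinned = nullptr;
      cudaMallocHost( (void**)&hstPinned, nevt * sizeof( fptype ) ); // PinnedHostBuffer* case
      fptype* devBuf = nullptr;
      cudaMalloc( (void**)&devBuf, nevt * sizeof( fptype ) );        // DeviceBuffer* case
      // H2D copies out of pinned memory can be DMAed directly (no staging copy)
      cudaMemcpy( devBuf, hstPinned, nevt * sizeof( fptype ), cudaMemcpyHostToDevice );
      cudaFree( devBuf );
      cudaFreeHost( hstPinned ); // pinned memory has its own deallocator
      delete[] hstPageable;
      return 0;
    }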
@@ -441,7 +440,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -483,7 +482,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -515,7 +514,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -560,7 +559,7 @@ main( int argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -589,7 +588,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( !bridge ) { // --- 3b. CopyDToH MEs @@ -732,19 +731,15 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; -#elif defined __HIPCC__ - rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or HIP or C++? + // -- CUDA or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; -#elif defined __HIPCC__ - wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -769,12 +764,6 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif -#elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL - wrkflwtxt += "CXS:"; -#else - wrkflwtxt += "???:"; // no path to this statement -#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -800,7 +789,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -856,7 +845,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -877,8 +866,6 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" -#elif defined __HIPCC__ - << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -905,21 +892,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "Complex type = STD::COMPLEX" << std::endl +#endif #else - << "Complex type = ???"
<< std::endl // no path to this statement... + << "Complex type = STD::COMPLEX" << std::endl #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -950,7 +937,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1046,14 +1033,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "\"STD::COMPLEX\"," << std::endl +#endif #else - << "\"???\"," << std::endl // no path to this statement... + << "\"STD::COMPLEX\"," << std::endl #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1061,7 +1048,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc index 79abbcc4f8..da68aa9255 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
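On the "AOSOA[neppM]" layout reported in the printouts above: events are grouped into pages of neppM events each, and within a page a given (particle, four-momentum component) pair stores its neppM values contiguously, so GPU threads get coalesced loads and CPU SIMD lanes get contiguous vectors from the same buffer. A simplified sketch of the indexing (names modelled on the MemoryAccessMomenta conventions, not copied from them):

    // Layout: buffer[npagM][npar][np4][neppM], flattened into one array
    typedef double fptype;

    inline fptype& aosoaMomentum( fptype* buffer, int ievt, int ipar, int ip4,
                                  int npar, int np4, int neppM )
    {
      const int ipagM = ievt / neppM; // which page of events
      const int ieppM = ievt % neppM; // position of this event inside the page
      return buffer[( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM];
    }
    // With neppM == 1 the layout degenerates to a plain AOS, as the printout notes.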
#include "RamboSamplingKernels.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaInitial() { - gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); + getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaFinal() { - gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h index 7c214cd74b..184089efd7 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h index 21d63beeac..188a72c2c9 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index bcb73d7f01..a0397e9ecc 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,139 +89,69 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin <host-compiler>" below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <host-compiler>" from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If CUDA_HOME is not set, try to set it from the location of NVCC - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif - - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
- CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - - CUDATESTFLAGS = -lcuda - - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif - - # Set the host C++ compiler for GPUCC via "-ccbin <host-compiler>" - # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) - GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif - -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin <host-compiler>" below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <host-compiler>" from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif - - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivalent to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivalent to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput:
6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +#=== Configure the CUDA compiler + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) +# This is because it is impossible to pass this to "CUFLAGS += -ccbin <host-compiler>" below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <host-compiler>" from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the location of nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + NVCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
+ CUOPTFLAGS = -lineinfo + CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) +else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) +else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override NVCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= +endif - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif +# Set the host C++ compiler for nvcc via "-ccbin <host-compiler>" +# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) +CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) +# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) +ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) +CUFLAGS += -allow-unsupported-compiler endif - #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -233,9 +163,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) - override GPUCC:=ccache $(GPUCC) +ifneq ($(NVCC),) + ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) + override NVCC:=ccache $(NVCC) endif endif @@ -259,7 +189,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - GPUFLAGS+= -Xcompiler -mno-float128 + CUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -269,10 +199,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -323,10 +253,7 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + ifeq ($(NVCC),) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -401,13 +328,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -416,7 +343,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + CUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -425,7 +352,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + CUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -477,11 +404,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -494,7 +421,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) +ifneq ($(NVCC),) 
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -525,16 +452,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -543,14 +469,11 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) -# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifeq ($(findstring nvcc,$(GPUCC)),nvcc) - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math -else - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math +ifneq ($(NVCC),) +$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math endif endif @@ -566,10 +489,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(GPUCC),) -GPUFLAGS += -Wno-deprecated-builtins +ifneq ($(NVCC),) +CUFLAGS += -Xcompiler -Wno-deprecated-builtins endif endif @@ -577,8 +500,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(NVCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -605,7 +528,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -617,11 +540,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -638,16 +561,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
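On the -fno-fast-math special case for CrossSectionKernels above (#117): fast math lets the compiler assume NaNs and infinities never occur, so the NaN/abnormal-event checks could be optimised into no-ops. A minimal illustration of the failure mode (hypothetical values; whether the check is actually folded away depends on the compiler and version):

    #include <cmath>
    #include <cstdio>

    int main()
    {
      volatile double zero = 0.0; // volatile so the division really happens at runtime
      double x = zero / zero;     // NaN
      // Under -ffast-math the compiler may assume x is never NaN and fold this
      // check to false; hence the cross-section kernels opt out of fast math.
      std::printf( "isnan(x) = %d\n", std::isnan( x ) ? 1 : 0 );
      return 0;
    }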
#------------------------------------------------------------------------------- @@ -673,17 +596,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -695,7 +618,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -708,7 +631,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -720,12 +643,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -749,14 +672,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(GPUCC),) # link only runTest.o +ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -859,9 +782,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo GPUCC=$(GPUCC) -ifneq ($(GPUCC),) - $(GPUCC) --version + @echo NVCC=$(NVCC) +ifneq ($(NVCC),) + $(NVCC) --version endif @echo "" @echo CXX=$(CXX) @@ -880,7 +803,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(GPUCC),) +ifneq ($(NVCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc index 2b956730d4..f93c05b0b3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; #endif @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::setUp(); +#ifdef __CUDACC__ + CudaRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
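fbridgecreate_ above follows the standard Fortran-to-C++ calling convention: extern "C" turns off C++ name mangling, the trailing underscore matches the symbol gfortran emits for a Fortran CALL, and every argument is a pointer because Fortran passes by reference. A stripped-down sketch of the pattern (fbridgedemo_ is a hypothetical entry point; the real Bridge construction is elided):

    #include <stdexcept>

    // Opaque polymorphic base handed back to Fortran (simplified from Bridge.h)
    struct CppObjectInFortran { virtual ~CppObjectInFortran() {} };

    extern "C"
    {
      // Fortran side: CALL FBRIDGEDEMO( BRIDGE, NEVT ) -> symbol fbridgedemo_
      void fbridgedemo_( CppObjectInFortran** ppbridge, const int* pnevt )
      {
        if( *pnevt <= 0 ) throw std::runtime_error( "fbridgedemo_: invalid nevt" );
        *ppbridge = new CppObjectInFortran(); // the real code news a Bridge object here
      }
    }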
@@ -69,8 +69,8 @@ extern "C" Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::tearDown(); +#ifdef __CUDACC__ + CudaRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc index 3743934f41..2fb445372d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc index 461ec5c3a5..572e28aaea 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
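The fbridgedelete_/fbridgesequence_ hunks above validate the opaque handle coming back from Fortran: the stored base pointer is dynamic_cast to the concrete Bridge type, and a null result means a bad or stale handle. The same defensive idiom in isolation (BridgeSketch stands in for the real templated Bridge):

    #include <stdexcept>

    struct CppObjectInFortran { virtual ~CppObjectInFortran() {} };
    struct BridgeSketch : CppObjectInFortran { void gpu_sequence() { /* ... */ } };

    void useHandle( CppObjectInFortran** ppbridge )
    {
      // RTTI check: returns nullptr unless *ppbridge really points to a BridgeSketch
      BridgeSketch* pbridge = dynamic_cast<BridgeSketch*>( *ppbridge );
      if( pbridge == 0 ) throw std::runtime_error( "useHandle: invalid Bridge address" );
      pbridge->gpu_sequence();
    }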
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc index 2bd7a9fcf9..989aba1fdc 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc index 6e8657edca..4243e9fcec 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h index 3593d9f169..5a3a5dc76f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc index de87dcaf64..d3d01102fd 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h index fe7d686938..6551d8da81 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h @@ -214,7 +214,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -242,7 +242,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -258,7 +258,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h index 390766116b..881353abac 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,26 +10,13 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 -// Is this a GPU (CUDA, HIP) or CPU implementation? -#ifdef __CUDACC__ -#define MGONGPUCPP_GPUIMPL cuda -#elif defined __HIPCC__ -#define MGONGPUCPP_GPUIMPL hip -#else -#undef MGONGPUCPP_GPUIMPL -#endif - // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For CUDA, by default, it is supported -// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND -#elif defined __HIPCC__ -#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -65,28 +52,23 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#ifndef __CUDACC__ +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) +#endif + +// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) -#elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#else -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default in CUDA +#undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 -#else -#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -102,21 +84,17 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (CUDA complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA -#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA -#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA +// SANITY CHECKS (c++ complex number implementation) +#ifndef __CUDACC__ +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL #endif #endif -// SANITY CHECKS (C++ complex number implementation) -#ifndef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ +// SANITY CHECKS (cuda complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif 
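A note on the complex-type sanity checks just above: the combined test "defined A and defined B and defined C" on the new (+) side only fires when all three CUDA backend macros are enabled at once, while the pairwise guards on the old (-) side also reject any two backends enabled together. A standalone sketch of the stricter pairwise form (macro names as in this patch; the snippet is illustrative and not part of either side of the diff):

#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX
#error Only one of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX may be defined
#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL
#error Only one of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL may be defined
#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
#error Only one of MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL may be defined
#endif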
#endif @@ -153,7 +131,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -164,7 +142,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD +#ifdef __CUDACC__ // CUDA implementation has no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -194,9 +172,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -208,8 +186,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA/HIP declaration specifiers for C++ -#ifndef MGONGPUCPP_GPUIMPL +// Define empty CUDA declaration specifiers for C++ +#ifndef __CUDACC__ #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h index 4e7ab03fa2..0cb2f1db7e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
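The "define empty CUDA declaration specifiers for C++" hunk above is what lets one source tree build both with nvcc and with a plain C++ compiler: under g++ the attributes expand to nothing, under nvcc they keep their CUDA meaning. A minimal illustration (the helper function fpsum is hypothetical, not from the diff):

#ifndef __CUDACC__
#define __global__
#define __host__
#define __device__
#endif

// Same source compiles as a host+device function under nvcc and as a plain
// inline function under g++ (the specifiers vanish in the C++ build).
__host__ __device__ inline double fpsum( const double a, const double b ) { return a + b; }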
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h index 6f6cee64d6..a1cde16a67 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
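A condensed standalone sketch of the cxtype selection reverted above, assuming double precision and omitting the cuComplex and cxsmpl branches (including the cxsmpl C++ default) that the real mgOnGpuCxtypes.h also supports:

#include <complex>
#ifdef __CUDACC__
#include <thrust/complex.h>
typedef thrust::complex<double> cxtype; // GPU build: thrust complex is usable in device code
#else
typedef std::complex<double> cxtype; // CPU build: standard library complex
#endif

// One spelling for constructing a complex number, whichever backend is active
inline cxtype cxmake( const double r, const double i ) { return cxtype( r, i ); }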
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h index 7904b93c61..9d3e82b1e3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt01g.mad/src/rambo.h b/epochX/cudacpp/gg_tt01g.mad/src/rambo.h index cd7e1008ea..e02ea52496 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/rambo.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
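The fpsqrt wrappers above dispatch to the precision-appropriate square root per backend. A condensed sketch, assuming the MGONGPU_FPTYPE_FLOAT/MGONGPU_FPTYPE_DOUBLE macro convention used elsewhere in this config (host/device qualifiers omitted for brevity):

#include <cmath>

#ifdef MGONGPU_FPTYPE_FLOAT
typedef float fptype;
#else
typedef double fptype; // default: double precision
#endif

inline fptype fpsqrt( const fptype& f )
{
#ifdef __CUDACC__
#ifdef MGONGPU_FPTYPE_FLOAT
  return sqrtf( f ); // single-precision CUDA math function
#else
  return sqrt( f ); // double-precision CUDA math function
#endif
#else
  return std::sqrt( f ); // host C++ fallback
#endif
}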
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT index 84a883fbb0..a134b5fef9 100644 --- a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT @@ -15,7 +15,6 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc index 50c12b0804..4f2ef3d0d8 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV1_1.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o +ALOHARoutine = VVVV4P0_1.o VVVV3P0_1.o VVVV1P0_1.o FFV1_1.o FFV1_2.o VVV1P0_1.o VVV1_0.o FFV1_0.o FFV1P0_3.o diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h index c04628dfd1..4cafe0c997 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
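For reference, the massless RAMBO logarithmic weight kept unchanged by the rambo.h hunk above, written out as a standalone helper (po2log and z[nparf-1] are names taken from the surrounding code; ramboWeight itself is hypothetical). The log-weight is a constant for two final-state partons and grows as (2*nparf - 4)*ln(E) otherwise:

#include <cmath>

inline double ramboWeight( const int nparf, const double energy, const double* z, const double po2log )
{
  // po2log is the fixed two-parton value; z accumulates the phase-space volume terms
  double wt = po2log;
  if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * std::log( energy ) + z[nparf - 1];
  return wt;
}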
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ process.initProc( "../../Cards/param_card.dat" ); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); } else { - gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc index 90c7f2d3b8..cef4cb3c71 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "BridgeKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -15,7 +14,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -46,7 +45,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -97,7 +96,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h index 3efef8ce97..15eb4bff4d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc index 010bc4cbd0..985b39f576 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,16 +1,15 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" -#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc index c15b39844d..0b355a3c8d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -78,7 +77,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -186,7 +185,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h index 4d9659e04e..7933ca4bbf 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h new file mode 100644 index 0000000000..64ce52f4b3 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_CUDARUNTIME_H +#define MG5AMC_CUDARUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef __CUDACC__ /* clang-format off */ +#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } +inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +{ + if( code != cudaSuccess ) + { + printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); + if( abort ) assert( code == cudaSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ +namespace mg5amcGpu +{ + // Instantiate a CudaRuntime at the beginnining of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct CudaRuntime final + { + CudaRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~CudaRuntime() { tearDown( m_debug ); } + CudaRuntime( const CudaRuntime& ) = delete; + CudaRuntime( CudaRuntime&& ) = delete; + CudaRuntime& operator=( const CudaRuntime& ) = delete; + CudaRuntime& operator=( CudaRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; + checkCuda( cudaSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; + checkCuda( cudaDeviceReset() ); + } + }; + +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc index 38c477c17a..eb56333b03 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h index b425a5bade..48b51e0a49 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h index d2ff326e20..fd7734ce42 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..30257195b6 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 #include "MatrixElementKernels.h"
 #include "CPPProcess.h"
-#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation
+#include "CudaRuntime.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryBuffers.h"
@@ -14,7 +14,7 @@
 //============================================================================
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 namespace mg5amcCpu
 {
@@ -143,7 +143,7 @@ namespace mg5amcCpu
 //============================================================================
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 {
@@ -202,13 +202,13 @@
     PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
     DeviceBufferHelicityMask devIsGoodHel( ncomb );
     // ... 0d1. Compute good helicity mask on the device
-    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
+    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkGpu( gpuPeekAtLastError() );
+    checkCuda( cudaPeekAtLastError() );
     // ... 0d2. Copy back good helicity mask to the host
     copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
     // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -219,19 +219,19 @@ void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
 {
-    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
+    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
     constexpr unsigned int sharedMemSize = 0;
 #else
     constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkGpu( gpuPeekAtLastError() );
-    checkGpu( gpuDeviceSynchronize() );
+    checkCuda( cudaPeekAtLastError() );
+    checkCuda( cudaDeviceSynchronize() );
 }
 //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h
index 72bd8f195b..23e84757a2 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 #ifndef MATRIXELEMENTKERNELS_H
 #define MATRIXELEMENTKERNELS_H 1
@@ -10,7 +10,7 @@
 #include "MemoryBuffers.h"
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -81,7 +81,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 // A class encapsulating matrix element calculations on a CPU host
 class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents
 {
@@ -130,7 +130,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 // A class encapsulating matrix element calculations on a GPU device
 class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
 {
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h
index db73e4e064..c82a6c7635 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h index 38fade09fb..0ac4faa3c7 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h index 40cb089135..e2988d39f3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
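The ievt computation in KernelAccessHelper above is the standard CUDA one-event-per-thread mapping: the global thread index becomes the event index. A self-contained sketch (the kernel scaleEvents and its payload are hypothetical), launched e.g. as scaleEvents<<<gpublocks, gputhreads>>>( in, out, nevt ):

__global__ void scaleEvents( const double* in, double* out, const int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
  if( ievt < nevt ) out[ievt] = 2. * in[ievt]; // hypothetical per-event payload
}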
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h index 08faccff0f..e9b197368e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h index 7756a71621..3093e6ed18 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
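The MemoryBuffers.h hunks just below replace the gpuMallocHost/gpuFree wrappers with direct CUDA runtime calls inside RAII buffer classes, so allocation is tied to object lifetime and cannot leak on early returns. A minimal sketch of that pattern (PinnedBuffer is a hypothetical stand-in for PinnedHostBufferBase; checkCuda-style error checking omitted for brevity):

#include <cstddef>
#include <cuda_runtime.h>

template<typename T>
class PinnedBuffer
{
public:
  explicit PinnedBuffer( const std::size_t size )
    : m_size( size ), m_data( nullptr )
  {
    // Page-locked host memory speeds up host<->device copies
    cudaMallocHost( reinterpret_cast<void**>( &m_data ), m_size * sizeof( T ) );
  }
  ~PinnedBuffer() { cudaFreeHost( m_data ); }
  PinnedBuffer( const PinnedBuffer& ) = delete; // non-copyable, like the buffer classes in this patch
  PinnedBuffer& operator=( const PinnedBuffer& ) = delete;
  T* data() { return m_data; }
  std::size_t size() const { return m_size; }
private:
  std::size_t m_size;
  T* m_data;
};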
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "Parameters_sm.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - gpuMallocHost( &( this->m_data ), this->bytes() ); + checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); } virtual ~PinnedHostBufferBase() { - gpuFreeHost( this->m_data ); + checkCuda( cudaFreeHost( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - gpuMalloc( &( this->m_data ), this->bytes() ); + checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); } virtual ~DeviceBufferBase() { - gpuFree( this->m_data ); + checkCuda( cudaFree( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); } #endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 389a5d98b3..5856e464ed 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -45,7 +46,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @1 -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -79,7 +80,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -89,7 +90,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -117,13 +118,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -150,7 +151,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -186,7 +187,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -199,10 +200,8 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop -#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -506,7 +505,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -563,7 +562,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -622,7 +621,7 @@ namespace mg5amcCpu 
MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -685,8 +684,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -727,9 +726,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); - //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -766,7 +765,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] + // [Use __NVCC__ instead of __CUDACC__ here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -831,12 +830,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -857,7 +856,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -986,9 +985,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); - gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); + checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1012,7 +1011,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1033,7 +1032,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int
helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1047,12 +1046,9 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - -#include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1080,7 +1076,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ +#ifdef __CUDACC__ // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1284,7 +1280,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index ff2cb4ab9a..0edca1b52a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h new file mode 120000 index 0000000000..ce9e1a487a --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h @@ -0,0 +1 @@ +../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc index 1bad694d1c..f1e75b9252 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,7 +12,6 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" -#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -64,7 +63,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -78,7 +77,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
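// The hunks below (and throughout this patch) restore the single-source
// convention in which one translation unit compiles either under nvcc, which
// defines __CUDACC__, or under a plain C++ compiler. A minimal sketch of the
// pattern, using only the two namespace names that appear in these files:
//
//   #ifdef __CUDACC__
//   namespace mg5amcGpu
//   #else
//   namespace mg5amcCpu
//   #endif
//   {
//     // identical source code, compiled once for the GPU and once for the CPU
//   }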
-#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -104,11 +103,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode { RamboHost = 1, RamboDevice = 2 }; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -149,7 +148,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs" ); #endif } else if( arg == "--curhst" ) @@ -166,7 +165,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -240,13 +239,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -264,14 +263,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ - // --- 00. Initialise GPU - // Instantiate a GpuRuntime at the beginnining of the application's main. - // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. - const std::string cdinKey = "00 GpuInit"; + // --- 00. Initialise CUDA + // Instantiate a CudaRuntime at the beginning of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + const std::string cdinKey = "00 CudaInit"; timermap.start( cdinKey ); - GpuRuntime GpuRuntime( debug ); + CudaRuntime cudaRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -293,7 +292,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -301,7 +300,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -309,7 +308,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -317,7 +316,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -334,7 +333,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -343,7 +342,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -352,7 +351,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -360,7 +359,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -368,7 +367,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -404,7 +403,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -422,7 +421,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -433,7 +432,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -441,7 +440,7 
@@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -483,7 +482,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -515,7 +514,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -560,7 +559,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -589,7 +588,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( !bridge ) { // --- 3b. CopyDToH MEs @@ -732,19 +731,15 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; -#elif defined __HIPCC__ - rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or HIP or C++? + // -- CUDA or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; -#elif defined __HIPCC__ - wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -769,12 +764,6 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif -#elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL - wrkflwtxt += "CXS:"; -#else - wrkflwtxt += "???:"; // no path to this statement -#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -800,7 +789,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -856,7 +845,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -877,8 +866,6 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" -#elif defined __HIPCC__ - << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -905,21 +892,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "Complex type = STD::COMPLEX" << std::endl +#endif #else - << "Complex type = ???" 
<< std::endl // no path to this statement... + << "Complex type = STD::COMPLEX" << std::endl #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -950,7 +937,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1046,14 +1033,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "\"STD::COMPLEX\"," << std::endl +#endif #else - << "\"???\"," << std::endl // no path to this statement... + << "\"STD::COMPLEX\"," << std::endl #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1061,7 +1048,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc index 79abbcc4f8..da68aa9255 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
+// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
#include "RamboSamplingKernels.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaInitial() { - gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); + getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaFinal() { - gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h index 7c214cd74b..184089efd7 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h index 21d63beeac..188a72c2c9 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index bcb73d7f01..a0397e9ecc 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
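# A minimal sketch of the self-naming idiom described by the GNU make link
# above (THISMK is a hypothetical variable name, not taken from this patch):
# the last word of MAKEFILE_LIST is the makefile currently being parsed.
#   THISMK := $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST))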
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,139 +89,69 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If CUDA_HOME is not set, try to set it from the location of NVCC - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif - - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - - CUDATESTFLAGS = -lcuda - - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif - - # Set the host C++ compiler for GPUCC via "-ccbin " - # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) - GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif - -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif - - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 
6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +#=== Configure the CUDA compiler + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) +# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the location of nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + NVCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+ CUOPTFLAGS = -lineinfo + CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) +else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) +else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override NVCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= +endif - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif +# Set the host C++ compiler for nvcc via "-ccbin " +# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) +CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) +# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) +ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) +CUFLAGS += -allow-unsupported-compiler endif - #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -233,9 +163,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) - override GPUCC:=ccache $(GPUCC) +ifneq ($(NVCC),) + ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) + override NVCC:=ccache $(NVCC) endif endif @@ -259,7 +189,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - GPUFLAGS+= -Xcompiler -mno-float128 + CUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -269,10 +199,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -323,10 +253,7 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + ifeq ($(NVCC),) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -401,13 +328,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -416,7 +343,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + CUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -425,7 +352,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + CUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -477,11 +404,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -494,7 +421,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) +ifneq ($(NVCC),) 
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -525,16 +452,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -543,14 +469,11 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) -# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifeq ($(findstring nvcc,$(GPUCC)),nvcc) - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math -else - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math +ifneq ($(NVCC),) +$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math endif endif @@ -566,10 +489,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(GPUCC),) -GPUFLAGS += -Wno-deprecated-builtins +ifneq ($(NVCC),) +CUFLAGS += -Xcompiler -Wno-deprecated-builtins endif endif @@ -577,8 +500,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
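# For reference, the two compilation modes wired up in the build rules above,
# sketched as plain commands (file names are hypothetical):
#   nvcc $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c foo.cu -o foo.o          # native CUDA source
#   nvcc $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu foo.cc -o foo_cu.o # C++ source compiled as CUDA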
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(NVCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -605,7 +528,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -617,11 +540,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -638,16 +561,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
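# For reference, the same '$$ORIGIN' RPATH reaches the linker through two
# different drivers, per the CXXLIBFLAGSRPATH2/CULIBFLAGSRPATH2 overrides
# earlier in this makefile:
#   host C++ link : $(CXX) ... -Wl,-rpath,'$$ORIGIN' ...
#   nvcc link     : $(NVCC) ... -Xlinker -rpath,'$$ORIGIN' ...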
#------------------------------------------------------------------------------- @@ -673,17 +596,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -695,7 +618,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -708,7 +631,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -720,12 +643,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -749,14 +672,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(GPUCC),) # link only runTest.o +ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -859,9 +782,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo GPUCC=$(GPUCC) -ifneq ($(GPUCC),) - $(GPUCC) --version + @echo NVCC=$(NVCC) +ifneq ($(NVCC),) + $(NVCC) --version endif @echo "" @echo CXX=$(CXX) @@ -880,7 +803,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(GPUCC),) +ifneq ($(NVCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc index 2b956730d4..f93c05b0b3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::setUp(); +#ifdef __CUDACC__ + CudaRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
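// A sketch of the Fortran-facing lifecycle implemented by the create/delete
// hooks in this file (the Fortran call sites are illustrative, not taken from
// this patch):
//
//   CALL FBRIDGECREATE( BRIDGE, NEVT, NPAR, NP4 ) ! CudaRuntime::setUp(), new Bridge
//   CALL FBRIDGESEQUENCE( BRIDGE, ... )           ! compute MEs on GPU or CPU
//   CALL FBRIDGEDELETE( BRIDGE )                  ! delete Bridge, CudaRuntime::tearDown()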
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::tearDown(); +#ifdef __CUDACC__ + CudaRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc index 3743934f41..2fb445372d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc index 461ec5c3a5..572e28aaea 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
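// A minimal sketch of the buffer-selection pattern used in fsampler.cc above
// and in check_sa.cc earlier in this patch: pageable host buffers in C++-only
// builds, page-locked (pinned) host buffers when compiling under CUDA, which
// speeds up host-device copies.
//
//   #ifndef __CUDACC__
//     HostBufferMomenta hstMomenta( nevt );       // pageable host memory
//   #else
//     PinnedHostBufferMomenta hstMomenta( nevt ); // page-locked host memory
//   #endif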
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc
index 3743934f41..2fb445372d 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #include "mgOnGpuConfig.h"
 
@@ -13,7 +13,7 @@
 
 //--------------------------------------------------------------------------
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -40,7 +40,7 @@ namespace mg5amcCpu
  private:
    const int m_nevt; // The number of events in each iteration
    int m_iiter;      // The iteration counter (for random number seeding)
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
    HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
    HostBufferMomenta m_hstMomenta;      // Memory buffers for momenta
    HostBufferWeights m_hstWeights;      // Memory buffers for sampling weights
@@ -105,7 +105,7 @@ namespace mg5amcCpu
 
 extern "C"
 {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  using namespace mg5amcGpu;
 #else
  using namespace mg5amcCpu;
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc
index 461ec5c3a5..572e28aaea 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 
 #include "mgOnGpuConfig.h"
 
@@ -15,7 +15,7 @@
 #include "RandomNumberKernels.h"
 #include "epoch_process_id.h"
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 using namespace mg5amcGpu;
 #else
 using namespace mg5amcCpu;
@@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase
    : TestDriverBase( npar, refFileName ) {}
 };
 
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 struct CPUTest : public CUDA_CPU_TestBase
 {
  // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device)
@@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase
 };
 #endif
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 struct CUDATest : public CUDA_CPU_TestBase
 {
  // Reset the device when our test goes out of scope. Note that this should happen after
@@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase
  {
    ~DeviceReset()
    {
-      gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full
+      checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full
    }
  } deviceResetter;
 
@@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix,            \
                          test_suite_name,   \
                          testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) );
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest );
 #else
 MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest );
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc
index 2bd7a9fcf9..989aba1fdc 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 //----------------------------------------------------------------------------
 // Use ./runTest.exe --gtest_filter=*misc to run only this test
 //----------------------------------------------------------------------------
@@ -17,7 +17,7 @@
 #include
 #include
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 #define TESTID( s ) s##_GPU_MISC
 #else
 #define TESTID( s ) s##_CPU_MISC
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc
index 6e8657edca..4243e9fcec 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #include "mgOnGpuConfig.h"
 
@@ -20,7 +20,7 @@
 #include
 #include
 #include
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 #define TESTID( s ) s##_GPU_XXX
 #else
 #define TESTID( s ) s##_CPU_XXX
@@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx )
  assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM
  assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV
  // Fill in the input momenta
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM]
 #else
  mg5amcCpu::HostBufferMomenta hstMomenta( nevt );       // AOSOA[npagM][npar=4][np4=4][neppM]
@@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx )
  {
    for( int ievt = 0; ievt < nevt; ievt++ )
    {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
      using namespace mg5amcGpu;
 #else
      using namespace mg5amcCpu;
diff --git a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h
index 3593d9f169..5a3a5dc76f 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h
@@ -4,7 +4,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
 // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09
@@ -26,7 +26,7 @@
 //#include
 //#include
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc
index de87dcaf64..d3d01102fd 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc
+++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc
@@ -4,7 +4,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
 // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09
diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h
index fe7d686938..6551d8da81 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h
+++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h
@@ -214,7 +214,7 @@ namespace Parameters_sm_dependentCouplings
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-variable"  // e.g. <>
 #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <>
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 #pragma nv_diagnostic push
 #pragma nv_diag_suppress 177 // e.g. <>
 #endif
@@ -242,7 +242,7 @@ namespace Parameters_sm_dependentCouplings
      // End SM implementation - no special handling of vectors of floats as in EFT (#439)
      return out;
    }
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 #pragma GCC diagnostic pop
 #pragma nv_diagnostic pop
 #endif
@@ -258,7 +258,7 @@ namespace Parameters_sm_independentCouplings
 
 //==========================================================================
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
index 390766116b..881353abac 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MGONGPUCONFIG_H
 #define MGONGPUCONFIG_H 1
@@ -10,26 +10,13 @@
 // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel)
 #define MGONGPU_SUPPORTS_MULTICHANNEL 1
 
-// Is this a GPU (CUDA, HIP) or CPU implementation?
-#ifdef __CUDACC__
-#define MGONGPUCPP_GPUIMPL cuda
-#elif defined __HIPCC__
-#define MGONGPUCPP_GPUIMPL hip
-#else
-#undef MGONGPUCPP_GPUIMPL
-#endif
-
 // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12"
 // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs)
 
 // Choose if curand is supported for generating random numbers
-// For CUDA, by default, it is supported
-// For HIP, by default, it is not supported
 // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND
 #ifdef __CUDACC__
 #undef MGONGPU_HAS_NO_CURAND
-#elif defined __HIPCC__
-#define MGONGPU_HAS_NO_CURAND 1
 #else
 //#undef MGONGPU_HAS_NO_CURAND // default
 ////#define MGONGPU_HAS_NO_CURAND 1
@@ -65,28 +52,23 @@
 //#undef MGONGPU_HARDCODE_PARAM // default
 ////#define MGONGPU_HARDCODE_PARAM 1
 
-// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE)
+// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE)
+#ifndef __CUDACC__
+//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float)
+#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
+#endif
+
+// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE)
 #ifdef __CUDACC__
 #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float)
 //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float)
 //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
-
-// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE)
-#elif defined __HIPCC__
-#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
-
-// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE)
-#else
-//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float)
-#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
 #endif
 
-// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 #ifdef __CUDACC__
-#undef MGONGPU_NSIGHT_DEBUG // default in CUDA
+#undef MGONGPU_NSIGHT_DEBUG // default
 //#define MGONGPU_NSIGHT_DEBUG 1
-#else
-#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++
 #endif
 
 // SANITY CHECKS (floating point precision for everything but color algebra #537)
@@ -102,21 +84,17 @@
 #error You cannot use double precision for color algebra and single precision elsewhere
 #endif
 
-// SANITY CHECKS (CUDA complex number implementation)
-#ifdef __CUDACC__
-#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA
-#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA
-#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA
+// SANITY CHECKS (c++ complex number implementation)
+#ifndef __CUDACC__
+#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL
 #endif
 #endif
 
-// SANITY CHECKS (C++ complex number implementation)
-#ifndef MGONGPUCPP_GPUIMPL
-#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++
+// SANITY CHECKS (cuda complex number implementation)
+#ifdef __CUDACC__
+#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
 #endif
 #endif
@@ -153,7 +131,7 @@ namespace mgOnGpu
 // Alignment requirement for using reinterpret_cast with SIMD vectorized code
 // (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
 // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
  constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
 #endif
 
@@ -164,7 +142,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;
 
 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
+#ifdef __CUDACC__ // CUDA implementation has no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -194,9 +172,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif
 
-// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
-#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
+#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -208,8 +186,8 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */
 
-// Define empty CUDA/HIP declaration specifiers for C++
-#ifndef MGONGPUCPP_GPUIMPL
+// Define empty CUDA declaration specifiers for C++
+#ifndef __CUDACC__
 #define __global__
 #define __host__
 #define __device__
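To make the CHOOSE ONLY ONE comments above concrete, here is a hedged sketch (simplified from the mgOnGpuCxtypes.h diff that follows; fptype is double or float per the MGONGPU_FPTYPE_* switches, and the cucomplex branch is omitted for brevity) of how the complex-type macros resolve to a single cxtype:

  // Sketch only: how the MGONGPU_*CXTYPE_* switches pick the concrete complex type.
  #ifdef __CUDACC__ // cuda
  #if defined MGONGPU_CUCXTYPE_THRUST
  typedef thrust::complex<fptype> cxtype;
  #else
  typedef mgOnGpu::cxsmpl<fptype> cxtype; // also the MGONGPU_CUCXTYPE_CXSMPL choice
  #endif
  #else // c++
  #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX
  typedef std::complex<fptype> cxtype;
  #else
  typedef mgOnGpu::cxsmpl<fptype> cxtype; // the new default per the comments above
  #endif
  #endif

The sanity-check #error blocks above exist precisely because defining two of these switches at once would make this typedef ambiguous.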
diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h
index 4e7ab03fa2..0cb2f1db7e 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MGONGPUCXTYPES_H
 #define MGONGPUCXTYPES_H 1
@@ -19,7 +19,7 @@
 #include
 
 // Complex type in cuda: thrust or cucomplex or cxsmpl
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_THRUST
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661)
@@ -201,7 +201,7 @@ namespace mgOnGpu
 
 // --- Type definitions (complex type: cxtype)
 
-#ifdef MGONGPUCPP_GPUIMPL // cuda
+#ifdef __CUDACC__ // cuda
 #if defined MGONGPU_CUCXTYPE_THRUST
 typedef thrust::complex<fptype> cxtype;
 #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo
 
 //==========================================================================
 
-#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust
+#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust
 
 //------------------------------
 // CUDA - using thrust::complex
@@ -317,11 +317,11 @@ cxmake( const cxtype& c )
  return c;
 }
 
-#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST
+#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST
 
 //==========================================================================
 
-#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex
+#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex
 
 //------------------------------
 // CUDA - using cuComplex
@@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f
  return cxmake( c.real(), c.imag() );
 }
 
-#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX
 
 //==========================================================================
 
-#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex
+#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex
 
 //------------------------------
 // C++ - using std::complex
@@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do
 }
 #endif
 
-#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX
+#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX
 
 //==========================================================================
diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h
index 6f6cee64d6..a1cde16a67 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h
+++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MGONGPUFPTYPES_H
 #define MGONGPUFPTYPES_H 1
@@ -13,7 +13,7 @@
 
 //==========================================================================
 
-#ifdef MGONGPUCPP_GPUIMPL // cuda
+#ifdef __CUDACC__ // cuda
 
 //------------------------------
 // Floating point types - Cuda
@@ -57,11 +57,11 @@ fpsqrt( const fptype& f )
 #endif
 }
 
-#endif // #ifdef MGONGPUCPP_GPUIMPL
+#endif // #ifdef __CUDACC__
 
 //==========================================================================
 
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 
 //------------------------------
 // Floating point types - C++
@@ -85,7 +85,7 @@ fpsqrt( const fptype& f )
  return std::sqrt( f );
 }
 
-#endif // #ifndef MGONGPUCPP_GPUIMPL
+#endif // #ifndef __CUDACC__
 
 //==========================================================================
diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h
index 7904b93c61..9d3e82b1e3 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h
@@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */
 #endif
 #endif
 
-#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL)
+#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__)
 
  const int neppV = 1;
 
@@ -129,7 +129,7 @@ using mgOnGpu::bool_v;
 
 //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 
 // Printout to stream for user defined types
 
@@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v )
 
 #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
 
-#endif // #ifndef MGONGPUCPP_GPUIMPL
+#endif // #ifndef __CUDACC__
 
 //==========================================================================
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 
 //------------------------------
 // Vector types - CUDA
@@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b )
  return ( mask ? a : b );
 }
 
-#endif // #ifdef MGONGPUCPP_GPUIMPL
+#endif // #ifdef __CUDACC__
 
 //==========================================================================
 
 // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 typedef bool bool_sv;
 typedef fptype fptype_sv;
 typedef fptype2 fptype2_sv;
@@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref;
 #endif
 
 // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++
-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifdef __CUDACC__ /* clang-format off */
 inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); }
 #elif defined MGONGPU_CPPSIMD
 inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000
diff --git a/epochX/cudacpp/gg_ttg.mad/src/rambo.h b/epochX/cudacpp/gg_ttg.mad/src/rambo.h
index cd7e1008ea..e02ea52496 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/rambo.h
+++ b/epochX/cudacpp/gg_ttg.mad/src/rambo.h
@@ -4,7 +4,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 
 #include "mgOnGpuConfig.h"
@@ -18,7 +18,7 @@
 #include
 
 // Simplified rambo version for 2 to N (with N>=2) processes with massless particles
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -83,7 +83,7 @@ namespace mg5amcCpu
    static bool first = true;
    if( first )
    {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
      if constexpr( M_ACCESS::isOnDevice() ) // avoid
      {
        const int ievt0 = 0;
@@ -166,7 +166,7 @@ namespace mg5amcCpu
    wt = po2log;
    if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1];
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
    // issue warnings if weight is too small or too large
    static int iwarn[5] = { 0, 0, 0, 0, 0 };
    if( wt < -180. )
diff --git a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT
index 84a883fbb0..a134b5fef9 100644
--- a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT
+++ b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT
@@ -15,7 +15,6 @@ The full development team currently includes the following authors :
   Stephan Hageboeck (CERN)
   Olivier Mattelaer (Universite Catholique de Louvain, original author)
   Stefan Roiser (CERN, original author)
-  Joergen Teig (CERN)
   Andrea Valassi (CERN, original author)
   Zenny Wettersten (CERN)
 See https://github.com/madgraph5/madgraph4gpu for more details. For the full
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h
index c04628dfd1..4cafe0c997 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef BRIDGE_H
 #define BRIDGE_H 1
@@ -22,7 +22,7 @@
 #include
 #include
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -82,7 +82,7 @@ namespace mg5amcCpu
    Bridge& operator=( const Bridge& ) = delete;
    Bridge& operator=( Bridge&& ) = delete;
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
    /**
     * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads
     * (this is needed for BridgeKernel tests rather than for actual production use in Fortran)
@@ -149,7 +149,7 @@ namespace mg5amcCpu
    unsigned int m_nevt; // number of events
    int m_nGoodHel;      // the number of good helicities (-1 initially when they have not yet been calculated)
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
    int m_gputhreads; // number of gpu threads (default set from number of events, can be modified)
    int m_gpublocks;  // number of gpu blocks (default set from number of events, can be modified)
    mg5amcGpu::DeviceBuffer m_devMomentaF;
@@ -186,12 +186,12 @@ namespace mg5amcCpu
  // Forward declare transposition methods
  //
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 
  template<typename Tin, typename Tout>
  __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );
 
-#endif // MGONGPUCPP_GPUIMPL
+#endif // __CUDACC__
 
  template<typename Tin, typename Tout>
  void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );
@@ -208,7 +208,7 @@ namespace mg5amcCpu
  Bridge<FORTRANFPTYPE>::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F )
    : m_nevt( nevtF )
    , m_nGoodHel( -1 )
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
    , m_gputhreads( 256 )                  // default number of gpu threads
    , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads
    , m_devMomentaF( m_nevt )
@@ -232,7 +232,7 @@ namespace mg5amcCpu
  {
    if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" );
    if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" );
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
    if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) )
      throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) );
    while( m_nevt != m_gpublocks * m_gputhreads )
@@ -250,11 +250,11 @@ namespace mg5amcCpu
      std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
      mg5amcCpu::CPPProcess process( /*verbose=*/false );
      m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPUCPP_GPUIMPL
+#endif // __CUDACC__
    process.initProc( "../../Cards/param_card.dat" );
  }
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  template<typename FORTRANFPTYPE>
  void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads )
  {
@@ -268,7 +268,7 @@ namespace mg5amcCpu
  }
 #endif
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  template<typename FORTRANFPTYPE>
  void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta,
                                            const FORTRANFPTYPE* gs,
@@ -283,14 +283,14 @@ namespace mg5amcCpu
    constexpr int neppM = MemoryAccessMomenta::neppM;
    if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
    {
-      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
+      checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
    }
    else
    {
-      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
+      checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
      const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
      //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
-      gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+      dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
    }
    if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
    {
@@ -333,7 +333,7 @@ namespace mg5amcCpu
  }
 #endif
 
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
  template<typename FORTRANFPTYPE>
  void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta,
                                            const FORTRANFPTYPE* gs,
@@ -388,7 +388,7 @@ namespace mg5amcCpu
  // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
  //
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  template<typename Tin, typename Tout>
  __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
  {
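The Bridge.h comments above describe momenta stored as an AOSOA, momenta[npagM][npar][np4][neppM] with nevt = npagM*neppM. A short worked sketch of the flat index that layout implies may help (illustrative only; aosoaIndex is not a function in the patch):

  // Sketch only: flat offset of component ip4 of particle ipar in event ievt.
  inline int aosoaIndex( int ievt, int ipar, int ip4, int npar, int np4, int neppM )
  {
    const int ipagM = ievt / neppM; // page containing this event
    const int ieppM = ievt % neppM; // position of the event within the page
    return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
  }

With neppM events interleaved in the last dimension, consecutive GPU threads or SIMD lanes read consecutive fptype values, which is why dev_transposeMomentaF2C converts the Fortran layout before the kernels run.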
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc
index 90c7f2d3b8..cef4cb3c71 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc
@@ -1,11 +1,10 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #include "BridgeKernels.h"
 
-#include "GpuAbstraction.h"
 #include "MemoryAccessMomenta.h"
 
 #include
@@ -15,7 +14,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia
 
 //============================================================================
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -46,7 +45,7 @@ namespace mg5amcCpu
 
 //============================================================================
 
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 namespace mg5amcCpu
 {
 
@@ -97,7 +96,7 @@ namespace mg5amcCpu
 
 //============================================================================
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 {
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h
index 3efef8ce97..15eb4bff4d 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef BRIDGEKERNELS_H
 #define BRIDGEKERNELS_H 1
@@ -12,7 +12,7 @@
 #include "MatrixElementKernels.h"
 #include "MemoryBuffers.h"
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -49,7 +49,7 @@ namespace mg5amcCpu
 
  //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
  // A Bridge wrapper class encapsulating matrix element calculations on a CPU host
  class BridgeKernelHost final : public BridgeKernelBase
  {
@@ -89,7 +89,7 @@ namespace mg5amcCpu
 
  //--------------------------------------------------------------------------
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  // A Bridge wrapper class encapsulating matrix element calculations on a GPU device
  class BridgeKernelDevice : public BridgeKernelBase
  {
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc
index 010bc4cbd0..985b39f576 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc
@@ -1,16 +1,15 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #include "CommonRandomNumbers.h"
 
-#include "GpuAbstraction.h"
 #include "MemoryBuffers.h"
 #include "RandomNumberKernels.h"
 
 #include
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc
index c15b39844d..0b355a3c8d 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc
@@ -1,11 +1,10 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #include "CrossSectionKernels.h"
 
-#include "GpuAbstraction.h"
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessWeights.h"
 #include "MemoryBuffers.h"
@@ -78,7 +77,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL )
 
 //============================================================================
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -186,7 +185,7 @@ namespace mg5amcCpu
 
 //============================================================================
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 {
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h
index 4d9659e04e..7933ca4bbf 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef CROSSSECTIONKERNELS_H
 #define CROSSSECTIONKERNELS_H 1
@@ -13,7 +13,7 @@
 
 //============================================================================
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -96,7 +96,7 @@ namespace mg5amcCpu
  //--------------------------------------------------------------------------
 
  /*
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  // A class encapsulating the calculation of event statistics on a GPU device
  class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents
  {
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h
new file mode 100644
index 0000000000..64ce52f4b3
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef MG5AMC_CUDARUNTIME_H
+#define MG5AMC_CUDARUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include <cassert>
+#include <iostream>
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef __CUDACC__ /* clang-format off */
+#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
+inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != cudaSuccess )
+  {
+    printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == cudaSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+namespace mg5amcGpu
+{
+  // Instantiate a CudaRuntime at the beginning of the application's main to
+  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct CudaRuntime final
+  {
+    CudaRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~CudaRuntime() { tearDown( m_debug ); }
+    CudaRuntime( const CudaRuntime& ) = delete;
+    CudaRuntime( CudaRuntime&& ) = delete;
+    CudaRuntime& operator=( const CudaRuntime& ) = delete;
+    CudaRuntime& operator=( CudaRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl;
+      checkCuda( cudaSetDevice( 0 ) ); // SLOW!
+    }
+
+    // Tear down CUDA application (call cudaDeviceReset)
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck
+    // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
+    static void tearDown( const bool debug = true )
+    {
+      if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl;
+      checkCuda( cudaDeviceReset() );
+    }
+  };
+
+}
+#endif
+
+//--------------------------------------------------------------------------
+
+#endif // MG5AMC_CUDARUNTIME_H
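As its comments state, CudaRuntime.h is meant to be instantiated once at the beginning of the application's main. A minimal, hypothetical usage sketch (the surrounding main is not part of the patch):

  #include "CudaRuntime.h"
  int main( int argc, char** argv )
  {
  #ifdef __CUDACC__
    // RAII: cudaSetDevice(0) now, cudaDeviceReset() booked for shutdown
    mg5amcGpu::CudaRuntime cudaRuntime;
  #endif
    // ... run the usual check/runTest style workflow here ...
    return 0;
  }

Doing the cudaSetDevice(0) call up front keeps the one-off CUDA initialization cost out of any timers wrapped around the first real cudaMemcpy or kernel launch.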
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc
index 38c477c17a..eb56333b03 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc
@@ -1,9 +1,9 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
-#include "GpuRuntime.h"
+#include "CudaRuntime.h"
 #include "MemoryBuffers.h"
 #include "RandomNumberKernels.h"
 
@@ -114,7 +114,7 @@ namespace mg5amcCpu
    /*
    printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() );
    fptype* data = m_rnarray.data();
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
    if( m_rnarray.isOnDevice() )
    {
      data = new fptype[m_rnarray.size()]();
@@ -123,7 +123,7 @@ namespace mg5amcCpu
 #endif
    for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ )
      printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] );
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
    if( m_rnarray.isOnDevice() ) delete[] data;
 #endif
    */
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h
index b425a5bade..48b51e0a49 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef EventStatistics_H
 #define EventStatistics_H 1
@@ -16,7 +16,7 @@
 #include
 #include
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h
index d2ff326e20..fd7734ce42 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MADGRAPHTEST_H_
 #define MADGRAPHTEST_H_ 1
@@ -21,7 +21,7 @@
 #include
 #include
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 using mg5amcGpu::CPPProcess;
 #else
 using mg5amcCpu::CPPProcess;
@@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam
 
 // Since we link both the CPU-only and GPU tests into the same executable, we prevent
 // a multiply defined symbol by only compiling this in the non-CUDA phase:
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 
 /// Compare momenta and matrix elements.
 /// This uses an implementation of TestDriverBase to run a madgraph workflow,
@@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME )
  }
 }
 
-#endif // MGONGPUCPP_GPUIMPL
+#endif // __CUDACC__
 
 #endif /* MADGRAPHTEST_H_ */
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc
index d6d6c4f179..30257195b6 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc
@@ -1,12 +1,12 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #include "MatrixElementKernels.h"
 
 #include "CPPProcess.h"
-#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation
+#include "CudaRuntime.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryBuffers.h"
 
@@ -14,7 +14,7 @@
 
 //============================================================================
 
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 namespace mg5amcCpu
 {
 
@@ -143,7 +143,7 @@ namespace mg5amcCpu
 
 //============================================================================
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 {
 
@@ -202,13 +202,13 @@ namespace mg5amcGpu
    PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
    DeviceBufferHelicityMask devIsGoodHel( ncomb );
    // ... 0d1. Compute good helicity mask on the device
-    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
+    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkGpu( gpuPeekAtLastError() );
+    checkCuda( cudaPeekAtLastError() );
    // ... 0d2. Copy back good helicity mask to the host
    copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
    // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -219,19 +219,19 @@ namespace mg5amcGpu
 
  void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
  {
-    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
+    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
    constexpr unsigned int sharedMemSize = 0;
 #else
    constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkGpu( gpuPeekAtLastError() );
-    checkGpu( gpuDeviceSynchronize() );
+    checkCuda( cudaPeekAtLastError() );
+    checkCuda( cudaDeviceSynchronize() );
  }
 
  //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h
index 72bd8f195b..23e84757a2 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MATRIXELEMENTKERNELS_H
 #define MATRIXELEMENTKERNELS_H 1
@@ -10,7 +10,7 @@
 
 #include "MemoryBuffers.h"
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -81,7 +81,7 @@ namespace mg5amcCpu
 
  //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
  // A class encapsulating matrix element calculations on a CPU host
  class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents
  {
@@ -130,7 +130,7 @@ namespace mg5amcCpu
 
  //--------------------------------------------------------------------------
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  // A class encapsulating matrix element calculations on a GPU device
  class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
  {
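The MatrixElementKernels.cc hunks above replace the gpuLaunchKernel/gpuLaunchKernelSharedMem wrappers with native CUDA launches. A hedged equivalence sketch (k is a hypothetical kernel, not one from the patch):

  __global__ void k( const fptype* in, fptype* out ); // hypothetical kernel
  void launch( int gpublocks, int gputhreads, size_t sharedMemSize, const fptype* in, fptype* out )
  {
    k<<<gpublocks, gputhreads>>>( in, out );                // plain launch
    k<<<gpublocks, gputhreads, sharedMemSize>>>( in, out ); // with dynamic shared memory
    checkCuda( cudaPeekAtLastError() );                     // surface launch errors immediately
  }

Since kernel launches are asynchronous, the cudaPeekAtLastError plus cudaDeviceSynchronize pair in computeMatrixElements is what actually turns a failed launch into a visible error.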
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h index 38fade09fb..0ac4faa3c7 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h index 40cb089135..e2988d39f3 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h index 08faccff0f..e9b197368e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h index 7756a71621..3093e6ed18 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "Parameters_sm.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - gpuMallocHost( &( this->m_data ), this->bytes() ); + checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); } virtual ~PinnedHostBufferBase() { - gpuFreeHost( this->m_data ); + checkCuda( cudaFreeHost( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - gpuMalloc( &( this->m_data ), this->bytes() ); + checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); } virtual ~DeviceBufferBase() { - gpuFree( this->m_data ); + checkCuda( cudaFree( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); } #endif diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index b723717621..09575d4a91 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v.
3.5.0_lo_vect, 2023-06-09 @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -45,7 +46,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @1 -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -79,7 +80,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -89,7 +90,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -117,13 +118,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -150,7 +151,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -186,7 +187,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -199,10 +200,8 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop -#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -500,7 +499,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -557,7 +556,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -616,7 +615,7 @@ namespace mg5amcCpu 
MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -679,8 +678,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -721,9 +720,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); - //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -760,7 +759,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] + // [Use __NVCC__ instead of __CUDACC__ here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -825,12 +824,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -851,7 +850,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -980,9 +979,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); - gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); + checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1006,7 +1005,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1027,7 +1026,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int 
helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1041,12 +1040,9 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - -#include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1074,7 +1070,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ +#ifdef __CUDACC__ // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1278,7 +1274,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index ff2cb4ab9a..0edca1b52a 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h new file mode 120000 index 0000000000..ce9e1a487a --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h @@ -0,0 +1 @@ +../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc index 1bad694d1c..f1e75b9252 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,7 +12,6 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" -#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -64,7 +63,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -78,7 +77,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
-#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -104,11 +103,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode { RamboHost = 1, RamboDevice = 2 }; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -149,7 +148,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs" ); #endif } else if( arg == "--curhst" ) @@ -166,7 +165,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -240,13 +239,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -264,14 +263,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ - // --- 00. Initialise GPU - // Instantiate a GpuRuntime at the beginnining of the application's main. - // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. - const std::string cdinKey = "00 GpuInit"; + // --- 00. Initialise cuda + // Instantiate a CudaRuntime at the beginning of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + const std::string cdinKey = "00 CudaInit"; timermap.start( cdinKey ); - GpuRuntime GpuRuntime( debug ); + CudaRuntime cudaRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -293,7 +292,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -301,7 +300,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -309,7 +308,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -317,7 +316,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -334,7 +333,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -343,7 +342,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -352,7 +351,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -360,7 +359,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -368,7 +367,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -404,7 +403,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -422,7 +421,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -433,7 +432,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -441,7 +440,7 
@@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -483,7 +482,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -515,7 +514,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -560,7 +559,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -589,7 +588,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( !bridge ) { // --- 3b. CopyDToH MEs @@ -732,19 +731,15 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; -#elif defined __HIPCC__ - rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or HIP or C++? + // -- CUDA or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; -#elif defined __HIPCC__ - wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -769,12 +764,6 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif -#elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL - wrkflwtxt += "CXS:"; -#else - wrkflwtxt += "???:"; // no path to this statement -#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -800,7 +789,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -856,7 +845,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -877,8 +866,6 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" -#elif defined __HIPCC__ - << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -905,21 +892,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "Complex type = STD::COMPLEX" << std::endl +#endif #else - << "Complex type = ???" 
<< std::endl // no path to this statement... + << "Complex type = STD::COMPLEX" << std::endl #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -950,7 +937,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1046,14 +1033,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "\"STD::COMPLEX\"," << std::endl +#endif #else - << "\"???\"," << std::endl // no path to this statement... + << "\"STD::COMPLEX\"," << std::endl #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1061,7 +1048,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc index 79abbcc4f8..da68aa9255 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaInitial() { - gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); + getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaFinal() { - gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h index 7c214cd74b..184089efd7 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h index 21d63beeac..188a72c2c9 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index bcb73d7f01..a0397e9ecc 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,139 +89,69 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If CUDA_HOME is not set, try to set it from the location of NVCC - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif - - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - - CUDATESTFLAGS = -lcuda - - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif - - # Set the host C++ compiler for GPUCC via "-ccbin " - # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) - GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif - -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif - - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 
6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +#=== Configure the CUDA compiler + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) +# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the location of nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + NVCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+ CUOPTFLAGS = -lineinfo + CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) +else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) +else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override NVCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= +endif - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif +# Set the host C++ compiler for nvcc via "-ccbin " +# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) +CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) +# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) +ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) +CUFLAGS += -allow-unsupported-compiler endif - #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -233,9 +163,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) - override GPUCC:=ccache $(GPUCC) +ifneq ($(NVCC),) + ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) + override NVCC:=ccache $(NVCC) endif endif @@ -259,7 +189,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - GPUFLAGS+= -Xcompiler -mno-float128 + CUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -269,10 +199,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -323,10 +253,7 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + ifeq ($(NVCC),) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -401,13 +328,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -416,7 +343,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + CUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -425,7 +352,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + CUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -477,11 +404,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -494,7 +421,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) +ifneq ($(NVCC),) 
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -525,16 +452,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -543,14 +469,11 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) -# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifeq ($(findstring nvcc,$(GPUCC)),nvcc) - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math -else - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math +ifneq ($(NVCC),) +$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math endif endif @@ -566,10 +489,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(GPUCC),) -GPUFLAGS += -Wno-deprecated-builtins +ifneq ($(NVCC),) +CUFLAGS += -Xcompiler -Wno-deprecated-builtins endif endif @@ -577,8 +500,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(NVCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -605,7 +528,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -617,11 +540,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -638,16 +561,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
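The link rules above complete a pattern set up earlier in this makefile: the $(BUILDDIR)/%_cu.o rule compiles the very same .cc sources a second time with "$(NVCC) ... -x cu", so one source tree feeds both the mg5amc_<proc>_cpp and mg5amc_<proc>_cuda libraries. This only works because the sources select their namespace on __CUDACC__, as in the diffs above. A minimal self-contained sketch of that single-source idiom (the file and the backendName() helper are illustrative, not taken from the repository):

#include <cstdio>

#ifdef __CUDACC__
namespace mg5amcGpu // compiled via 'nvcc -x cu demo.cc'
#else
namespace mg5amcCpu // compiled via 'g++ demo.cc'
#endif
{
  // In the real CPPProcess.cc the GPU branch also declares __global__ kernels;
  // a plain helper is enough to show the mechanism here.
  inline const char* backendName()
  {
#ifdef __CUDACC__
    return "CUDA";
#else
    return "C++";
#endif
  }
}

int main()
{
#ifdef __CUDACC__
  using namespace mg5amcGpu;
#else
  using namespace mg5amcCpu;
#endif
  printf( "backend: %s\n", backendName() );
  return 0;
}

Building the same file both ways produces symbols in two distinct namespaces, which is why the check.exe and gcheck.exe executables can coexist without duplicating sources.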
#-------------------------------------------------------------------------------
@@ -673,17 +596,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA
 $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe)
 	$(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe)
-ifneq ($(GPUCC),)
+ifneq ($(NVCC),)
 ifneq ($(shell $(CXX) --version | grep ^Intel),)
-$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy')
-$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9')
+$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy')
+$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9')
 endif
 ifeq ($(UNAME_S),Darwin)
 $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375
 endif
 $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe)
-	$(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe)
+	$(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe)
 endif
#-------------------------------------------------------------------------------
@@ -695,7 +618,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt
 $(testmain): $(BUILDDIR)/testxxx.o
 $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions
-ifneq ($(GPUCC),)
+ifneq ($(NVCC),)
 $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS)
 $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC)
 $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt
@@ -708,7 +631,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC)
 $(testmain): $(BUILDDIR)/testmisc.o
 $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests
-ifneq ($(GPUCC),)
+ifneq ($(NVCC),)
 $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS)
 $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC)
 $(testmain): $(BUILDDIR)/testmisc_cu.o
@@ -720,12 +643,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC)
 $(testmain): $(BUILDDIR)/runTest.o
 $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o
-ifneq ($(GPUCC),)
+ifneq ($(NVCC),)
 $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS)
 $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC)
 ifneq ($(shell $(CXX) --version | grep ^Intel),)
-$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy')
-$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9')
+$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy')
+$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9')
 else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc
 endif
@@ -749,14 +672,14 @@ $(testmain): LIBFLAGS += -lgomp
 endif
 endif
-ifeq ($(GPUCC),) # link only runTest.o
+ifeq ($(NVCC),) # link only runTest.o
 $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
 	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
 else # link both runTest.o and runTest_cu.o
 $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
+	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
 endif
 # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215
@@ -859,9 +782,9 @@ ifeq ($(USECCACHE),1)
 	ccache --version | head -1
 endif
 	@echo ""
-	@echo GPUCC=$(GPUCC)
-ifneq ($(GPUCC),)
-	$(GPUCC) --version
+	@echo NVCC=$(NVCC)
+ifneq ($(NVCC),)
+	$(NVCC) --version
 endif
 	@echo ""
 	@echo CXX=$(CXX)
@@ -880,7 +803,7 @@ endif
 # Target: check (run the C++ test executable)
 # [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(GPUCC),)
+ifneq ($(NVCC),)
 check: runTest cmpFcheck cmpFGcheck
 else
 check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc
index 2b956730d4..f93c05b0b3 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "GpuRuntime.h"
+#include "CudaRuntime.h"
 
 extern "C"
 {
@@ -22,7 +22,7 @@ extern "C"
  * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
  * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
  */
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   using namespace mg5amcGpu;
 #else
   using namespace mg5amcCpu;
@@ -46,8 +46,8 @@ extern "C"
   */
   void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
   {
-#ifdef MGONGPUCPP_GPUIMPL
-    GpuRuntime::setUp();
+#ifdef __CUDACC__
+    CudaRuntime::setUp();
 #endif
     // Create a process object, read parm card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
@@ -69,8 +69,8 @@ extern "C"
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" );
     delete pbridge;
-#ifdef MGONGPUCPP_GPUIMPL
-    GpuRuntime::tearDown();
+#ifdef __CUDACC__
+    CudaRuntime::tearDown();
 #endif
   }
@@ -100,7 +100,7 @@ extern "C"
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
     pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol );
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc
index 3743934f41..2fb445372d 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #include "mgOnGpuConfig.h"
 
@@ -13,7 +13,7 @@
 
 //--------------------------------------------------------------------------
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -40,7 +40,7 @@ namespace mg5amcCpu
   private:
     const int m_nevt; // The number of events in each iteration
     int m_iiter;      // The iteration counter (for random number seeding)
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
     HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
     HostBufferMomenta m_hstMomenta;      // Memory buffers for momenta
    HostBufferWeights m_hstWeights;      // Memory buffers for sampling weights
@@ -105,7 +105,7 @@ namespace mg5amcCpu
 
 extern "C"
 {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   using namespace mg5amcGpu;
 #else
   using namespace mg5amcCpu;
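For orientation, the fbridge entry points patched above are driven from Fortran in a create/sequence/delete lifecycle. The following is a minimal C++ stand-in for that Fortran caller, assuming only the fbridgecreate_ signature shown in the hunks above; the fbridgedelete_ argument list and the link against the fbridge object library are assumptions, and fbridgesequence_ is elided entirely:

  // Hypothetical sketch of the Fortran-side call sequence, written in C++.
  // CppObjectInFortran is the opaque handle type used by the real fbridge.cc;
  // the fbridgedelete_ prototype below is illustrative, not a confirmed signature.
  class CppObjectInFortran; // opaque handle, as in fbridge.cc

  extern "C"
  {
    void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F );
    void fbridgedelete_( CppObjectInFortran** ppbridge );
  }

  int main()
  {
    const int nevt = 256, npar = 5, np4 = 4; // gg_ttg: 5 external particles, 4-momenta
    CppObjectInFortran* bridge = nullptr;
    fbridgecreate_( &bridge, &nevt, &npar, &np4 ); // sets up the CUDA runtime, reads the param card
    // ... fill Fortran-layout momenta and call fbridgesequence_( &bridge, ... ) here ...
    fbridgedelete_( &bridge ); // tears down the CUDA runtime
    return 0;
  }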
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc index 2bd7a9fcf9..989aba1fdc 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc index 6e8657edca..4243e9fcec 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h index 3593d9f169..5a3a5dc76f 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc index de87dcaf64..d3d01102fd 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h index fe7d686938..6551d8da81 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h @@ -214,7 +214,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -242,7 +242,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -258,7 +258,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h index d4e37d19b3..6c0c4919e9 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,26 +10,13 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL -// Is this a GPU (CUDA, HIP) or CPU implementation? -#ifdef __CUDACC__ -#define MGONGPUCPP_GPUIMPL cuda -#elif defined __HIPCC__ -#define MGONGPUCPP_GPUIMPL hip -#else -#undef MGONGPUCPP_GPUIMPL -#endif - // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For CUDA, by default, it is supported -// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND -#elif defined __HIPCC__ -#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -65,28 +52,23 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#ifndef __CUDACC__ +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) +#endif + +// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) -#elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#else -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default in CUDA +#undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 -#else -#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -102,21 +84,17 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (CUDA complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA -#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA -#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA +// SANITY CHECKS (c++ complex number implementation) +#ifndef __CUDACC__ +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL #endif #endif -// SANITY CHECKS (C++ complex number implementation) -#ifndef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ +// SANITY CHECKS (cuda complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif 
 #endif
@@ -153,7 +131,7 @@ namespace mgOnGpu
   // Alignment requirement for using reinterpret_cast with SIMD vectorized code
   // (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
   // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
   constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
 #endif
@@ -164,7 +142,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;
 
 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
+#ifdef __CUDACC__ // CUDA implementation has no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -194,9 +172,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif
 
-// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
-#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
+#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -208,8 +186,8 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */
 
-// Define empty CUDA/HIP declaration specifiers for C++
-#ifndef MGONGPUCPP_GPUIMPL
+// Define empty CUDA declaration specifiers for C++
+#ifndef __CUDACC__
 #define __global__
 #define __host__
 #define __device__
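The cppAlign = 64 constant kept by the hunks above exists because reinterpret_cast to SIMD vector types requires suitably aligned memory (the comment warns of segfaults otherwise). A minimal sketch of the requirement, assuming C++17 std::aligned_alloc rather than the project's own buffer classes:

  // Illustrative only: shows the 64-byte alignment cppAlign enforces.
  #include <cstdio>
  #include <cstdint>
  #include <cstdlib>

  constexpr int cppAlign = 64; // as in mgOnGpuConfig.h: 64 bytes = 512 bits

  int main()
  {
    // NB: for std::aligned_alloc the size must be a multiple of the alignment
    double* buf = static_cast<double*>( std::aligned_alloc( cppAlign, 16 * sizeof( double ) ) );
    std::printf( "aligned to 64 bytes? %s\n",
                 reinterpret_cast<std::uintptr_t>( buf ) % cppAlign == 0 ? "yes" : "no" );
    std::free( buf );
    return 0;
  }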
diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h
index 4e7ab03fa2..0cb2f1db7e 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MGONGPUCXTYPES_H
 #define MGONGPUCXTYPES_H 1
 
@@ -19,7 +19,7 @@
 #include
 
 // Complex type in cuda: thrust or cucomplex or cxsmpl
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_THRUST
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661)
@@ -201,7 +201,7 @@ namespace mgOnGpu
 
 // --- Type definitions (complex type: cxtype)
-#ifdef MGONGPUCPP_GPUIMPL // cuda
+#ifdef __CUDACC__ // cuda
 #if defined MGONGPU_CUCXTYPE_THRUST
 typedef thrust::complex<fptype> cxtype;
 #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo
 
 //==========================================================================
 
-#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust
+#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust
 
 //------------------------------
 // CUDA - using thrust::complex
@@ -317,11 +317,11 @@ cxmake( const cxtype& c )
   return c;
 }
 
-#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST
+#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST
 
 //==========================================================================
 
-#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex
+#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex
 
 //------------------------------
 // CUDA - using cuComplex
@@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f
   return cxmake( c.real(), c.imag() );
 }
 
-#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX
 
 //==========================================================================
 
-#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex
+#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex
 
 //------------------------------
 // C++ - using std::complex
@@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do
 }
 #endif
 
-#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX
+#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX
 
 //==========================================================================
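A minimal sketch of the compile-time cxtype dispatch that these mgOnGpuCxtypes.h hunks revert to; std::complex stands in here for the cxsmpl default of the real header, and only the thrust branch mirrors the patch exactly:

  // Illustrative backend selection, not the production mgOnGpuCxtypes.h chain.
  #include <cstdio>

  #ifdef __CUDACC__
  #include <thrust/complex.h>
  typedef thrust::complex<double> cxtype; // MGONGPU_CUCXTYPE_THRUST default
  #else
  #include <complex>
  typedef std::complex<double> cxtype; // stand-in for MGONGPU_CPPCXTYPE_STDCOMPLEX/CXSMPL
  #endif

  int main()
  {
    const cxtype z( 3., 4. );
    std::printf( "|z|^2 = %f\n", z.real() * z.real() + z.imag() * z.imag() ); // 25.0
    return 0;
  }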
diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h
index 6f6cee64d6..a1cde16a67 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h
+++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MGONGPUFPTYPES_H
 #define MGONGPUFPTYPES_H 1
 
@@ -13,7 +13,7 @@
 
 //==========================================================================
 
-#ifdef MGONGPUCPP_GPUIMPL // cuda
+#ifdef __CUDACC__ // cuda
 
 //------------------------------
 // Floating point types - Cuda
@@ -57,11 +57,11 @@ fpsqrt( const fptype& f )
 #endif
 }
 
-#endif // #ifdef MGONGPUCPP_GPUIMPL
+#endif // #ifdef __CUDACC__
 
 //==========================================================================
 
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 
 //------------------------------
 // Floating point types - C++
@@ -85,7 +85,7 @@ fpsqrt( const fptype& f )
   return std::sqrt( f );
 }
 
-#endif // #ifndef MGONGPUCPP_GPUIMPL
+#endif // #ifndef __CUDACC__
 
 //==========================================================================
diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h
index 7904b93c61..9d3e82b1e3 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h
@@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */
 #endif
 #endif
 
-#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL)
+#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__)
 
   const int neppV = 1;
 
@@ -129,7 +129,7 @@ using mgOnGpu::bool_v;
 
 //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 
 // Printout to stream for user defined types
 
@@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v )
 
 #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
 
-#endif // #ifndef MGONGPUCPP_GPUIMPL
+#endif // #ifndef __CUDACC__
 
 //==========================================================================
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 
 //------------------------------
 // Vector types - CUDA
@@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b )
   return ( mask ? a : b );
 }
 
-#endif // #ifdef MGONGPUCPP_GPUIMPL
+#endif // #ifdef __CUDACC__
 
 //==========================================================================
 
 // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 typedef bool bool_sv;
 typedef fptype fptype_sv;
 typedef fptype2 fptype2_sv;
@@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref;
 #endif
 
 // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++
-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifdef __CUDACC__ /* clang-format off */
 inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); }
 #elif defined MGONGPU_CPPSIMD
 inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000
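The mgOnGpuVectors.h hunks above hinge on the scalar-or-vector idea: fptype_sv is a plain scalar under __CUDACC__ (one event per GPU thread, neppV = 1) and a SIMD vector of neppV events in C++. A sketch of how one kernel body then serves both builds, assuming the gcc/clang vector_size extension that the real header also uses:

  // Illustrative only: one arithmetic body, neppV lanes per call.
  #include <cstdio>

  #ifdef __CUDACC__
  typedef double fptype_sv; // scalar: one event per GPU thread
  constexpr int neppV = 1;
  #else
  typedef double fptype_sv __attribute__( ( vector_size( 32 ) ) ); // 4 doubles, AVX2-like
  constexpr int neppV = 4;
  #endif

  inline fptype_sv fpsquare( const fptype_sv& x ) { return x * x; } // valid in both cases

  int main()
  {
  #ifndef __CUDACC__
    fptype_sv v = { 1., 2., 3., 4. };
    v = fpsquare( v );
    for( int i = 0; i < neppV; i++ ) std::printf( "%f ", v[i] ); // 1 4 9 16
    std::printf( "\n" );
  #endif
    return 0;
  }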
diff --git a/epochX/cudacpp/gg_ttg.sa/src/rambo.h b/epochX/cudacpp/gg_ttg.sa/src/rambo.h
index cd7e1008ea..e02ea52496 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/rambo.h
+++ b/epochX/cudacpp/gg_ttg.sa/src/rambo.h
@@ -4,7 +4,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 
 #include "mgOnGpuConfig.h"
 
@@ -18,7 +18,7 @@
 #include
 
 // Simplified rambo version for 2 to N (with N>=2) processes with massless particles
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -83,7 +83,7 @@ namespace mg5amcCpu
     static bool first = true;
     if( first )
     {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
       if constexpr( M_ACCESS::isOnDevice() ) // avoid
       {
         const int ievt0 = 0;
@@ -166,7 +166,7 @@ namespace mg5amcCpu
     wt = po2log;
     if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1];
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
     // issue warnings if weight is too small or too large
     static int iwarn[5] = { 0, 0, 0, 0, 0 };
     if( wt < -180. )
diff --git a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT
index 84a883fbb0..a134b5fef9 100644
--- a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT
+++ b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT
@@ -15,7 +15,6 @@ The full development team currently includes the following authors :
   Stephan Hageboeck (CERN)
   Olivier Mattelaer (Universite Catholique de Louvain, original author)
   Stefan Roiser (CERN, original author)
-  Joergen Teig (CERN)
   Andrea Valassi (CERN, original author)
   Zenny Wettersten (CERN)
 See https://github.com/madgraph5/madgraph4gpu for more details. For the full
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h
index c04628dfd1..4cafe0c997 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 #ifndef BRIDGE_H
 #define BRIDGE_H 1
 
@@ -22,7 +22,7 @@
 #include
 #include
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -82,7 +82,7 @@ namespace mg5amcCpu
     Bridge& operator=( const Bridge& ) = delete;
     Bridge& operator=( Bridge&& ) = delete;
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     /**
      * Set the gpublocks and gputhreads for the gpusequence - throws if nevt != gpublocks*gputhreads
      * (this is needed for BridgeKernel tests rather than for actual production use in Fortran)
@@ -149,7 +149,7 @@ namespace mg5amcCpu
     unsigned int m_nevt; // number of events
     int m_nGoodHel;      // the number of good helicities (-1 initially when they have not yet been calculated)
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     int m_gputhreads; // number of gpu threads (default set from number of events, can be modified)
     int m_gpublocks;  // number of gpu blocks (default set from number of events, can be modified)
     mg5amcGpu::DeviceBuffer m_devMomentaF;
@@ -186,12 +186,12 @@ namespace mg5amcCpu
   // Forward declare transposition methods
   //
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 
   template<typename Tin, typename Tout>
   __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );
 
-#endif // MGONGPUCPP_GPUIMPL
+#endif // __CUDACC__
 
   template<typename Tin, typename Tout>
   void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );
 
@@ -208,7 +208,7 @@ namespace mg5amcCpu
   Bridge<FORTRANFPTYPE>::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F )
     : m_nevt( nevtF )
     , m_nGoodHel( -1 )
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     , m_gputhreads( 256 )                // default number of gpu threads
    , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads
     , m_devMomentaF( m_nevt )
@@ -232,7 +232,7 @@ namespace mg5amcCpu
  {
     if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" );
     if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" );
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) )
       throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) );
     while( m_nevt != m_gpublocks * m_gputhreads )
@@ -250,11 +250,11 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
     mg5amcCpu::CPPProcess process( /*verbose=*/false );
     m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPUCPP_GPUIMPL
+#endif // __CUDACC__
     process.initProc( "../../Cards/param_card.dat" );
   }
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads )
   {
@@ -268,7 +268,7 @@ namespace mg5amcCpu
   }
 #endif
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta,
                                             const FORTRANFPTYPE* gs,
@@ -283,14 +283,14 @@ namespace mg5amcCpu
     constexpr int neppM = MemoryAccessMomenta::neppM;
     if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
     {
-      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
+      checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
     }
     else
     {
-      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
+      checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
       const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
       //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
-      gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+      dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
     }
     if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
     {
@@ -333,7 +333,7 @@ namespace mg5amcCpu
   }
 #endif
 
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta,
                                             const FORTRANFPTYPE* gs,
@@ -388,7 +388,7 @@ namespace mg5amcCpu
   // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
   //
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   template<typename Tin, typename Tout>
   __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
   {
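The Bridge.h hunks above revert the Fortran-to-C++ momenta transposition to explicit CUDA calls. A host-side sketch of the underlying index mapping (a hypothetical helper, not the production hst_transposeMomentaF2C): Fortran stores momenta(np4,npar,nevt), contiguous in np4 first, while the C++ side wants the AOSOA[npagM][npar][np4][neppM] layout with nevt = npagM * neppM.

  #include <cstdio>
  #include <vector>

  // Illustrative AOS-to-AOSOA transposition; names and layout assumptions as stated above.
  void hstTransposeF2C( const double* in, double* out, int nevt, int npar, int np4, int neppM )
  {
    for( int ievt = 0; ievt < nevt; ievt++ )
      for( int ipar = 0; ipar < npar; ipar++ )
        for( int ip4 = 0; ip4 < np4; ip4++ )
        {
          const int ipagM = ievt / neppM; // destination page
          const int ieppM = ievt % neppM; // slot within the page
          out[( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM] =
            in[( ievt * npar + ipar ) * np4 + ip4];
        }
  }

  int main()
  {
    const int nevt = 8, npar = 4, np4 = 4, neppM = 4;
    std::vector<double> in( nevt * npar * np4, 1. ), out( in.size(), 0. );
    hstTransposeF2C( in.data(), out.data(), nevt, npar, np4, neppM );
    std::printf( "out[0]=%f\n", out[0] ); // 1.0
    return 0;
  }

The device kernel restored in the hunk does the same remapping with one element per thread, which is why it is launched with m_gpublocks * thrPerEvt blocks rather than m_gpublocks.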
#include "BridgeKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -15,7 +14,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -46,7 +45,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -97,7 +96,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h index 3efef8ce97..15eb4bff4d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc index 010bc4cbd0..985b39f576 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,16 +1,15 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" -#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc index c15b39844d..0b355a3c8d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #include "CrossSectionKernels.h"
 
-#include "GpuAbstraction.h"
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessWeights.h"
 #include "MemoryBuffers.h"
@@ -78,7 +77,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL )
 
 //============================================================================
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -186,7 +185,7 @@ namespace mg5amcCpu
 
 //============================================================================
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 {
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h
index 4d9659e04e..7933ca4bbf 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef CROSSSECTIONKERNELS_H
 #define CROSSSECTIONKERNELS_H 1
 
@@ -13,7 +13,7 @@
 
 //============================================================================
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -96,7 +96,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 
 /*
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   // A class encapsulating the calculation of event statistics on a GPU device
   class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents
   {
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h
new file mode 100644
index 0000000000..64ce52f4b3
--- /dev/null
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef MG5AMC_CUDARUNTIME_H
+#define MG5AMC_CUDARUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include
+#include
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef __CUDACC__ /* clang-format off */
+#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
+inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != cudaSuccess )
+  {
+    printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == cudaSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+namespace mg5amcGpu
+{
+  // Instantiate a CudaRuntime at the beginning of the application's main to
+  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct CudaRuntime final
+  {
+    CudaRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~CudaRuntime() { tearDown( m_debug ); }
+    CudaRuntime( const CudaRuntime& ) = delete;
+    CudaRuntime( CudaRuntime&& ) = delete;
+    CudaRuntime& operator=( const CudaRuntime& ) = delete;
+    CudaRuntime& operator=( CudaRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl;
+      checkCuda( cudaSetDevice( 0 ) ); // SLOW!
+    }
+
+    // Tear down CUDA application (call cudaDeviceReset)
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck
+    // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
+    static void tearDown( const bool debug = true )
+    {
+      if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl;
+      checkCuda( cudaDeviceReset() );
+    }
+  };
+
+}
+#endif
+
+//--------------------------------------------------------------------------
+
+#endif // MG5AMC_CUDARUNTIME_H
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc
index 38c477c17a..eb56333b03 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc
@@ -1,9 +1,9 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
-#include "GpuRuntime.h"
+#include "CudaRuntime.h"
 #include "MemoryBuffers.h"
 #include "RandomNumberKernels.h"
 
@@ -114,7 +114,7 @@ namespace mg5amcCpu
     /*
     printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() );
     fptype* data = m_rnarray.data();
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     if( m_rnarray.isOnDevice() )
     {
       data = new fptype[m_rnarray.size()]();
@@ -123,7 +123,7 @@ namespace mg5amcCpu
 #endif
     for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ )
       printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] );
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
     if( m_rnarray.isOnDevice() ) delete[] data;
 #endif
     */
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h
index b425a5bade..48b51e0a49 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef EventStatistics_H
 #define EventStatistics_H 1
 
@@ -16,7 +16,7 @@
 #include
 #include
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h
index d2ff326e20..fd7734ce42 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MADGRAPHTEST_H_
 #define MADGRAPHTEST_H_ 1
 
@@ -21,7 +21,7 @@
 #include
 #include
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 using mg5amcGpu::CPPProcess;
 #else
 using mg5amcCpu::CPPProcess;
@@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam
 
 // Since we link both the CPU-only and GPU tests into the same executable, we prevent
 // a multiply defined symbol by only compiling this in the non-CUDA phase:
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 
 /// Compare momenta and matrix elements.
 /// This uses an implementation of TestDriverBase to run a madgraph workflow,
@@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME )
   }
 }
 
-#endif // MGONGPUCPP_GPUIMPL
+#endif // __CUDACC__
 
 #endif /* MADGRAPHTEST_H_ */
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc
index d6d6c4f179..30257195b6 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc
@@ -1,12 +1,12 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #include "MatrixElementKernels.h"
 
 #include "CPPProcess.h"
-#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation
+#include "CudaRuntime.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryBuffers.h"
 
@@ -14,7 +14,7 @@
 
 //============================================================================
 
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 namespace mg5amcCpu
 {
@@ -143,7 +143,7 @@ namespace mg5amcCpu
 
 //============================================================================
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 {
@@ -202,13 +202,13 @@ namespace mg5amcGpu
     PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
     DeviceBufferHelicityMask devIsGoodHel( ncomb );
     // ... 0d1. Compute good helicity mask on the device
-    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
+    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkGpu( gpuPeekAtLastError() );
+    checkCuda( cudaPeekAtLastError() );
     // ... 0d2. Copy back good helicity mask to the host
     copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
     // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -219,19 +219,19 @@ namespace mg5amcGpu
 
   void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
   {
-    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
+    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
     constexpr unsigned int sharedMemSize = 0;
 #else
     constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkGpu( gpuPeekAtLastError() );
-    checkGpu( gpuDeviceSynchronize() );
+    checkCuda( cudaPeekAtLastError() );
+    checkCuda( cudaDeviceSynchronize() );
   }
 
   //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h
index 72bd8f195b..23e84757a2 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MATRIXELEMENTKERNELS_H
 #define MATRIXELEMENTKERNELS_H 1
 
@@ -10,7 +10,7 @@
 
 #include "MemoryBuffers.h"
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -81,7 +81,7 @@ namespace mg5amcCpu
 
 //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
   // A class encapsulating matrix element calculations on a CPU host
   class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents
   {
@@ -130,7 +130,7 @@ namespace mg5amcCpu
 
 //--------------------------------------------------------------------------
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  // A class encapsulating matrix element calculations on a GPU device
  class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
  {
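The MatrixElementKernels.cc hunks above restore the plain CUDA idiom: explicit <<<blocks, threads>>> launches followed by checkCuda on cudaPeekAtLastError and cudaDeviceSynchronize, bracketed by the CudaRuntime setUp/tearDown calls. A self-contained sketch of that pattern, with a simplified checkCuda and a dummy kernel standing in for sigmaKin:

  #include <cassert>
  #include <cstdio>
  #include <cuda_runtime.h>

  // Simplified stand-in: the real assertCuda also prints the error string and file:line.
  #define checkCuda( code ) \
    { assert( ( code ) == cudaSuccess ); }

  __global__ void scaleKernel( double* data, double factor )
  {
    const int i = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
    data[i] *= factor;
  }

  int main()
  {
    const int gpublocks = 2, gputhreads = 256, n = gpublocks * gputhreads;
    double* d = nullptr;
    checkCuda( cudaSetDevice( 0 ) ); // as in CudaRuntime::setUp
    checkCuda( cudaMalloc( &d, n * sizeof( double ) ) );
    checkCuda( cudaMemset( d, 0, n * sizeof( double ) ) );
    scaleKernel<<<gpublocks, gputhreads>>>( d, 2. );
    checkCuda( cudaPeekAtLastError() );   // catch launch-time errors
    checkCuda( cudaDeviceSynchronize() ); // catch asynchronous kernel errors
    checkCuda( cudaFree( d ) );
    checkCuda( cudaDeviceReset() );       // as in CudaRuntime::tearDown
    std::printf( "done\n" );
    return 0;
  }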
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h
index db73e4e064..c82a6c7635 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryAccessHelpers_H
 #define MemoryAccessHelpers_H 1
 
@@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper
     }
     else
     {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
       const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
       //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x );
       return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h
index 38fade09fb..0ac4faa3c7 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryAccessMomenta_H
 #define MemoryAccessMomenta_H 1
 
@@ -12,7 +12,7 @@
 #include "MemoryAccessHelpers.h"
 #include "MemoryAccessVectors.h"
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 using mg5amcGpu::CPPProcess;
 #else
 using mg5amcCpu::CPPProcess;
@@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1
 
   // Number of Events Per Page in the momenta AOSOA memory buffer layout
   // (these are all best kept as a compile-time constants: see issue #23)
-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifdef __CUDACC__ /* clang-format off */
   // -----------------------------------------------------------------------------------------------
  // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline
  // --- This is relevant to ensure coalesced access to momenta in global memory
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h
index 40cb089135..e2988d39f3 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 #ifndef MemoryAccessRandomNumbers_H
 #define MemoryAccessRandomNumbers_H 1
 
@@ -11,7 +11,7 @@
 #include "CPPProcess.h"
 #include "MemoryAccessHelpers.h"
 
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 using mg5amcGpu::CPPProcess;
 #else
 using mg5amcCpu::CPPProcess;
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h
index 08faccff0f..e9b197368e 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryAccessVectors_H
 #define MemoryAccessVectors_H 1
 
@@ -10,7 +10,7 @@
 
 #include "mgOnGpuVectors.h"
 
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 namespace mg5amcCpu // this is only needed for CPU SIMD vectorization
 {
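The MemoryBuffers.h hunks below return to explicit checkCuda( cudaMallocHost/cudaMalloc ) calls inside constructor/destructor pairs. A minimal RAII sketch of that buffer pattern (a hypothetical class with a simplified checkCuda, not the production PinnedHostBufferBase/DeviceBufferBase):

  #include <cassert>
  #include <cstddef>
  #include <cuda_runtime.h>

  #define checkCuda( code ) \
    { assert( ( code ) == cudaSuccess ); }

  // Illustrative RAII device buffer: acquire in the constructor, release in the
  // destructor, delete copies so ownership of the device pointer stays unique.
  template<typename T>
  class DeviceBufferSketch
  {
  public:
    explicit DeviceBufferSketch( std::size_t size ) : m_size( size ), m_data( nullptr )
    {
      checkCuda( cudaMalloc( &m_data, m_size * sizeof( T ) ) );
    }
    ~DeviceBufferSketch() { checkCuda( cudaFree( m_data ) ); }
    DeviceBufferSketch( const DeviceBufferSketch& ) = delete;
    DeviceBufferSketch& operator=( const DeviceBufferSketch& ) = delete;
    T* data() { return m_data; }
    std::size_t bytes() const { return m_size * sizeof( T ); }
  private:
    std::size_t m_size;
    T* m_data;
  };

  int main()
  {
    DeviceBufferSketch<double> buf( 1024 ); // freed automatically at end of scope
    checkCuda( cudaMemset( buf.data(), 0, buf.bytes() ) );
    return 0;
  }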
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "Parameters_sm.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - gpuMallocHost( &( this->m_data ), this->bytes() ); + checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); } virtual ~PinnedHostBufferBase() { - gpuFreeHost( this->m_data ); + checkCuda( cudaFreeHost( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - gpuMalloc( &( this->m_data ), this->bytes() ); + checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); } virtual ~DeviceBufferBase() { - gpuFree( this->m_data ); + checkCuda( cudaFree( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index 0d88d93225..53ef4c5751 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -45,7 +46,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g WEIGHTED<=4 @1 -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -79,7 +80,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -89,7 +90,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -117,13 +118,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -150,7 +151,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -186,7 +187,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -199,10 +200,8 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop -#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -2475,7 +2474,7 @@ namespace mg5amcCpu { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -2532,7 +2531,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // === C++ 
START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -2591,7 +2590,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -2686,8 +2685,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, 1, 1 } }; -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -2729,9 +2728,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); - //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -2769,7 +2768,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] + // [Use __NVCC__ instead of __CUDACC__ here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -2834,12 +2833,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -2860,7 +2859,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -2989,9 +2988,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); - gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); + checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -3015,7 +3014,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for 
cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -3036,7 +3035,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -3050,12 +3049,9 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - -#include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -3083,7 +3079,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ +#ifdef __CUDACC__ // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -3287,7 +3283,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h index 5fa603d43c..b3323a7a84 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h new file mode 120000 index 0000000000..ce9e1a487a --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h @@ -0,0 +1 @@ +../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc index 1bad694d1c..f1e75b9252 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,7 +12,6 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" -#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -64,7 +63,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -78,7 +77,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
-#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; #endif @@ -104,11 +103,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode { @@ -116,7 +115,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -149,7 +148,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs" ); #endif } else if( arg == "--curhst" ) { @@ -166,7 +165,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -240,13 +239,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -264,14 +263,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ - // --- 00. Initialise GPU - // Instantiate a GpuRuntime at the beginnining of the application's main. - // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. - const std::string cdinKey = "00 GpuInit"; + // --- 00. Initialise CUDA + // Instantiate a CudaRuntime at the beginning of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + const std::string cdinKey = "00 CudaInit"; timermap.start( cdinKey ); - GpuRuntime GpuRuntime( debug ); + CudaRuntime cudaRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -293,7 +292,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -301,7 +300,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -309,7 +308,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -317,7 +316,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -334,7 +333,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -343,7 +342,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -352,7 +351,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -360,7 +359,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -368,7 +367,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -404,7 +403,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -422,7 +421,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -433,7 +432,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -441,7 +440,7 
@@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -483,7 +482,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -515,7 +514,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -560,7 +559,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -589,7 +588,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( !bridge ) { // --- 3b. CopyDToH MEs @@ -732,19 +731,15 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; -#elif defined __HIPCC__ - rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or HIP or C++? + // -- CUDA or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; -#elif defined __HIPCC__ - wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -769,12 +764,6 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif -#elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL - wrkflwtxt += "CXS:"; -#else - wrkflwtxt += "???:"; // no path to this statement -#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -800,7 +789,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -856,7 +845,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -877,8 +866,6 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" -#elif defined __HIPCC__ - << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -905,21 +892,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "Complex type = STD::COMPLEX" << std::endl +#endif #else - << "Complex type = ???" 
<< std::endl // no path to this statement... + << "Complex type = STD::COMPLEX" << std::endl #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -950,7 +937,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1046,14 +1033,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "\"STD::COMPLEX\"," << std::endl +#endif #else - << "\"???\"," << std::endl // no path to this statement... + << "\"STD::COMPLEX\"," << std::endl #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1061,7 +1048,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc index 79abbcc4f8..da68aa9255 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
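// The RamboSamplingKernels.cc hunks below swap the gpuLaunchKernel wrapper for CUDA's native triple-chevron
// launch. A sketch of the equivalence, using only the block/thread names that appear in those hunks:
//
//   // wrapper form (abstraction layer removed by this patch)
//   gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() );
//   // native CUDA form (restored below): a grid of m_gpublocks blocks with m_gputhreads threads each
//   getMomentaInitialDevice<<<m_gpublocks, m_gputhreads>>>( m_energy, m_momenta.data() );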
#include "RamboSamplingKernels.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaInitial() { - gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); + getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaFinal() { - gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h index 7c214cd74b..184089efd7 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h index 21d63beeac..188a72c2c9 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index bcb73d7f01..a0397e9ecc 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
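# A usage sketch for the CUDA architecture knob configured below (the 60/70/80 values come from the comments in
# this makefile; any other SM value is an assumption to verify against the nvcc documentation):
#   MADGRAPH_CUDA_ARCHITECTURE=80 make       # single arch, e.g. A100
#   MADGRAPH_CUDA_ARCHITECTURE=60,70 make    # comma-separated list: embed device code for both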
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,139 +89,69 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If CUDA_HOME is not set, try to set it from the location of NVCC - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif - - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - - CUDATESTFLAGS = -lcuda - - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif - - # Set the host C++ compiler for GPUCC via "-ccbin " - # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) - GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif - -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif - - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 
6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +#=== Configure the CUDA compiler + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) +# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the location of nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + NVCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+ CUOPTFLAGS = -lineinfo + CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) +else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) +else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override NVCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= +endif - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif +# Set the host C++ compiler for nvcc via "-ccbin " +# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) +CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) +# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) +ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) +CUFLAGS += -allow-unsupported-compiler endif - #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -233,9 +163,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) - override GPUCC:=ccache $(GPUCC) +ifneq ($(NVCC),) + ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) + override NVCC:=ccache $(NVCC) endif endif @@ -259,7 +189,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - GPUFLAGS+= -Xcompiler -mno-float128 + CUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -269,10 +199,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -323,10 +253,7 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + ifeq ($(NVCC),) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -401,13 +328,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -416,7 +343,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + CUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -425,7 +352,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + CUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -477,11 +404,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -494,7 +421,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) +ifneq ($(NVCC),) 
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -525,16 +452,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -543,14 +469,11 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) -# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifeq ($(findstring nvcc,$(GPUCC)),nvcc) - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math -else - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math +ifneq ($(NVCC),) +$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math endif endif @@ -566,10 +489,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(GPUCC),) -GPUFLAGS += -Wno-deprecated-builtins +ifneq ($(NVCC),) +CUFLAGS += -Xcompiler -Wno-deprecated-builtins endif endif @@ -577,8 +500,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(NVCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -605,7 +528,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -617,11 +540,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -638,16 +561,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -673,17 +596,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -695,7 +618,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -708,7 +631,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -720,12 +643,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -749,14 +672,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(GPUCC),) # link only runTest.o +ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -859,9 +782,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo GPUCC=$(GPUCC) -ifneq ($(GPUCC),) - $(GPUCC) --version + @echo NVCC=$(NVCC) +ifneq ($(NVCC),) + $(NVCC) --version endif @echo "" @echo CXX=$(CXX) @@ -880,7 +803,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(GPUCC),) +ifneq ($(NVCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc index 2b956730d4..f93c05b0b3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::setUp(); +#ifdef __CUDACC__ + CudaRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? 
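As context for the fbridge.cc hunks above and below: the Fortran interface works through an opaque handle, with Fortran holding only a CppObjectInFortran** that the C++ side allocates, casts back, and frees. A minimal self-contained sketch of that pattern follows (the Toy* names are illustrative, not part of the plugin):

#include <stdexcept>

struct CppObjectInFortran { virtual ~CppObjectInFortran() {} }; // polymorphic opaque base, as in fbridge.cc

struct ToyBridge final : public CppObjectInFortran
{
  explicit ToyBridge( int nevt ) : m_nevt( nevt ) {}
  int m_nevt; // number of events per call, fixed at creation time
};

extern "C"
{
  // Fortran: CALL toybridgecreate( handle, nevt ) -- trailing underscore for the Fortran linker
  void toybridgecreate_( CppObjectInFortran** pphandle, const int* pnevt )
  {
    *pphandle = new ToyBridge( *pnevt );
  }
  // Fortran: CALL toybridgedelete( handle )
  void toybridgedelete_( CppObjectInFortran** pphandle )
  {
    ToyBridge* phandle = dynamic_cast<ToyBridge*>( *pphandle );
    if( phandle == 0 ) throw std::runtime_error( "toybridgedelete_: invalid handle" );
    delete phandle;
  }
}

int main() // exercise the round trip from C++ for self-containment
{
  CppObjectInFortran* handle = 0;
  const int nevt = 16;
  toybridgecreate_( &handle, &nevt );
  toybridgedelete_( &handle );
  return 0;
}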
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::tearDown(); +#ifdef __CUDACC__ + CudaRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc index 3743934f41..2fb445372d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc index 461ec5c3a5..572e28aaea 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
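The #ifdef __CUDACC__ namespace switch that recurs in these hunks is what lets a single translation unit serve both backends: nvcc compiles the code into mg5amcGpu, a plain C++ compiler compiles the same file into mg5amcCpu, and the two object files can then coexist in one link. A minimal sketch of the pattern:

#include <iostream>

// One source file, two namespaces: nvcc takes the GPU branch, g++/clang++ the CPU branch
#ifdef __CUDACC__
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  inline const char* backendName()
  {
#ifdef __CUDACC__
    return "cuda";
#else
    return "cpp";
#endif
  }
}

#ifdef __CUDACC__
using namespace mg5amcGpu;
#else
using namespace mg5amcCpu;
#endif

int main() { std::cout << backendName() << std::endl; return 0; }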
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc index 2bd7a9fcf9..989aba1fdc 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc index 6e8657edca..4243e9fcec 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
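The TESTID macros changed above rely on token pasting to give the CPU and GPU builds of the same test file distinct symbol and test-suite names, which is what allows both objects to be linked into a single runTest.exe without multiply defined symbols. A self-contained sketch (printing the generated name rather than registering a googletest case):

#include <iostream>

#ifdef __CUDACC__
#define TESTID( s ) s##_GPU_XXX
#else
#define TESTID( s ) s##_CPU_XXX
#endif
#define XSTRINGIFY( s ) #s
#define STRINGIFY( s ) XSTRINGIFY( s )

int main()
{
  // With nvcc this prints "SIGMA_SM_GPU_XXX", with g++ it prints "SIGMA_SM_CPU_XXX"
  std::cout << STRINGIFY( TESTID( SIGMA_SM ) ) << std::endl;
  return 0;
}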
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h index f772885631..ee2fcbbde5 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc index de87dcaf64..d3d01102fd 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h index fe7d686938..6551d8da81 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h @@ -214,7 +214,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -242,7 +242,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -258,7 +258,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h index d4e37d19b3..6c0c4919e9 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,26 +10,13 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL -// Is this a GPU (CUDA, HIP) or CPU implementation? -#ifdef __CUDACC__ -#define MGONGPUCPP_GPUIMPL cuda -#elif defined __HIPCC__ -#define MGONGPUCPP_GPUIMPL hip -#else -#undef MGONGPUCPP_GPUIMPL -#endif - // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For CUDA, by default, it is supported -// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND -#elif defined __HIPCC__ -#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -65,28 +52,23 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#ifndef __CUDACC__ +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) +#endif + +// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) -#elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#else -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default in CUDA +#undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 -#else -#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -102,21 +84,17 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (CUDA complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA -#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA -#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA +// SANITY CHECKS (c++ complex number implementation) +#ifndef __CUDACC__ +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL #endif #endif -// SANITY CHECKS (C++ complex number implementation) -#ifndef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ +// SANITY CHECKS (cuda complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif 
#endif @@ -153,7 +131,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -164,7 +142,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD +#ifdef __CUDACC__ // CUDA implementation has no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -194,9 +172,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -208,8 +186,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA/HIP declaration specifiers for C++ -#ifndef MGONGPUCPP_GPUIMPL +// Define empty CUDA declaration specifiers for C++ +#ifndef __CUDACC__ #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h index 4e7ab03fa2..0cb2f1db7e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
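The mgOnGpuConfig.h hunk above restores the two-way split (cuda vs c++) for the complex-type choice and its sanity checks. The mechanism is a one-of-N macro selection guarded by #error so that a misconfigured build fails at compile time. A compilable sketch of the scheme; std::complex stands in for the C++ branch here purely to keep the example self-contained (the plugin's own default is its cxsmpl class):

#include <complex>
#ifdef __CUDACC__
#include <thrust/complex.h>
#endif

// Choose exactly one implementation per backend...
#ifdef __CUDACC__
#define MGONGPU_CUCXTYPE_THRUST 1
//#define MGONGPU_CUCXTYPE_CUCOMPLEX 1
#else
#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1
//#define MGONGPU_CPPCXTYPE_CXSMPL 1
#endif

// ...and fail fast at compile time if more than one is defined
#ifndef __CUDACC__
#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
#error You must CHOOSE (ONE AND) ONLY ONE complex type implementation
#endif
#endif

// The rest of the code then only ever sees the single alias cxtype
#ifdef MGONGPU_CUCXTYPE_THRUST
typedef thrust::complex<double> cxtype;
#else
typedef std::complex<double> cxtype;
#endif

int main() { const cxtype z( 1., 2. ); return z.real() == 1. ? 0 : 1; }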
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h index 6f6cee64d6..a1cde16a67 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
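The many cxmake overloads patched above all serve one purpose: generic kernel code constructs complex numbers through a single factory name, and overload resolution picks the conversion appropriate to the active cxtype. A reduced sketch of the idea, using std::complex so it compiles stand-alone:

#include <complex>

typedef std::complex<double> cxtype; // stand-in for the backend-selected complex type

inline cxtype cxmake( const double r, const double i ) { return cxtype( r, i ); }
inline cxtype cxmake( const std::complex<float>& c ) // float-to-double promotion, as in mgOnGpuCxtypes.h
{
  return cxmake( c.real(), c.imag() );
}

int main()
{
  const cxtype z = cxmake( std::complex<float>( 1.f, 2.f ) );
  return ( z.real() == 1. && z.imag() == 2. ) ? 0 : 1;
}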
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h index 7904b93c61..9d3e82b1e3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttgg.sa/src/rambo.h b/epochX/cudacpp/gg_ttgg.sa/src/rambo.h index cd7e1008ea..e02ea52496 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
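The mgOnGpuVectors.h hunks above restore the rule that the "_sv" (scalar-or-vector) aliases collapse to plain scalars under __CUDACC__, because CUDA processes one event per thread while the C++ backend processes neppV events per SIMD register. A minimal sketch of the aliasing, assuming the gcc/clang vector_size extension that the plugin relies on:

#ifdef __CUDACC__
// CUDA: one event per GPU thread, so the "vector" is a scalar
typedef double fptype_sv;
const int neppV = 1;
#else
// C++: a short SIMD vector of events (4 doubles = 256 bits, e.g. AVX2; width chosen for illustration)
typedef double fptype_sv __attribute__( ( vector_size( 32 ) ) );
const int neppV = 4;
#endif

// Generic code is written once against fptype_sv and neppV
inline fptype_sv fpsquare_sv( const fptype_sv& x ) { return x * x; } // elementwise in the SIMD case

int main()
{
  fptype_sv x{};        // zero-initialised: one double (CUDA) or four doubles (C++)
  x = fpsquare_sv( x );
  return 0;
}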
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT index 84a883fbb0..a134b5fef9 100644 --- a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT @@ -15,7 +15,6 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc index ec923afd6d..cf4ec946f8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV1_1.o VVVV4_0.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3_0.o VVVV1_0.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o +ALOHARoutine = VVVV3_0.o VVVV4P0_1.o VVVV3P0_1.o VVVV1P0_1.o FFV1_1.o FFV1_2.o VVV1P0_1.o VVV1_0.o FFV1_0.o FFV1P0_3.o VVVV1_0.o VVVV4_0.o diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h index c04628dfd1..4cafe0c997 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
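A short note on the rambo.h weight line above, wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]: this is the logarithm of the massless RAMBO phase-space weight, with the energy-independent part cached in the z table. Read off the code (and consistent with the standard RAMBO volume, though the exact constant is not spelled out here):

\[
\ln w \;=\; (2\,n_f - 4)\,\ln E \;+\; z_{\,n_f-1}\,,
\]

where \(n_f\) is the number of final-state particles (nparf), \(E\) the total energy, and \(z_{\,n_f-1}\) a precomputed constant collecting the powers of \(\pi/2\) and the factorials of the RAMBO volume formula.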
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ process.initProc( "../../Cards/param_card.dat" ); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); } else { - gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc index 90c7f2d3b8..cef4cb3c71 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "BridgeKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -15,7 +14,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -46,7 +45,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -97,7 +96,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h index 3efef8ce97..15eb4bff4d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc index 010bc4cbd0..985b39f576 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,16 +1,15 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" -#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc index c15b39844d..0b355a3c8d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -78,7 +77,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -186,7 +185,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h index 4d9659e04e..7933ca4bbf 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h new file mode 100644 index 0000000000..64ce52f4b3 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_CUDARUNTIME_H +#define MG5AMC_CUDARUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef __CUDACC__ /* clang-format off */ +#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } +inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +{ + if( code != cudaSuccess ) + { + printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); + if( abort ) assert( code == cudaSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ +namespace mg5amcGpu +{ + // Instantiate a CudaRuntime at the beginnining of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct CudaRuntime final + { + CudaRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~CudaRuntime() { tearDown( m_debug ); } + CudaRuntime( const CudaRuntime& ) = delete; + CudaRuntime( CudaRuntime&& ) = delete; + CudaRuntime& operator=( const CudaRuntime& ) = delete; + CudaRuntime& operator=( CudaRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; + checkCuda( cudaSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; + checkCuda( cudaDeviceReset() ); + } + }; + +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc index 38c477c17a..eb56333b03 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h index b425a5bade..48b51e0a49 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h index d2ff326e20..fd7734ce42 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..30257195b6 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkGpu( gpuPeekAtLastError() ); + checkCuda( cudaPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkCuda( cudaPeekAtLastError() ); + checkCuda( cudaDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h index 72bd8f195b..23e84757a2 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h index db73e4e064..c82a6c7635 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h index 38fade09fb..0ac4faa3c7 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h index 40cb089135..e2988d39f3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
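// The coalescing comment in MemoryAccessMomenta.h above ("neppM is best set to a power of 2
// times the number of fptype's in a 32-byte cacheline") is easiest to see with explicit index
// arithmetic. A minimal AOSOA addressing sketch (hypothetical helper, not the plugin's actual
// accessor; the name aosoaMomentum and the double-only signature are assumptions for
// illustration): momenta are laid out as buffer[npagM][npar][np4][neppM], so for fixed
// (ipar, ip4) consecutive events within a page are adjacent in memory, and consecutive GPU
// threads (consecutive ievt) issue coalesced loads from global memory.
#include <cstddef>
inline const double& aosoaMomentum( const double* buffer, std::size_t ievt, std::size_t ipar, std::size_t ip4, std::size_t npar, std::size_t neppM )
{
  constexpr std::size_t np4 = 4;          // E, px, py, pz
  const std::size_t ipagM = ievt / neppM; // page index in the "outer AOS"
  const std::size_t ieppM = ievt % neppM; // event index within the page
  return buffer[( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM];
}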
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h index 08faccff0f..e9b197368e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h index 7756a71621..3093e6ed18 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
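// The PinnedHostBufferBase and DeviceBufferBase hunks in MemoryBuffers.h below pair each
// cudaMallocHost/cudaMalloc with the matching free in a destructor. A minimal sketch of the
// same RAII idea, stripped of the BufferBase hierarchy (simplified illustration, not the
// plugin's class): allocation in the constructor, release in the destructor, so a buffer
// cannot leak on early returns or exceptions. Pinned (page-locked) host buffers follow the
// same pattern with cudaMallocHost/cudaFreeHost, and additionally let cudaMemcpy skip the
// intermediate staging copy that pageable host arrays require (the "NB (PR #45)" note below).
#include <cuda_runtime.h>
#include <cstddef>
#include <new>
template<typename T>
struct SimpleDeviceBuffer
{
  explicit SimpleDeviceBuffer( std::size_t size ) : m_size( size )
  {
    if( cudaMalloc( reinterpret_cast<void**>( &m_data ), size * sizeof( T ) ) != cudaSuccess ) throw std::bad_alloc();
  }
  ~SimpleDeviceBuffer() { cudaFree( m_data ); }
  SimpleDeviceBuffer( const SimpleDeviceBuffer& ) = delete;
  SimpleDeviceBuffer& operator=( const SimpleDeviceBuffer& ) = delete;
  T* data() { return m_data; }
  std::size_t size() const { return m_size; }
private:
  T* m_data = nullptr;
  std::size_t m_size;
};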
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "Parameters_sm.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - gpuMallocHost( &( this->m_data ), this->bytes() ); + checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); } virtual ~PinnedHostBufferBase() { - gpuFreeHost( this->m_data ); + checkCuda( cudaFreeHost( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - gpuMalloc( &( this->m_data ), this->bytes() ); + checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); } virtual ~DeviceBufferBase() { - gpuFree( this->m_data ); + checkCuda( cudaFree( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index caf3f4c49d..5459588505 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -45,7 +46,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g g WEIGHTED<=5 @1 -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -79,7 +80,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -89,7 +90,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -117,13 +118,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -150,7 +151,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -186,7 +187,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -199,10 +200,8 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop -#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -30019,7 +30018,7 @@ namespace mg5amcCpu { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 
640, 64, 4096, -512 }, { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -30076,7 +30075,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -30135,7 +30134,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -30294,8 +30293,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1, 1 }, { 1, 1, 1, -1, 1, 1, -1 }, { 1, 1, 1, -1, 1, 1, 1 } }; -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -30338,9 +30337,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); - //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -30379,7 +30378,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] + // [Use __NVCC__ instead of __CUDACC__ here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -30444,12 +30443,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -30470,7 +30469,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -30599,9 +30598,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); - gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); + checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -30625,7 +30624,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -30646,7 +30645,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -30660,12 +30659,9 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - -#include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -30693,7 +30689,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ +#ifdef __CUDACC__ // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -30897,7 +30893,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] /= helcolDenominators[0]; 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index b1f469b1c9..d1dd4d6150 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h new file mode 120000 index 0000000000..ce9e1a487a --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h @@ -0,0 +1 @@ +../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc index 1bad694d1c..f1e75b9252 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
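// The "00 CudaInit" hunk restored in check_sa.cc below relies on scope-based setup and
// teardown. A minimal sketch of how the RAII runtime object is used in an application's
// main (hypothetical main, for illustration only; CudaRuntime itself is the struct added
// in CudaRuntime.h above):
#include "CudaRuntime.h"
int main()
{
  // Constructing the runtime first charges cudaSetDevice(0) to a well-defined init phase,
  // instead of inflating the first kernel launch or cudaMemcpyToSymbol; the destructor
  // books cudaDeviceReset(), which cuda-memcheck needs for leak checking.
  mg5amcGpu::CudaRuntime cudaRuntime( /*debug=*/true );
  // ... allocate buffers, launch kernels, copy back results ...
  return 0; // cudaRuntime goes out of scope here and calls cudaDeviceReset()
}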
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,7 +12,6 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" -#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -64,7 +63,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -78,7 +77,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -104,11 +103,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -116,7 +115,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -149,7 +148,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs" ); #endif } else if( arg == "--curhst" ) @@ -166,7 +165,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -240,13 +239,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -264,14 +263,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ - // --- 00. Initialise GPU - // Instantiate a GpuRuntime at the beginnining of the application's main. 
- // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. - const std::string cdinKey = "00 GpuInit"; + // --- 00. Initialise cuda + // Instantiate a CudaRuntime at the beginnining of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + const std::string cdinKey = "00 CudaInit"; timermap.start( cdinKey ); - GpuRuntime GpuRuntime( debug ); + CudaRuntime cudaRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -293,7 +292,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -301,7 +300,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -309,7 +308,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -317,7 +316,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -334,7 +333,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -343,7 +342,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -352,7 +351,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -360,7 +359,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -368,7 +367,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -404,7 +403,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -422,7 +421,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( 
"RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -433,7 +432,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -441,7 +440,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -483,7 +482,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -515,7 +514,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -560,7 +559,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -589,7 +588,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( !bridge ) { // --- 3b. CopyDToH MEs @@ -732,19 +731,15 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; -#elif defined __HIPCC__ - rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or HIP or C++? + // -- CUDA or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; -#elif defined __HIPCC__ - wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -769,12 +764,6 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif -#elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL - wrkflwtxt += "CXS:"; -#else - wrkflwtxt += "???:"; // no path to this statement -#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -800,7 +789,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? 
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -856,7 +845,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -877,8 +866,6 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" -#elif defined __HIPCC__ - << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -905,21 +892,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "Complex type = STD::COMPLEX" << std::endl +#endif #else - << "Complex type = ???" << std::endl // no path to this statement... + << "Complex type = STD::COMPLEX" << std::endl #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -950,7 +937,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1046,14 +1033,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "\"STD::COMPLEX\"," << std::endl +#endif #else - << "\"???\"," << std::endl // no path to this statement... + << "\"STD::COMPLEX\"," << std::endl #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1061,7 +1048,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc index 79abbcc4f8..da68aa9255 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
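// The RamboSamplingKernels.cc hunks below are representative of the mechanical change made
// throughout this patch: every gpuLaunchKernel wrapper call is reverted to a native CUDA
// triple-chevron launch. A side-by-side sketch with a hypothetical kernel (illustration
// only, not code from the plugin):
__global__ void scaleKernel( double* data, double factor )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
  data[ievt] *= factor;
}
void launchScale( double* devData, int gpublocks, int gputhreads, double factor )
{
  scaleKernel<<<gpublocks, gputhreads>>>( devData, factor ); // native launch, as restored here
  // The removed portability wrapper would have read (assumption, mirroring the '-' lines):
  //   gpuLaunchKernel( scaleKernel, gpublocks, gputhreads, devData, factor );
}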
#include "RamboSamplingKernels.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaInitial() { - gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); + getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaFinal() { - gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h index 7c214cd74b..184089efd7 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h index 21d63beeac..188a72c2c9 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index bcb73d7f01..a0397e9ecc 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,139 +89,69 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If CUDA_HOME is not set, try to set it from the location of NVCC - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif - - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - - CUDATESTFLAGS = -lcuda - - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif - - # Set the host C++ compiler for GPUCC via "-ccbin " - # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) - GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif - -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif - - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 
6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +#=== Configure the CUDA compiler + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) +# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the location of nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + NVCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+ CUOPTFLAGS = -lineinfo + CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) +else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) +else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override NVCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= +endif - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif +# Set the host C++ compiler for nvcc via "-ccbin " +# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) +CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) +# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) +ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) +CUFLAGS += -allow-unsupported-compiler endif - #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -233,9 +163,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) - override GPUCC:=ccache $(GPUCC) +ifneq ($(NVCC),) + ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) + override NVCC:=ccache $(NVCC) endif endif @@ -259,7 +189,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - GPUFLAGS+= -Xcompiler -mno-float128 + CUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -269,10 +199,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -323,10 +253,7 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + ifeq ($(NVCC),) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -401,13 +328,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -416,7 +343,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + CUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -425,7 +352,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + CUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -477,11 +404,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -494,7 +421,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) +ifneq ($(NVCC),) 
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -525,16 +452,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -543,14 +469,11 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) -# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifeq ($(findstring nvcc,$(GPUCC)),nvcc) - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math -else - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math +ifneq ($(NVCC),) +$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math endif endif @@ -566,10 +489,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(GPUCC),) -GPUFLAGS += -Wno-deprecated-builtins +ifneq ($(NVCC),) +CUFLAGS += -Xcompiler -Wno-deprecated-builtins endif endif @@ -577,8 +500,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(NVCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -605,7 +528,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -617,11 +540,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -638,16 +561,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
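Since MADGRAPH_CUDA_ARCHITECTURE defaults to 70 above (sm_70 device code plus compute_70 PTX, which is also what $(CUARCHFLAGS) passes at link time here), a quick way to see what the local GPU actually offers is to query its compute capability at runtime. A small standalone CUDA sketch, not part of this patch:

// Print the compute capability of device 0; sm_70 code runs natively on 7.0
// hardware, while newer GPUs can JIT from the embedded compute_70 PTX.
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
  cudaDeviceProp prop;
  if( cudaGetDeviceProperties( &prop, 0 ) != cudaSuccess )
  {
    std::printf( "no CUDA device visible\n" );
    return 1;
  }
  std::printf( "device 0: %s, compute capability %d.%d\n", prop.name, prop.major, prop.minor );
  return 0;
}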
#------------------------------------------------------------------------------- @@ -673,17 +596,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -695,7 +618,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -708,7 +631,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -720,12 +643,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -749,14 +672,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(GPUCC),) # link only runTest.o +ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -859,9 +782,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo GPUCC=$(GPUCC) -ifneq ($(GPUCC),) - $(GPUCC) --version + @echo NVCC=$(NVCC) +ifneq ($(NVCC),) + $(NVCC) --version endif @echo "" @echo CXX=$(CXX) @@ -880,7 +803,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(GPUCC),) +ifneq ($(NVCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc index 2b956730d4..f93c05b0b3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::setUp(); +#ifdef __CUDACC__ + CudaRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? 
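The fbridge.cc hunks above show the single-source idiom used throughout these files: the same translation unit is compiled once by nvcc into namespace mg5amcGpu and once by the host C++ compiler into mg5amcCpu, keyed on __CUDACC__. A reduced, self-contained sketch of the pattern:

// Minimal sketch of the single-source namespace pattern: one .cc file,
// two namespaces, selected by the compiler that happens to be driving.
#include <iostream>

#ifdef __CUDACC__
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  inline const char* backend()
  {
#ifdef __CUDACC__
    return "cuda";
#else
    return "c++";
#endif
  }
}

#ifdef __CUDACC__
using namespace mg5amcGpu;
#else
using namespace mg5amcCpu;
#endif

int main() { std::cout << "backend: " << backend() << std::endl; return 0; }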
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::tearDown(); +#ifdef __CUDACC__ + CudaRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc index 3743934f41..2fb445372d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc index 461ec5c3a5..572e28aaea 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
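The hunks above expose the Fortran-facing lifecycle: fbridgecreate_ sets up the CUDA runtime and builds a Bridge, fbridgesequence_ pushes event batches through gpu_sequence, and fbridgedelete_ tears everything down. A hypothetical C++ driver for that lifecycle is sketched below; only fbridgecreate_'s full signature appears in this patch, the delete signature is assumed symmetric, the constants are illustrative, and the program must be linked against the generated process library to run:

// Hypothetical driver for the extern "C" bridge API touched in this patch.
class CppObjectInFortran; // opaque handle, as on the Fortran side

extern "C"
{
  void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F );
  void fbridgedelete_( CppObjectInFortran** ppbridge ); // assumed signature
}

int main()
{
  CppObjectInFortran* bridge = nullptr;
  const int nevt = 16384, npar = 7, np4 = 4; // illustrative; must match CPPProcess::npar/np4
  fbridgecreate_( &bridge, &nevt, &npar, &np4 ); // calls CudaRuntime::setUp() on CUDA builds
  // ... fbridgesequence_ would be called here once per event batch ...
  fbridgedelete_( &bridge ); // calls CudaRuntime::tearDown() on CUDA builds
  return 0;
}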
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc index 2bd7a9fcf9..989aba1fdc 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc index 6e8657edca..4243e9fcec 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h index f772885631..ee2fcbbde5 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc index de87dcaf64..d3d01102fd 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h index fe7d686938..6551d8da81 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h @@ -214,7 +214,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -242,7 +242,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -258,7 +258,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h index 390766116b..881353abac 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,26 +10,13 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 -// Is this a GPU (CUDA, HIP) or CPU implementation? -#ifdef __CUDACC__ -#define MGONGPUCPP_GPUIMPL cuda -#elif defined __HIPCC__ -#define MGONGPUCPP_GPUIMPL hip -#else -#undef MGONGPUCPP_GPUIMPL -#endif - // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For CUDA, by default, it is supported -// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND -#elif defined __HIPCC__ -#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -65,28 +52,23 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#ifndef __CUDACC__ +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) +#endif + +// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) -#elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#else -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default in CUDA +#undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 -#else -#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -102,21 +84,17 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (CUDA complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA -#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA -#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA +// SANITY CHECKS (c++ complex number implementation) +#ifndef __CUDACC__ +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL #endif #endif -// SANITY CHECKS (C++ complex number implementation) -#ifndef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ +// SANITY CHECKS (cuda complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif 
#endif @@ -153,7 +131,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -164,7 +142,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD +#ifdef __CUDACC__ // CUDA implementation has no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -194,9 +172,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -208,8 +186,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA/HIP declaration specifiers for C++ -#ifndef MGONGPUCPP_GPUIMPL +// Define empty CUDA declaration specifiers for C++ +#ifndef __CUDACC__ #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h index 4e7ab03fa2..0cb2f1db7e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
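The neppV selection restored above fixes the SIMD width in fptype units at compile time, and cppAlign = 64 is the alignment contract that makes the later reinterpret_cast to vector types safe. A condensed sketch keeping only two of the branches (the other FPTYPE/AVX combinations follow the same scheme as the full list above):

// Condensed sketch of the neppV idea: SIMD width per backend, fixed per build.
#include <cstdint>
#include <iostream>

#ifdef __CUDACC__
constexpr int neppV = 1; // CUDA: no SIMD, one event per GPU thread
#elif defined __AVX2__
constexpr int neppV = 4; // AVX2 with doubles: 256-bit ymm holds 4 values
#else
constexpr int neppV = 1; // scalar C++ fallback
#endif
constexpr int cppAlign = 64; // 64-byte (512-bit) alignment requirement

int main()
{
  alignas( cppAlign ) double momenta[neppV * 4]; // one SIMD "page" of 4 components
  const bool ok = reinterpret_cast<std::uintptr_t>( momenta ) % cppAlign == 0;
  std::cout << std::boolalpha << "neppV=" << neppV << " aligned=" << ok << std::endl;
  return 0;
}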
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h index 6f6cee64d6..a1cde16a67 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
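mgOnGpuCxtypes.h, reverted above, reduces all complex arithmetic to a single cxtype typedef plus cxmake() factories. A reduced sketch of that pattern, with std::complex standing in for the C++-side choice (the real header also offers the repo's own cxsmpl type, which is not reproduced here):

// Reduced sketch of the cxtype/cxmake pattern: one complex type per backend
// behind a uniform factory function.
#include <iostream>
#ifdef __CUDACC__
#include <thrust/complex.h>
typedef thrust::complex<double> cxtype; // MGONGPU_CUCXTYPE_THRUST, the CUDA default
#else
#include <complex>
typedef std::complex<double> cxtype; // stand-in for the MGONGPU_CPPCXTYPE_* choice
#define __host__
#define __device__
#endif

inline __host__ __device__ cxtype cxmake( const double r, const double i )
{
  return cxtype( r, i );
}

int main()
{
  const cxtype c = cxmake( 1., 2. );
  std::cout << c.real() << " + " << c.imag() << "i" << std::endl;
  return 0;
}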
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h index 7904b93c61..9d3e82b1e3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttggg.mad/src/rambo.h b/epochX/cudacpp/gg_ttggg.mad/src/rambo.h index cd7e1008ea..e02ea52496 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/rambo.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
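mgOnGpuVectors.h builds its scalar-or-vector ("_sv") types on the same principle: a plain scalar under CUDA (one event per GPU thread), a gcc vector extension in SIMD C++ builds. A minimal sketch, again keeping only the AVX2/double branch; the lane access via reinterpret_cast mirrors how the real code addresses individual events inside a vector:

// Sketch of the "_sv" scalar-or-vector typedef.
#include <iostream>

#ifdef __CUDACC__
typedef double fptype_sv; // scalar: one event per GPU thread
#elif defined __AVX2__
typedef double fptype_sv __attribute__( ( vector_size( 32 ) ) ); // 4 events per vector
#else
typedef double fptype_sv; // scalar C++ fallback (neppV == 1)
#endif

int main()
{
  fptype_sv x = { 0. };                                         // zero in all three variants
  x = x + 1.;                                                   // broadcast add on vectors, plain add on scalars
  std::cout << reinterpret_cast<double*>( &x )[0] << std::endl; // first lane either way
  return 0;
}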
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT index 84a883fbb0..a134b5fef9 100644 --- a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT @@ -15,7 +15,6 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h index c04628dfd1..4cafe0c997 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
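For context on the rambo.h weight warnings above: the quantity wt is the log of the massless RAMBO phase-space volume for nparf = n final particles at energy E. Quoting the standard RAMBO result (not stated in this patch, so treat the identification as an assumption; z[nparf-1] is taken to cache the n-dependent constant, with po2log = ln(pi/2) recovering the n = 2 case):

\ln w_n = (2n-4)\,\ln E \;+\; \underbrace{\left[ (n-1)\ln\frac{\pi}{2} - \ln\,(n-1)! - \ln\,(n-2)! \right]}_{z_{n-1}\ \text{(precomputed)}}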
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ process.initProc( "../../Cards/param_card.dat" ); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); } else { - gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc index 90c7f2d3b8..cef4cb3c71 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
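The Bridge.h hunk above replaces the generic gpuMemcpy/gpuLaunchKernel wrappers with native CUDA calls and the triple-chevron launch syntax. The sketch below contrasts the two styles; the gpuLaunchKernel shown is a minimal stand-in written for this example, not the repo's actual definition:

// Contrast of the two launch styles seen in this hunk.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void dev_scale( double* data, const double factor, const int n )
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if( i < n ) data[i] *= factor;
}

// Minimal stand-in for the wrapper being removed: forwards grid shape and arguments
template<typename Kernel, typename... Args>
void gpuLaunchKernel( Kernel kernel, const int blocks, const int threads, Args... args )
{
  kernel<<<blocks, threads>>>( args... );
}

int main()
{
  const int n = 1024;
  double* d = nullptr;
  cudaMalloc( &d, n * sizeof( double ) );
  gpuLaunchKernel( dev_scale, n / 256, 256, d, 2., n ); // wrapper form (removed)
  dev_scale<<<n / 256, 256>>>( d, 2., n );              // native form (restored by this patch)
  cudaDeviceSynchronize();
  cudaFree( d );
  return 0;
}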
#include "BridgeKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -15,7 +14,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -46,7 +45,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -97,7 +96,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h index 3efef8ce97..15eb4bff4d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc index 010bc4cbd0..985b39f576 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,16 +1,15 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" -#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc index c15b39844d..0b355a3c8d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -78,7 +77,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -186,7 +185,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h index 4d9659e04e..7933ca4bbf 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h new file mode 100644 index 0000000000..64ce52f4b3 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_CUDARUNTIME_H +#define MG5AMC_CUDARUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef __CUDACC__ /* clang-format off */ +#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } +inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +{ + if( code != cudaSuccess ) + { + printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); + if( abort ) assert( code == cudaSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ +namespace mg5amcGpu +{ + // Instantiate a CudaRuntime at the beginning of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct CudaRuntime final + { + CudaRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~CudaRuntime() { tearDown( m_debug ); } + CudaRuntime( const CudaRuntime& ) = delete; + CudaRuntime( CudaRuntime&& ) = delete; + CudaRuntime& operator=( const CudaRuntime& ) = delete; + CudaRuntime& operator=( CudaRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; + checkCuda( cudaSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; + checkCuda( cudaDeviceReset() ); + } + }; + +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc index 38c477c17a..eb56333b03 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A.
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h index b425a5bade..48b51e0a49 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h index d2ff326e20..fd7734ce42 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..30257195b6 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A.
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkGpu( gpuPeekAtLastError() ); + checkCuda( cudaPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3.
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkCuda( cudaPeekAtLastError() ); + checkCuda( cudaDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h index 72bd8f195b..23e84757a2 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h index db73e4e064..c82a6c7635 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h index 38fade09fb..0ac4faa3c7 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h index 40cb089135..e2988d39f3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
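A note on the neppM comment in the MemoryAccessMomenta.h hunk above: that constant fixes the AOSOA layout momenta[npagM][npar][np4][neppM], where each "page" holds neppM events so that consecutive GPU threads read consecutive fptype's and global-memory accesses coalesce. A minimal standalone sketch of the index arithmetic this implies (the function and parameter names here are illustrative, not the plugin's actual API):

using fptype = double; // assumption: a double-precision (MGONGPU_FPTYPE_DOUBLE) build

// Locate the ip4-th momentum component of particle ipar in event ievt,
// assuming the AOSOA layout momenta[npagM][npar][np4][neppM]
inline fptype& aosoaMomentum( fptype* buffer, int ievt, int ipar, int ip4, int npar, int np4, int neppM )
{
  const int ipagM = ievt / neppM; // index of the "page" holding this event
  const int ieppM = ievt % neppM; // position of this event within the page
  return buffer[( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM];
}

With neppM == 1 this degenerates to a plain AOS, which is the special case noted in the layout comments above.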
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h index 08faccff0f..e9b197368e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h index 7756a71621..3093e6ed18 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
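The MemoryBuffers.h hunks below swap the gpuMalloc/gpuFree abstraction back to direct checkCuda( cudaMalloc/cudaFree ) calls inside RAII buffer classes: the constructor allocates, the destructor frees, and copy/assignment are deleted. A minimal standalone sketch of the same pattern (the class name is hypothetical, and error handling is simplified to a thrown exception instead of the checkCuda macro):

#include <cuda_runtime.h>
#include <stdexcept>
#include <cstddef>

// A device buffer that owns its allocation for the lifetime of the object (RAII)
template<typename T>
class OwningDeviceBuffer
{
public:
  explicit OwningDeviceBuffer( std::size_t size ) : m_size( size )
  {
    if( cudaMalloc( (void**)&m_data, m_size * sizeof( T ) ) != cudaSuccess )
      throw std::runtime_error( "cudaMalloc failed" );
  }
  ~OwningDeviceBuffer() { cudaFree( m_data ); } // release on destruction
  OwningDeviceBuffer( const OwningDeviceBuffer& ) = delete;            // no copies:
  OwningDeviceBuffer& operator=( const OwningDeviceBuffer& ) = delete; // single owner
  T* data() { return m_data; }
  std::size_t size() const { return m_size; }
private:
  T* m_data = nullptr;
  std::size_t m_size;
};

A pinned host buffer follows the same shape with cudaMallocHost/cudaFreeHost in place of cudaMalloc/cudaFree, which is exactly the PinnedHostBufferBase/DeviceBufferBase split visible in the hunks below.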
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "Parameters_sm.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - gpuMallocHost( &( this->m_data ), this->bytes() ); + checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); } virtual ~PinnedHostBufferBase() { - gpuFreeHost( this->m_data ); + checkCuda( cudaFreeHost( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - gpuMalloc( &( this->m_data ), this->bytes() ); + checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); } virtual ~DeviceBufferBase() { - gpuFree( this->m_data ); + checkCuda( cudaFree( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for couplings constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index a2f1fc1dc2..b7a16f1170 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -45,7 +46,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g g WEIGHTED<=5 @1 -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -79,7 +80,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -89,7 +90,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -117,13 +118,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -150,7 +151,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -186,7 +187,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -199,10 +200,8 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop -#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -31909,7 +31908,7 @@ namespace mg5amcCpu { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 
640, 64, 4096, -512 }, { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -31966,7 +31965,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -32025,7 +32024,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -32184,8 +32183,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1, 1 }, { 1, 1, 1, -1, 1, 1, -1 }, { 1, 1, 1, -1, 1, 1, 1 } }; -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -32228,9 +32227,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); - //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -32269,7 +32268,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] + // [Use __NVCC__ instead of __CUDACC__ here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -32334,12 +32333,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -32360,7 +32359,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -32489,9 +32488,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); - gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); + checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -32515,7 +32514,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -32536,7 +32535,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -32550,12 +32549,9 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - -#include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -32583,7 +32579,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ +#ifdef __CUDACC__ // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -32787,7 +32783,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] /= helcolDenominators[0];
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h index b1f469b1c9..d1dd4d6150 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h new file mode 120000 index 0000000000..ce9e1a487a --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h @@ -0,0 +1 @@ +../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc index 1bad694d1c..f1e75b9252 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
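The CPPProcess.cc hunks above restore direct checkCuda( cudaMemcpyToSymbol(...) ) calls to upload the good-helicity filter into __constant__ device memory: the mask is compressed on the host, then the count and the index list are copied to the symbols cNGoodHel and cGoodHel. A minimal standalone sketch of that host-to-constant-memory pattern (ncomb and the function name below are illustrative only; return codes should be checked with checkCuda as in the patch, omitted here for brevity):

#include <cuda_runtime.h>

constexpr int ncomb = 4; // illustrative value; the real process defines its own ncomb

__device__ __constant__ int cNGoodHel;       // number of "good" helicity combinations
__device__ __constant__ int cGoodHel[ncomb]; // their indices, in increasing order

// Host side: compress the boolean mask and upload it to constant device memory
void uploadGoodHel( const bool* isGoodHel ) // host mask of size ncomb
{
  int goodHel[ncomb] = { 0 };
  int nGoodHel = 0;
  for( int ihel = 0; ihel < ncomb; ihel++ )
    if( isGoodHel[ihel] ) goodHel[nGoodHel++] = ihel;
  cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) );
  cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) );
}

Constant memory is a natural fit here because every thread reads the same small helicity list, so the reads are broadcast to the whole warp.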
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,7 +12,6 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" -#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -64,7 +63,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -78,7 +77,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -104,11 +103,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -116,7 +115,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -149,7 +148,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs" ); #endif } else if( arg == "--curhst" ) @@ -166,7 +165,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -240,13 +239,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -264,14 +263,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ - // --- 00. Initialise GPU - // Instantiate a GpuRuntime at the beginnining of the application's main. 
- // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. - const std::string cdinKey = "00 GpuInit"; + // --- 00. Initialise cuda + // Instantiate a CudaRuntime at the beginning of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + const std::string cdinKey = "00 CudaInit"; timermap.start( cdinKey ); - GpuRuntime GpuRuntime( debug ); + CudaRuntime cudaRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -293,7 +292,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -301,7 +300,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -309,7 +308,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -317,7 +316,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -334,7 +333,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -343,7 +342,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -352,7 +351,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -360,7 +359,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -368,7 +367,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -404,7 +403,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -422,7 +421,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error(
"RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -433,7 +432,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -441,7 +440,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -483,7 +482,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -515,7 +514,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -560,7 +559,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -589,7 +588,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( !bridge ) { // --- 3b. CopyDToH MEs @@ -732,19 +731,15 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; -#elif defined __HIPCC__ - rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or HIP or C++? + // -- CUDA or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; -#elif defined __HIPCC__ - wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -769,12 +764,6 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif -#elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL - wrkflwtxt += "CXS:"; -#else - wrkflwtxt += "???:"; // no path to this statement -#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -800,7 +789,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? 
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -856,7 +845,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -877,8 +866,6 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" -#elif defined __HIPCC__ - << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -905,21 +892,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "Complex type = STD::COMPLEX" << std::endl +#endif #else - << "Complex type = ???" << std::endl // no path to this statement... + << "Complex type = STD::COMPLEX" << std::endl #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -950,7 +937,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1046,14 +1033,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "\"STD::COMPLEX\"," << std::endl +#endif #else - << "\"???\"," << std::endl // no path to this statement... + << "\"STD::COMPLEX\"," << std::endl #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1061,7 +1048,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc index 79abbcc4f8..da68aa9255 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
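The RamboSamplingKernels.cc hunks that follow (like the MatrixElementKernels.cc ones earlier in this patch) replace the gpuLaunchKernel(...) wrappers with plain triple-chevron launches followed by checkCuda( cudaPeekAtLastError() ) and, where results are needed immediately, checkCuda( cudaDeviceSynchronize() ). A minimal standalone sketch of that launch-and-check idiom (the kernel, grid sizes and error handling below are illustrative only):

#include <cuda_runtime.h>
#include <cstdio>

__global__ void scaleKernel( float* data, float factor ) // hypothetical kernel
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
  data[ievt] *= factor;
}

int main()
{
  const int gpublocks = 64, gputhreads = 256;
  float* d = nullptr;
  cudaMalloc( (void**)&d, gpublocks * gputhreads * sizeof( float ) );
  scaleKernel<<<gpublocks, gputhreads>>>( d, 2.f );
  if( cudaPeekAtLastError() != cudaSuccess ) printf( "kernel launch failed\n" );      // launch-time errors
  if( cudaDeviceSynchronize() != cudaSuccess ) printf( "kernel execution failed\n" ); // execution-time errors
  cudaFree( d );
  return 0;
}

The two checks are complementary: cudaPeekAtLastError catches invalid launch configurations immediately, while cudaDeviceSynchronize surfaces errors that only occur once the asynchronous kernel has actually run.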
#include "RamboSamplingKernels.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaInitial() { - gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); + getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaFinal() { - gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h index 7c214cd74b..184089efd7 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h index 21d63beeac..188a72c2c9 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index bcb73d7f01..a0397e9ecc 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
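Stepping back to the RandomNumberKernels.h hunk above for a moment: the header deliberately forward-declares curandGenerator_st rather than including curand.h, so translation units that never touch curand can still include it. A minimal sketch of that idiom (the class and member names here are illustrative, not the real ones):

    struct curandGenerator_st; // opaque forward declaration, as in the header above

    class RandomKernelSketch
    {
    private:
      curandGenerator_st* m_gen = nullptr; // a pointer to an incomplete type is legal C++
      // curandCreateGenerator/curandDestroyGenerator calls belong in the .cc file,
      // which does include curand.h and sees the complete type.
    };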
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,139 +89,69 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If CUDA_HOME is not set, try to set it from the location of NVCC - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif - - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
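The CUARCHFLAGS foreach above expands each entry of MADGRAPH_CUDA_ARCHITECTURE into a pair of -gencode options, embedding both PTX (code=compute_NN) and SASS (code=sm_NN) for that architecture; the embedded PTX is what lets a newer GPU JIT-compile code built for the default compute capability 70. A small, hedged runtime check of what the driver actually reports (standard CUDA runtime API, not code from this patch):

    #include <cstdio>
    #include <cuda_runtime.h>

    int main()
    {
      // Prints 7.0 on a V100 (the default MADGRAPH_CUDA_ARCHITECTURE=70 target);
      // an A100 reports 8.0 but can still JIT the embedded compute_70 PTX.
      cudaDeviceProp prop;
      if( cudaGetDeviceProperties( &prop, 0 ) == cudaSuccess )
        printf( "compute capability %d.%d\n", prop.major, prop.minor );
      return 0;
    }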
- CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - - CUDATESTFLAGS = -lcuda - - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif - - # Set the host C++ compiler for GPUCC via "-ccbin " - # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) - GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif - -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif - - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 
6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +#=== Configure the CUDA compiler + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) +# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the location of nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + NVCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+ CUOPTFLAGS = -lineinfo + CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) +else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) +else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override NVCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= +endif - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif +# Set the host C++ compiler for nvcc via "-ccbin " +# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) +CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) +# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) +ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) +CUFLAGS += -allow-unsupported-compiler endif - #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -233,9 +163,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) - override GPUCC:=ccache $(GPUCC) +ifneq ($(NVCC),) + ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) + override NVCC:=ccache $(NVCC) endif endif @@ -259,7 +189,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - GPUFLAGS+= -Xcompiler -mno-float128 + CUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -269,10 +199,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -323,10 +253,7 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + ifeq ($(NVCC),) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -401,13 +328,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -416,7 +343,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + CUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -425,7 +352,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + CUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -477,11 +404,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -494,7 +421,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) +ifneq ($(NVCC),) 
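The FPTYPE block a few hunks above maps the build options 'd', 'f' and 'm' onto preprocessor macros that are consumed later in this patch as mgOnGpu::fptype and mgOnGpu::fptype2 (the 'm' mode keeps double precision overall but drops to float for the secondary type). A sketch of the resulting mapping (the typedef placement is an assumption; only the macro and typedef names come from the diff):

    // FPTYPE=d: double/double; FPTYPE=f: float/float; FPTYPE=m: double/float.
    #if defined MGONGPU_FPTYPE_DOUBLE
    typedef double fptype; // main floating point type
    #else
    typedef float fptype;
    #endif
    #if defined MGONGPU_FPTYPE2_DOUBLE
    typedef double fptype2; // secondary type, e.g. for color algebra (#537)
    #else
    typedef float fptype2;
    #endif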
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -525,16 +452,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -543,14 +469,11 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) -# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifeq ($(findstring nvcc,$(GPUCC)),nvcc) - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math -else - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math +ifneq ($(NVCC),) +$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math endif endif @@ -566,10 +489,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(GPUCC),) -GPUFLAGS += -Wno-deprecated-builtins +ifneq ($(NVCC),) +CUFLAGS += -Xcompiler -Wno-deprecated-builtins endif endif @@ -577,8 +500,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(NVCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -605,7 +528,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -617,11 +540,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -638,16 +561,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
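A note on the -fno-fast-math exception for CrossSectionKernels.o seen a few hunks above (#117), assuming the motivation is numerical accuracy of the cross-section sums: fast math licenses the compiler to reassociate floating-point arithmetic, which can silently eliminate compensated summation. A self-contained illustration of the failure mode (generic numerics, not code from this patch):

    // Kahan compensated summation: under -ffast-math the compiler may fold
    // c = ( t - sum ) - y to zero and reduce this to a plain, less accurate sum.
    double kahanSum( const double* x, const int n )
    {
      double sum = 0.0, c = 0.0;
      for( int i = 0; i < n; i++ )
      {
        const double y = x[i] - c;
        const double t = sum + y;
        c = ( t - sum ) - y; // recovers the low-order bits lost when forming t
        sum = t;
      }
      return sum;
    }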
#------------------------------------------------------------------------------- @@ -673,17 +596,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -695,7 +618,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -708,7 +631,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -720,12 +643,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -749,14 +672,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(GPUCC),) # link only runTest.o +ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -859,9 +782,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo GPUCC=$(GPUCC) -ifneq ($(GPUCC),) - $(GPUCC) --version + @echo NVCC=$(NVCC) +ifneq ($(NVCC),) + $(NVCC) --version endif @echo "" @echo CXX=$(CXX) @@ -880,7 +803,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(GPUCC),) +ifneq ($(NVCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc index 2b956730d4..f93c05b0b3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::setUp(); +#ifdef __CUDACC__ + CudaRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? 
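The fbridgecreate_/fbridgedelete_ pair above is the classic opaque-handle pattern for driving a C++ object from Fortran: Fortran holds only an address, and every call crosses back into C++ to act on it. A stripped-down sketch (names simplified; the real code stores a Bridge object behind a CppObjectInFortran pointer):

    #include <vector>

    extern "C"
    {
      // Fortran sees the handle as an opaque address (e.g. an INTEGER*8)
      void sketchcreate_( void** pphandle, const int* pnevt )
      {
        *pphandle = new std::vector<double>( *pnevt );
      }

      void sketchdelete_( void** pphandle )
      {
        delete static_cast<std::vector<double>*>( *pphandle ); // cast back before deleting
      }
    }

The trailing underscores match gfortran's default name-mangling convention, which is why the real entry points are spelled fbridgecreate_ and friends.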
@@ -69,8 +69,8 @@ extern "C"
   Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
   if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" );
   delete pbridge;
-#ifdef MGONGPUCPP_GPUIMPL
-  GpuRuntime::tearDown();
+#ifdef __CUDACC__
+  CudaRuntime::tearDown();
 #endif
 }
@@ -100,7 +100,7 @@
 {
   Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
   if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   // Use the device/GPU implementation in the CUDA library
   // (there is also a host implementation in this library)
   pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol );
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc
index 3743934f41..2fb445372d 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #include "mgOnGpuConfig.h"

@@ -13,7 +13,7 @@

 //--------------------------------------------------------------------------

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -40,7 +40,7 @@ namespace mg5amcCpu
   private:
     const int m_nevt; // The number of events in each iteration
     int m_iiter; // The iteration counter (for random number seeding)
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
     HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
     HostBufferMomenta m_hstMomenta; // Memory buffers for momenta
     HostBufferWeights m_hstWeights; // Memory buffers for sampling weights
@@ -105,7 +105,7 @@ namespace mg5amcCpu

 extern "C"
 {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   using namespace mg5amcGpu;
 #else
   using namespace mg5amcCpu;
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc
index 461ec5c3a5..572e28aaea 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc index 2bd7a9fcf9..989aba1fdc 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc index 6e8657edca..4243e9fcec 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h index f772885631..ee2fcbbde5 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc index de87dcaf64..d3d01102fd 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h index fe7d686938..6551d8da81 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h @@ -214,7 +214,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -242,7 +242,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -258,7 +258,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h index d4e37d19b3..6c0c4919e9 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,26 +10,13 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL -// Is this a GPU (CUDA, HIP) or CPU implementation? -#ifdef __CUDACC__ -#define MGONGPUCPP_GPUIMPL cuda -#elif defined __HIPCC__ -#define MGONGPUCPP_GPUIMPL hip -#else -#undef MGONGPUCPP_GPUIMPL -#endif - // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For CUDA, by default, it is supported -// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND -#elif defined __HIPCC__ -#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -65,28 +52,23 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#ifndef __CUDACC__ +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) +#endif + +// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) -#elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#else -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default in CUDA +#undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 -#else -#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -102,21 +84,17 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (CUDA complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA -#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA -#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA +// SANITY CHECKS (c++ complex number implementation) +#ifndef __CUDACC__ +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL #endif #endif -// SANITY CHECKS (C++ complex number implementation) -#ifndef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ +// SANITY CHECKS (cuda complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif 
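One detail of the sanity checks just restored above: the CUDA-side #error now fires only if all three MGONGPU_CUCXTYPE_* macros are defined at once, whereas the removed variant also caught each pairwise combination. A sketch of the stricter pairwise check together with the typedef that follows the surviving macro (cxtype_sketch is an illustrative name; the real typedef appears in mgOnGpuCxtypes.h below):

    #if ( defined MGONGPU_CUCXTYPE_THRUST && defined MGONGPU_CUCXTYPE_CUCOMPLEX ) || \
        ( defined MGONGPU_CUCXTYPE_THRUST && defined MGONGPU_CUCXTYPE_CXSMPL ) ||   \
        ( defined MGONGPU_CUCXTYPE_CUCOMPLEX && defined MGONGPU_CUCXTYPE_CXSMPL )
    #error You must CHOOSE (ONE AND) ONLY ONE cuda complex type
    #endif
    #if defined MGONGPU_CUCXTYPE_THRUST
    #include <thrust/complex.h>
    typedef thrust::complex<double> cxtype_sketch;
    #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX
    #include <cuComplex.h>
    typedef cuDoubleComplex cxtype_sketch;
    #endif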
#endif @@ -153,7 +131,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -164,7 +142,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD +#ifdef __CUDACC__ // CUDA implementation has no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -194,9 +172,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -208,8 +186,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA/HIP declaration specifiers for C++ -#ifndef MGONGPUCPP_GPUIMPL +// Define empty CUDA declaration specifiers for C++ +#ifndef __CUDACC__ #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h index 4e7ab03fa2..0cb2f1db7e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h index 6f6cee64d6..a1cde16a67 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h index 7904b93c61..9d3e82b1e3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttggg.sa/src/rambo.h b/epochX/cudacpp/gg_ttggg.sa/src/rambo.h index cd7e1008ea..e02ea52496 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
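One note on the mgOnGpuVectors.h hunks above before rambo.h continues: everything there is gated on MGONGPU_CPPSIMD, with neppV the number of events packed into one vector register (neppV = 1 in the CUDA branch, since the GPU gets its parallelism from threads instead). A sketch of the gcc/clang vector-extension mechanism underneath fptype_v (double precision and 256-bit AVX2 registers assumed for illustration):

    typedef double fptype;
    // 32 bytes = 4 doubles per register, i.e. neppV = 4 in this configuration
    typedef fptype fptype_v __attribute__( ( vector_size( 32 ) ) );

    fptype_v scaleEvents( const fptype_v momenta )
    {
      return 2.0 * momenta; // one SIMD instruction processes neppV events at once
    }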
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT index 84a883fbb0..a134b5fef9 100644 --- a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT @@ -15,7 +15,6 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_file.inc index 4457933199..0c895f2b2c 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV1_1.o FFV1_0.o VVV1_0.o FFV1_2.o FFV1P0_3.o +ALOHARoutine = FFV1_1.o FFV1_2.o VVV1_0.o FFV1_0.o FFV1P0_3.o diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h index c04628dfd1..4cafe0c997 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
mg5amcCpu::CPPProcess process( /*verbose=*/false );
m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPUCPP_GPUIMPL
+#endif // __CUDACC__
process.initProc( "../../Cards/param_card.dat" );
}
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
template<typename FORTRANFPTYPE>
void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads )
{
@@ -268,7 +268,7 @@ namespace mg5amcCpu
}
#endif
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
template<typename FORTRANFPTYPE>
void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta,
const FORTRANFPTYPE* gs,
@@ -283,14 +283,14 @@ namespace mg5amcCpu
constexpr int neppM = MemoryAccessMomenta::neppM;
if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
{
- gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
+ checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
}
else
{
- gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
+ checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
//const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
- gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+ dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
}
if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
{
@@ -333,7 +333,7 @@ namespace mg5amcCpu
}
#endif
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
template<typename FORTRANFPTYPE>
void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta,
const FORTRANFPTYPE* gs,
@@ -388,7 +388,7 @@ namespace mg5amcCpu
// - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
//
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
template<typename Tin, typename Tout>
__global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
{
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc
index 90c7f2d3b8..cef4cb3c71 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc
@@ -1,11 +1,10 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
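Bridge::gpu_sequence in the hunk above copies the Fortran-layout momenta to the device and, unless the two layouts already coincide (neppM == 1 and matching fptype), launches dev_transposeMomentaF2C to convert them to the AOSOA layout documented in the comment just above ('momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM'). A hedged host-side sketch of the index mapping implied by that comment (the function name and signature are illustrative, not the actual kernel code):

// Offset of the (ievt, ipar, ip4) momentum component in an AOSOA buffer with neppM events per page
inline size_t aosoaIndex( size_t ievt, size_t ipar, size_t ip4, size_t npar, size_t np4, size_t neppM )
{
  const size_t ipagM = ievt / neppM; // page of events
  const size_t ieppM = ievt % neppM; // event within the page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}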
#include "BridgeKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -15,7 +14,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -46,7 +45,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -97,7 +96,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h index 3efef8ce97..15eb4bff4d 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc index 010bc4cbd0..985b39f576 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,16 +1,15 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" -#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc index c15b39844d..0b355a3c8d 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#include "CrossSectionKernels.h"
-#include "GpuAbstraction.h"
#include "MemoryAccessMatrixElements.h"
#include "MemoryAccessWeights.h"
#include "MemoryBuffers.h"
@@ -78,7 +77,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL )
//============================================================================
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -186,7 +185,7 @@ namespace mg5amcCpu
//============================================================================
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
namespace mg5amcGpu
{
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h
index 4d9659e04e..7933ca4bbf 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#ifndef CROSSSECTIONKERNELS_H
#define CROSSSECTIONKERNELS_H 1
@@ -13,7 +13,7 @@
//============================================================================
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -96,7 +96,7 @@ namespace mg5amcCpu
//--------------------------------------------------------------------------
/*
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
// A class encapsulating the calculation of event statistics on a GPU device
class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents
{
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h
new file mode 100644
index 0000000000..64ce52f4b3
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef MG5AMC_CUDARUNTIME_H
+#define MG5AMC_CUDARUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include <cassert>
+#include <iostream>
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef __CUDACC__ /* clang-format off */
+#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
+inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != cudaSuccess )
+  {
+    printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == cudaSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+namespace mg5amcGpu
+{
+  // Instantiate a CudaRuntime at the beginning of the application's main to
+  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct CudaRuntime final
+  {
+    CudaRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~CudaRuntime() { tearDown( m_debug ); }
+    CudaRuntime( const CudaRuntime& ) = delete;
+    CudaRuntime( CudaRuntime&& ) = delete;
+    CudaRuntime& operator=( const CudaRuntime& ) = delete;
+    CudaRuntime& operator=( CudaRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl;
+      checkCuda( cudaSetDevice( 0 ) ); // SLOW!
+    }
+
+    // Tear down CUDA application (call cudaDeviceReset)
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck
+    // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
+    static void tearDown( const bool debug = true )
+    {
+      if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl;
+      checkCuda( cudaDeviceReset() );
+    }
+  };
+
+}
+#endif
+
+//--------------------------------------------------------------------------
+
+#endif // MG5AMC_CUDARUNTIME_H
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc
index 38c477c17a..eb56333b03 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc
@@ -1,9 +1,9 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A.
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h index b425a5bade..48b51e0a49 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h index d2ff326e20..fd7734ce42 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..30257195b6 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#include "MatrixElementKernels.h"
#include "CPPProcess.h"
-#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation
+#include "CudaRuntime.h"
#include "MemoryAccessMomenta.h"
#include "MemoryBuffers.h"
@@ -14,7 +14,7 @@
//============================================================================
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
namespace mg5amcCpu
{
@@ -143,7 +143,7 @@ namespace mg5amcCpu
//============================================================================
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
namespace mg5amcGpu
{
@@ -202,13 +202,13 @@ namespace mg5amcGpu
PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
DeviceBufferHelicityMask devIsGoodHel( ncomb );
// ... 0d1. Compute good helicity mask on the device
- gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
+ computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+ sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
#else
- gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+ sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
#endif
- checkGpu( gpuPeekAtLastError() );
+ checkCuda( cudaPeekAtLastError() );
// ... 0d2. Copy back good helicity mask to the host
copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
// ... 0d3. Copy back good helicity list to constant memory on the device
@@ -219,19 +219,19 @@ namespace mg5amcGpu
void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
{
- gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
+ computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
#ifndef MGONGPU_NSIGHT_DEBUG
constexpr unsigned int sharedMemSize = 0;
#else
constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+ sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
#else
- gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+ sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
#endif
- checkGpu( gpuPeekAtLastError() );
- checkGpu( gpuDeviceSynchronize() );
+ checkCuda( cudaPeekAtLastError() );
+ checkCuda( cudaDeviceSynchronize() );
}
//--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h
index 72bd8f195b..23e84757a2 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#ifndef MATRIXELEMENTKERNELS_H
#define MATRIXELEMENTKERNELS_H 1
@@ -10,7 +10,7 @@
#include "MemoryBuffers.h"
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -81,7 +81,7 @@ namespace mg5amcCpu
//--------------------------------------------------------------------------
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
// A class encapsulating matrix element calculations on a CPU host
class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents
{
@@ -130,7 +130,7 @@ namespace mg5amcCpu
//--------------------------------------------------------------------------
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
// A class encapsulating matrix element calculations on a GPU device
class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
{
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h
index db73e4e064..c82a6c7635 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
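The kernel launches restored in the MatrixElementKernels.cc hunks above follow a fixed pattern: launch with <<<m_gpublocks, m_gputhreads>>>, then checkCuda( cudaPeekAtLastError() ) to catch launch errors and checkCuda( cudaDeviceSynchronize() ) to surface errors from the asynchronous execution. A minimal sketch of that pattern together with the CudaRuntime RAII helper from the new CudaRuntime.h (illustrative only; dummyKernel is a placeholder, not repository code):

#include "CudaRuntime.h"
__global__ void dummyKernel() {}
int main()
{
  mg5amcGpu::CudaRuntime cudaRuntime( /*debug=*/true ); // cudaSetDevice(0) now, cudaDeviceReset() at destruction
  dummyKernel<<<1, 32>>>();
  checkCuda( cudaPeekAtLastError() );   // detect launch errors
  checkCuda( cudaDeviceSynchronize() ); // detect asynchronous execution errors
  return 0;
}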
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h index 38fade09fb..0ac4faa3c7 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h index 40cb089135..e2988d39f3 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
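The MemoryAccessHelpers.h and MemoryAccessMomenta.h hunks above spell out the GPU memory-access strategy: each thread handles one event (ievt = blockDim.x * blockIdx.x + threadIdx.x), and neppM is chosen so that the events of a page are contiguous in memory, letting the per-thread loads of a given momentum component coalesce. A hedged sketch of why this works (fptype as in mgOnGpuConfig.h; the kernel itself is illustrative, not from the repository):

__global__ void readFirstComponent( const fptype* momenta, fptype* out, int npar, int np4, int neppM )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
  const int ipagM = ievt / neppM;
  const int ieppM = ievt % neppM; // consecutive threads -> consecutive ieppM -> adjacent addresses
  out[ievt] = momenta[( ( ipagM * npar + 0 ) * np4 + 0 ) * neppM + ieppM]; // ipar=0, ip4=0
}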
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h index 08faccff0f..e9b197368e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h index 7756a71621..3093e6ed18 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
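As the MemoryAccessVectors.h hunk above notes, the mg5amcCpu branches exist because the C++ backend processes neppV events at a time through the compiler vector types of mgOnGpuVectors.h (fptype_sv and friends). A rough stand-in using GNU vector extensions, to illustrate the idea only (the real fptype_sv/neppV definitions differ):

typedef double fptype_v __attribute__( ( vector_size( 32 ) ) ); // 4 doubles in one 256-bit register
constexpr int neppV_example = sizeof( fptype_v ) / sizeof( double );
inline fptype_v axpy( fptype_v x, fptype_v y, double a ) { return x * a + y; } // element-wise, no explicit loop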
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "Parameters_sm.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - gpuMallocHost( &( this->m_data ), this->bytes() ); + checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); } virtual ~PinnedHostBufferBase() { - gpuFreeHost( this->m_data ); + checkCuda( cudaFreeHost( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - gpuMalloc( &( this->m_data ), this->bytes() ); + checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); } virtual ~DeviceBufferBase() { - gpuFree( this->m_data ); + checkCuda( cudaFree( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); } #endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 3732ec6679..3a9b2fddaf 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -48,7 +49,7 @@ // Process: g d > t t~ d WEIGHTED<=3 @1 // Process: g s > t t~ s WEIGHTED<=3 @1 -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +83,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -92,7 +93,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -120,13 +121,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -153,7 +154,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -189,7 +190,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -202,10 +203,8 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop -#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -244,7 +243,7 @@ namespace mg5amcCpu // Wavefunction(s) for diagram number 1 vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); -#if not( defined MGONGPUCPP_GPUIMPL and defined MGONGPU_TEST_DIVERGENCE ) +#if not( defined __CUDACC__ and defined MGONGPU_TEST_DIVERGENCE ) imzxxx( momenta, cHel[ihel][1], +1, w_fp[1], 1 ); // NB: imzxxx only uses pz #else if( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 == 0 ) @@ -349,7 +348,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Pre-compute a constexpr triangular color matrix properly 
normalized #475 struct TriangularNormalizedColorMatrix { @@ -406,7 +405,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -465,7 +464,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -528,8 +527,8 @@ namespace mg5amcCpu { 1, -1, 1, 1, 1 }, { 1, -1, 1, -1, -1 }, { 1, -1, 1, -1, 1 } }; -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -570,9 +569,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); - //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -609,7 +608,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] + // [Use __NVCC__ instead of __CUDACC__ here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file]
// [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712]
#ifdef __NVCC__
@@ -674,12 +673,12 @@ namespace mg5amcCpu
__global__ void /* clang-format off */
computeDependentCouplings( const fptype* allgs, // input: Gs[nevt]
fptype* allcouplings // output: couplings[nevt*ndcoup*2]
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
, const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
#endif
) /* clang-format on */
{
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
using namespace mg5amcGpu;
using G_ACCESS = DeviceAccessGs;
using C_ACCESS = DeviceAccessCouplings;
@@ -700,7 +699,7 @@ namespace mg5amcCpu
//--------------------------------------------------------------------------
-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifdef __CUDACC__ /* clang-format off */
__global__ void
sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
@@ -829,9 +828,9 @@ namespace mg5amcCpu
nGoodHel++;
}
}
-#ifdef MGONGPUCPP_GPUIMPL
- gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) );
- gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) );
+#ifdef __CUDACC__
+ checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) );
+ checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) );
#else
cNGoodHel = nGoodHel;
for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel];
@@ -855,7 +854,7 @@ namespace mg5amcCpu
#endif
int* allselhel, // output: helicity selection[nevt]
int* allselcol // output: color selection[nevt]
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
, const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
#endif
) /* clang-format on */
@@ -876,7 +875,7 @@ namespace mg5amcCpu
// Denominators: spins, colors and identical particles
constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343)
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
// Remember: in CUDA this is a kernel for one event, in c++ this processes n events
const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
#else
@@ -890,12 +889,9 @@ namespace mg5amcCpu
#endif
// Start sigmaKin_lines
-
-#include "GpuAbstraction.h"
-
// === PART 0 - INITIALISATION (before calculate_wavefunctions) ===
// Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
allMEs[ievt] = 0;
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
allNumerators[ievt] = 0;
@@ -923,7 +919,7 @@ namespace mg5amcCpu
// === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS ===
// (in both CUDA and C++, using precomputed good helicities)
-#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
+#ifdef __CUDACC__ // CUDA OR C++
// *** START OF PART 1a - CUDA (one event per CPU thread) ***
// Running sum of partial amplitudes squared for event by event color selection (#402)
@@ -1127,7 +1123,7 @@ namespace mg5amcCpu
// Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
// [NB 'sum over final spins, average over initial spins', eg see
// https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
allMEs[ievt] /= helcolDenominators[0];
#ifdef
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index ee747a8ae4..9554285817 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h new file mode 120000 index 0000000000..ce9e1a487a --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h @@ -0,0 +1 @@ +../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc index 1bad694d1c..f1e75b9252 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
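The tail of sigmaKin shown above turns the running sum of |M|^2 over good helicities into the physical result: divide by helcolDenominators[0] (96 here, i.e. 4 initial-state spin combinations times 24 initial-state colour combinations for a gluon-quark initial state), then optionally reweight by the multichannel numerator/denominator. A scalar sketch of that arithmetic (function name and signature are illustrative only):

// Average |M|^2 over initial-state spins/colours, with optional single-diagram-enhancement reweighting
inline double finalizeME( double sumOverGoodHel, double numerator, double denominator, bool multichannel )
{
  double me = sumOverGoodHel / 96.; // helcolDenominators[0] for this gq_ttq subprocess
  if( multichannel ) me *= numerator / denominator;
  return me;
}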
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,7 +12,6 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" -#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -64,7 +63,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -78,7 +77,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -104,11 +103,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -116,7 +115,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -149,7 +148,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs" ); #endif } else if( arg == "--curhst" ) @@ -166,7 +165,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -240,13 +239,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -264,14 +263,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ - // --- 00. Initialise GPU - // Instantiate a GpuRuntime at the beginnining of the application's main. 
- // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. - const std::string cdinKey = "00 GpuInit"; + // --- 00. Initialise cuda + // Instantiate a CudaRuntime at the beginnining of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + const std::string cdinKey = "00 CudaInit"; timermap.start( cdinKey ); - GpuRuntime GpuRuntime( debug ); + CudaRuntime cudaRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -293,7 +292,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -301,7 +300,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -309,7 +308,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -317,7 +316,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -334,7 +333,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -343,7 +342,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -352,7 +351,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -360,7 +359,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -368,7 +367,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -404,7 +403,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -422,7 +421,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( 
"RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -433,7 +432,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -441,7 +440,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -483,7 +482,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -515,7 +514,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -560,7 +559,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -589,7 +588,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( !bridge ) { // --- 3b. CopyDToH MEs @@ -732,19 +731,15 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; -#elif defined __HIPCC__ - rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or HIP or C++? + // -- CUDA or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; -#elif defined __HIPCC__ - wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -769,12 +764,6 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif -#elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL - wrkflwtxt += "CXS:"; -#else - wrkflwtxt += "???:"; // no path to this statement -#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -800,7 +789,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? 
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -856,7 +845,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -877,8 +866,6 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" -#elif defined __HIPCC__ - << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -905,21 +892,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "Complex type = STD::COMPLEX" << std::endl +#endif #else - << "Complex type = ???" << std::endl // no path to this statement... + << "Complex type = STD::COMPLEX" << std::endl #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -950,7 +937,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1046,14 +1033,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "\"STD::COMPLEX\"," << std::endl +#endif #else - << "\"???\"," << std::endl // no path to this statement... + << "\"STD::COMPLEX\"," << std::endl #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1061,7 +1048,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index af91413156..408fbb52f9 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. 
Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -48,7 +49,7 @@ // Process: g d~ > t t~ d~ WEIGHTED<=3 @1 // Process: g s~ > t t~ s~ WEIGHTED<=3 @1 -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +83,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -92,7 +93,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -120,13 +121,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -153,7 +154,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -189,7 +190,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -202,10 +203,8 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop -#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -342,7 +341,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -399,7 +398,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef MGONGPUCPP_GPUIMPL 
+#ifndef __CUDACC__ // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -458,7 +457,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -521,8 +520,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, -1 }, { 1, 1, 1, -1, 1 }, { 1, 1, 1, -1, -1 } }; -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -563,9 +562,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); - //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -602,7 +601,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] + // [Use __NVCC__ instead of __CUDACC__ here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -667,12 +666,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -693,7 +692,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -822,9 +821,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); - gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); + checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -848,7 +847,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events 
(for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -869,7 +868,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -883,12 +882,9 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - -#include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -916,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ +#ifdef __CUDACC__ // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1120,7 +1116,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index 53bb5ccd94..1d1e130ec2 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h new file mode 120000 index 0000000000..ce9e1a487a --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h @@ -0,0 +1 @@ +../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc index 1bad694d1c..f1e75b9252 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,7 +12,6 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" -#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -64,7 +63,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -78,7 +77,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
-#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -104,11 +103,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode { RamboHost = 1, RamboDevice = 2 }; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -149,7 +148,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs" ); #endif } else if( arg == "--curhst" ) { @@ -166,7 +165,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -240,13 +239,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -264,14 +263,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ - // --- 00. Initialise GPU - // Instantiate a GpuRuntime at the beginnining of the application's main. - // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. - const std::string cdinKey = "00 GpuInit"; + // --- 00. Initialise cuda + // Instantiate a CudaRuntime at the beginning of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + const std::string cdinKey = "00 CudaInit"; timermap.start( cdinKey ); - GpuRuntime GpuRuntime( debug ); + CudaRuntime cudaRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -293,7 +292,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -301,7 +300,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -309,7 +308,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -317,7 +316,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -334,7 +333,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -343,7 +342,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -352,7 +351,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -360,7 +359,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -368,7 +367,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -404,7 +403,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -422,7 +421,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -433,7 +432,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -441,7 +440,7 
@@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -483,7 +482,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -515,7 +514,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -560,7 +559,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -589,7 +588,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( !bridge ) { // --- 3b. CopyDToH MEs @@ -732,19 +731,15 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; -#elif defined __HIPCC__ - rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or HIP or C++? + // -- CUDA or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; -#elif defined __HIPCC__ - wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -769,12 +764,6 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif -#elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL - wrkflwtxt += "CXS:"; -#else - wrkflwtxt += "???:"; // no path to this statement -#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -800,7 +789,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -856,7 +845,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -877,8 +866,6 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" -#elif defined __HIPCC__ - << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -905,21 +892,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "Complex type = STD::COMPLEX" << std::endl +#endif #else - << "Complex type = ???" 
<< std::endl // no path to this statement... + << "Complex type = STD::COMPLEX" << std::endl #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -950,7 +937,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1046,14 +1033,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "\"STD::COMPLEX\"," << std::endl +#endif #else - << "\"???\"," << std::endl // no path to this statement... + << "\"STD::COMPLEX\"," << std::endl #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1061,7 +1048,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc index 79abbcc4f8..da68aa9255 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaInitial() { - gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); + getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaFinal() { - gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h index 7c214cd74b..184089efd7 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h index 21d63beeac..188a72c2c9 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index bcb73d7f01..a0397e9ecc 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,139 +89,69 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If CUDA_HOME is not set, try to set it from the location of NVCC - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif - - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - - CUDATESTFLAGS = -lcuda - - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif - - # Set the host C++ compiler for GPUCC via "-ccbin " - # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) - GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif - -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif - - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 
6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +#=== Configure the CUDA compiler + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) +# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the location of nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + NVCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+ CUOPTFLAGS = -lineinfo + CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) +else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) +else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override NVCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= +endif - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif +# Set the host C++ compiler for nvcc via "-ccbin " +# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) +CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) +# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) +ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) +CUFLAGS += -allow-unsupported-compiler endif - #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -233,9 +163,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) - override GPUCC:=ccache $(GPUCC) +ifneq ($(NVCC),) + ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) + override NVCC:=ccache $(NVCC) endif endif @@ -259,7 +189,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - GPUFLAGS+= -Xcompiler -mno-float128 + CUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -269,10 +199,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -323,10 +253,7 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + ifeq ($(NVCC),) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -401,13 +328,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -416,7 +343,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + CUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -425,7 +352,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + CUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -477,11 +404,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -494,7 +421,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) +ifneq ($(NVCC),) 
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -525,16 +452,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -543,14 +469,11 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) -# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifeq ($(findstring nvcc,$(GPUCC)),nvcc) - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math -else - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math +ifneq ($(NVCC),) +$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math endif endif @@ -566,10 +489,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(GPUCC),) -GPUFLAGS += -Wno-deprecated-builtins +ifneq ($(NVCC),) +CUFLAGS += -Xcompiler -Wno-deprecated-builtins endif endif @@ -577,8 +500,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(NVCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -605,7 +528,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -617,11 +540,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -638,16 +561,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
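The FPTYPE switch above passes the same -DMGONGPU_FPTYPE_* macros to both CXXFLAGS and CUFLAGS, so that host and device code agree on the floating-point types. As a rough illustration, a minimal sketch of how such macros could map to the fptype/fptype2 names used throughout this patch; the actual mapping lives in mgOnGpuConfig.h, which is not shown here, so the typedef layout below is an assumption:

// Sketch only: assumed typedef layout, not the actual mgOnGpuConfig.h
#if defined MGONGPU_FPTYPE_DOUBLE
typedef double fptype; // FPTYPE=d and FPTYPE=m builds
#elif defined MGONGPU_FPTYPE_FLOAT
typedef float fptype; // FPTYPE=f builds
#endif
#if defined MGONGPU_FPTYPE2_DOUBLE
typedef double fptype2; // FPTYPE=d builds
#elif defined MGONGPU_FPTYPE2_FLOAT
typedef float fptype2; // FPTYPE=f builds; mixed FPTYPE=m builds pair a double fptype with a float fptype2
#endif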
#------------------------------------------------------------------------------- @@ -673,17 +596,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -695,7 +618,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -708,7 +631,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -720,12 +643,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -749,14 +672,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(GPUCC),) # link only runTest.o +ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -859,9 +782,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo GPUCC=$(GPUCC) -ifneq ($(GPUCC),) - $(GPUCC) --version + @echo NVCC=$(NVCC) +ifneq ($(NVCC),) + $(NVCC) --version endif @echo "" @echo CXX=$(CXX) @@ -880,7 +803,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(GPUCC),) +ifneq ($(NVCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc index 2b956730d4..f93c05b0b3 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::setUp(); +#ifdef __CUDACC__ + CudaRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? 
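The fbridgecreate_/fbridgedelete_ pair above is the C symbol interface that the Fortran MadEvent code calls through ISO C bindings. A minimal standalone sketch of the same lifecycle driven from C++ (the buffer sizes nevt=16, npar=4, np4=4 are illustrative assumptions only, and linking against the generated library is assumed):

// Hypothetical driver, not part of this patch: create a Bridge, (fill buffers and
// call fbridgesequence_ here), then delete it, mirroring the Fortran call sequence.
class CppObjectInFortran; // opaque handle, as declared in fbridge.cc

extern "C"
{
  void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F );
  void fbridgedelete_( CppObjectInFortran** ppbridge );
}

int main()
{
  const int nevt = 16, npar = 4, np4 = 4; // illustrative sizes only
  CppObjectInFortran* pbridge = nullptr;
  fbridgecreate_( &pbridge, &nevt, &npar, &np4 ); // also calls CudaRuntime::setUp() on CUDA
  // ... transfer momenta/couplings and call fbridgesequence_ here ...
  fbridgedelete_( &pbridge ); // also calls CudaRuntime::tearDown() on CUDA
  return 0;
}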
@@ -69,8 +69,8 @@ extern "C" Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::tearDown(); +#ifdef __CUDACC__ + CudaRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc index 3743934f41..2fb445372d 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc index 461ec5c3a5..572e28aaea 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
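Throughout these files the patch reverts to compile-time namespace selection: a single translation unit becomes mg5amcGpu under nvcc and mg5amcCpu under a plain C++ compiler. A self-contained sketch of the pattern (process_one_event is a made-up name for illustration):

#ifdef __CUDACC__
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  inline double process_one_event( const double in ) { return 2 * in; } // stand-in for the real kernels
}

#ifdef __CUDACC__
using namespace mg5amcGpu;
#else
using namespace mg5amcCpu;
#endif

int main() { return process_one_event( 21. ) == 42. ? 0 : 1; }

Because the namespace differs per backend, the CPU and GPU object files can be linked into one test executable without symbol clashes, which is exactly what the runTest build above relies on.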
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc index 2bd7a9fcf9..989aba1fdc 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc index 6e8657edca..4243e9fcec 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h index 9b0bfb10ee..e15ce959e9 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc index 459dae9e99..7255e49119 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h index db5520aa96..c935779eb3 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h @@ -211,7 +211,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -238,7 +238,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -254,7 +254,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h index 390766116b..881353abac 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,26 +10,13 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 -// Is this a GPU (CUDA, HIP) or CPU implementation? -#ifdef __CUDACC__ -#define MGONGPUCPP_GPUIMPL cuda -#elif defined __HIPCC__ -#define MGONGPUCPP_GPUIMPL hip -#else -#undef MGONGPUCPP_GPUIMPL -#endif - // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For CUDA, by default, it is supported -// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND -#elif defined __HIPCC__ -#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -65,28 +52,23 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#ifndef __CUDACC__ +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) +#endif + +// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) -#elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#else -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default in CUDA +#undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 -#else -#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -102,21 +84,17 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (CUDA complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA -#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA -#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA +// SANITY CHECKS (c++ complex number implementation) +#ifndef __CUDACC__ +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL #endif #endif -// SANITY CHECKS (C++ complex number implementation) -#ifndef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ +// SANITY CHECKS (cuda complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif 
#endif @@ -153,7 +131,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -164,7 +142,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD +#ifdef __CUDACC__ // CUDA implementation has no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -194,9 +172,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -208,8 +186,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA/HIP declaration specifiers for C++ -#ifndef MGONGPUCPP_GPUIMPL +// Define empty CUDA declaration specifiers for C++ +#ifndef __CUDACC__ #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h index 4e7ab03fa2..0cb2f1db7e 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
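The hunk above restores the single-GPU-backend form of mgOnGpuConfig.h, including the empty declaration specifiers for C++-only builds. This is what lets one header annotate functions once and compile under both g++ and nvcc; a minimal sketch (fpmax_example is a made-up name, and __global__ is handled the same way):

#ifndef __CUDACC__
#define __host__   // empty in C++-only builds, real CUDA keywords under nvcc
#define __device__
#endif

__host__ __device__ inline double fpmax_example( const double& a, const double& b )
{
  return ( a > b ? a : b );
}

int main() { return fpmax_example( 1., 2. ) == 2. ? 0 : 1; }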
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h index 6f6cee64d6..a1cde16a67 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
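The mgOnGpuCxtypes.h hunk above selects the complex type per backend (thrust::complex in CUDA, std::complex or the home-grown cxsmpl in C++) behind a single cxtype typedef plus cxmake factories. A reduced sketch of the mechanism, assuming only the thrust and std::complex branches:

#include <complex>
#ifdef __CUDACC__
#include <thrust/complex.h>
typedef thrust::complex<double> cxtype; // CUDA branch
#else
typedef std::complex<double> cxtype; // C++ branch (the real header may pick cxsmpl instead)
#endif

inline cxtype cxmake( const double r, const double i ) { return cxtype( r, i ); }

int main()
{
  const cxtype c = cxmake( 1., -1. );
  return c.real() == 1. ? 0 : 1; // the same user code compiles against either backend
}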
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h index 7904b93c61..9d3e82b1e3 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gq_ttq.mad/src/rambo.h b/epochX/cudacpp/gq_ttq.mad/src/rambo.h index cd7e1008ea..e02ea52496 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/rambo.h +++ b/epochX/cudacpp/gq_ttq.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
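The mgOnGpuVectors.h changes above hinge on the scalar-or-vector types (fptype_sv and friends): one event per GPU thread in CUDA, neppV events per SIMD vector in C++. A standalone sketch assuming the GCC/clang vector_size extension used by the real header (the width 4 is illustrative; the real neppV follows the AVX build flags):

typedef double fptype;
#ifdef __CUDACC__
typedef fptype fptype_sv; // scalar: one event per GPU thread
constexpr int neppV = 1;
#else
constexpr int neppV = 4; // illustrative SIMD width
typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) );
typedef fptype_v fptype_sv; // vector: neppV events per C++ loop iteration
#endif

int main()
{
  fptype_sv x = {}; // zero-initialised scalar (CUDA) or vector (C++)
#ifndef __CUDACC__
  for( int i = 0; i < neppV; i++ ) x[i] = i; // SIMD lane = event index within the page
#endif
  return 0;
}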
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT index 84a883fbb0..a134b5fef9 100644 --- a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT @@ -15,7 +15,6 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h index c04628dfd1..4cafe0c997 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
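For reference, the log-weight computed in the rambo.h hunk above (wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1], with wt = po2log for two particles) is consistent with the logarithm of the flat Rambo phase-space volume for n massless final-state particles at total energy E; under that reading, with z[] precomputing the constant terms:

\ln w \;=\; ( 2n - 4 )\,\ln E \;+\; z_n,
\qquad
z_n \;=\; ( n - 1 )\,\ln\frac{\pi}{2} \;-\; \ln\,( n - 1 )! \;-\; \ln\,( n - 2 )!\,,

which for n = 2 reduces to \ln w = \ln( \pi / 2 ), i.e. the po2log constant in the code.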
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; mg5amcCpu::CPPProcess process( /*verbose=*/false ); m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ process.initProc( "../../Cards/param_card.dat" ); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); } else { - gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc index 90c7f2d3b8..cef4cb3c71 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
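The gpu_sequence/cpu_sequence hunks above revolve around transposing momenta from the Fortran AOS layout momenta[ievt][ipar][ip4] into the cudacpp AOSOA layout momenta[ipagM][ipar][ip4][ieppM] with ievt = ipagM*neppM + ieppM. A host-side sketch of that index mapping (the sizes are made up; the real code templates this over the floating-point types and, on GPU, does it in the dev_transposeMomentaF2C kernel):

#include <vector>

constexpr int npar = 4, np4 = 4, neppM = 4; // illustrative AOSOA parameters

void transposeMomentaF2C_sketch( const double* in, double* out, const int nevt )
{
  for( int ievt = 0; ievt < nevt; ievt++ )
    for( int ipar = 0; ipar < npar; ipar++ )
      for( int ip4 = 0; ip4 < np4; ip4++ )
      {
        const int ipagM = ievt / neppM, ieppM = ievt % neppM;
        out[( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM] = // AOSOA destination
          in[( ievt * npar + ipar ) * np4 + ip4];                      // AOS source
      }
}

int main()
{
  const int nevt = 8; // a multiple of neppM, so events fill a whole number of pages
  std::vector<double> aos( nevt * npar * np4, 1. ), aosoa( nevt * npar * np4 );
  transposeMomentaF2C_sketch( aos.data(), aosoa.data(), nevt );
  return 0;
}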
#include "BridgeKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -15,7 +14,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -46,7 +45,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -97,7 +96,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h index 3efef8ce97..15eb4bff4d 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc index 010bc4cbd0..985b39f576 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,16 +1,15 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" -#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc index c15b39844d..0b355a3c8d 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -78,7 +77,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -186,7 +185,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h index 4d9659e04e..7933ca4bbf 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h new file mode 100644 index 0000000000..64ce52f4b3 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_CUDARUNTIME_H +#define MG5AMC_CUDARUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include <cassert> +#include <iostream> + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef __CUDACC__ /* clang-format off */ +#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } +inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +{ + if( code != cudaSuccess ) + { + printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); + if( abort ) assert( code == cudaSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ +namespace mg5amcGpu +{ + // Instantiate a CudaRuntime at the beginning of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct CudaRuntime final + { + CudaRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~CudaRuntime() { tearDown( m_debug ); } + CudaRuntime( const CudaRuntime& ) = delete; + CudaRuntime( CudaRuntime&& ) = delete; + CudaRuntime& operator=( const CudaRuntime& ) = delete; + CudaRuntime& operator=( CudaRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; + checkCuda( cudaSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; + checkCuda( cudaDeviceReset() ); + } + }; + +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc index 38c477c17a..eb56333b03 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A.
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h index b425a5bade..48b51e0a49 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h index d2ff326e20..fd7734ce42 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..30257195b6 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. 
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkGpu( gpuPeekAtLastError() ); + checkCuda( cudaPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkCuda( cudaPeekAtLastError() ); + checkCuda( cudaDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h index 72bd8f195b..23e84757a2 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h index db73e4e064..c82a6c7635 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h index 38fade09fb..0ac4faa3c7 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h index 40cb089135..e2988d39f3 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
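KernelAccessHelper above computes the event index as ievt = blockDim.x * blockIdx.x + threadIdx.x, and the MemoryAccessMomenta comment explains why neppM is best matched to (a power of two times) the number of fptypes in a 32-byte cacheline: consecutive threads then read consecutive words, i.e. the loads coalesce. A small illustrative CUDA kernel, not taken from the patch:

#include <cuda_runtime.h>

constexpr int npar = 4, np4 = 4, neppM = 8; // e.g. 32 bytes / sizeof(float); illustrative values

__global__ void readEnergies( const float* momenta, float* energies )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one thread per event
  const int ipagM = ievt / neppM, ieppM = ievt % neppM;
  // energy (ip4=0) of particle 0 in AOSOA momenta[ipagM][ipar][ip4][ieppM]:
  // neighbouring threads differ only in ieppM, so the loads coalesce
  energies[ievt] = momenta[( ( ipagM * npar + 0 ) * np4 + 0 ) * neppM + ieppM];
}

int main()
{
  const int nevt = 32;
  float *d_mom = nullptr, *d_e = nullptr;
  cudaMalloc( reinterpret_cast<void**>( &d_mom ), nevt * npar * np4 * sizeof( float ) );
  cudaMalloc( reinterpret_cast<void**>( &d_e ), nevt * sizeof( float ) );
  readEnergies<<<1, nevt>>>( d_mom, d_e );
  cudaDeviceSynchronize();
  cudaFree( d_mom );
  cudaFree( d_e );
  return 0;
}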
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h index 08faccff0f..e9b197368e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h index 7756a71621..3093e6ed18 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
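MemoryBuffers.h, whose hunks follow below, wraps every cudaMalloc/cudaMallocHost/cudaFree pair in small RAII classes, so host, pinned-host and device buffers cannot leak on early returns. A reduced standalone sketch of the device-buffer case (simplified to double, without the template machinery of the real header):

#include <cstddef>
#include <cuda_runtime.h>

class DeviceBufferSketch
{
public:
  explicit DeviceBufferSketch( const size_t size ) : m_size( size ), m_data( nullptr )
  {
    cudaMalloc( reinterpret_cast<void**>( &m_data ), bytes() ); // acquire in constructor
  }
  ~DeviceBufferSketch() { cudaFree( m_data ); } // release in destructor
  DeviceBufferSketch( const DeviceBufferSketch& ) = delete;
  DeviceBufferSketch& operator=( const DeviceBufferSketch& ) = delete;
  double* data() { return m_data; }
  size_t bytes() const { return m_size * sizeof( double ); }
private:
  const size_t m_size;
  double* m_data;
};

int main()
{
  DeviceBufferSketch buf( 1024 ); // freed automatically at end of scope
  cudaMemset( buf.data(), 0, buf.bytes() );
  return 0;
}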
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "Parameters_sm.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - gpuMallocHost( &( this->m_data ), this->bytes() ); + checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); } virtual ~PinnedHostBufferBase() { - gpuFreeHost( this->m_data ); + checkCuda( cudaFreeHost( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - gpuMalloc( &( this->m_data ), this->bytes() ); + checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); } virtual ~DeviceBufferBase() { - gpuFree( this->m_data ); + checkCuda( cudaFree( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t 
sizePerEventGs = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); } #endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc index 0c59a4bcd3..abcb3c9654 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -48,7 +49,7 @@ // Process: g d > t t~ d WEIGHTED<=3 @1 // Process: g s > t t~ s WEIGHTED<=3 @1 -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +83,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -92,7 +93,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -120,13 +121,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -153,7 +154,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -189,7 +190,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -202,10 +203,8 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop -#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -244,7 +243,7 @@ namespace mg5amcCpu // Wavefunction(s) for diagram number 1 vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); -#if not( defined MGONGPUCPP_GPUIMPL and defined MGONGPU_TEST_DIVERGENCE ) +#if not( defined __CUDACC__ and defined MGONGPU_TEST_DIVERGENCE ) imzxxx( momenta, cHel[ihel][1], +1, w_fp[1], 1 ); // NB: imzxxx only uses pz #else if( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 == 0 ) @@ -344,7 +343,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Pre-compute a constexpr triangular color matrix properly 
normalized #475 struct TriangularNormalizedColorMatrix { @@ -401,7 +400,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -460,7 +459,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -523,8 +522,8 @@ namespace mg5amcCpu { 1, -1, 1, 1, 1 }, { 1, -1, 1, -1, -1 }, { 1, -1, 1, -1, 1 } }; -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -565,9 +564,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); - //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -604,7 +603,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] + // [Use __NVCC__ instead of __CUDACC__ here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -669,12 +668,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -695,7 +694,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -824,9 +823,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); - gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); + checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -850,7 +849,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -871,7 +870,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -885,12 +884,9 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - -#include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -918,7 +914,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ +#ifdef __CUDACC__ // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1122,7 +1118,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h index ee747a8ae4..9554285817 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h new file mode 120000 index 0000000000..ce9e1a487a --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h @@ -0,0 +1 @@ +../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc index 1bad694d1c..f1e75b9252 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
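The CPPProcess.cc hunks above revert gpuMemcpyToSymbol to checkCuda( cudaMemcpyToSymbol( ... ) ) for the helicity table and physics parameters held in __device__ __constant__ memory. A minimal sketch of that upload path follows; the table size and values are illustrative, not the generated ones, and error checking is omitted for brevity.

#include <cstdio>

__device__ __constant__ short cHelSketch[4][2]; // stand-in for the generated cHel table

__global__ void readHel( short* out )
{
  out[0] = cHelSketch[3][1]; // device code reads the constant-memory copy
}

int main()
{
  const short tHel[4][2] = { { -1, -1 }, { -1, 1 }, { 1, -1 }, { 1, 1 } };
  cudaMemcpyToSymbol( cHelSketch, tHel, 4 * 2 * sizeof( short ) ); // one-time host-side upload
  short* devOut;
  short hstOut = 0;
  cudaMalloc( &devOut, sizeof( short ) );
  readHel<<<1, 1>>>( devOut );
  cudaMemcpy( &hstOut, devOut, sizeof( short ), cudaMemcpyDeviceToHost );
  printf( "cHelSketch[3][1] = %d\n", hstOut );
  cudaFree( devOut );
  return 0;
}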
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,7 +12,6 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" -#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -64,7 +63,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -78,7 +77,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -104,11 +103,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -116,7 +115,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -149,7 +148,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs" ); #endif } else if( arg == "--curhst" ) @@ -166,7 +165,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -240,13 +239,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -264,14 +263,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ - // --- 00. Initialise GPU - // Instantiate a GpuRuntime at the beginnining of the application's main. 
- // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. - const std::string cdinKey = "00 GpuInit"; + // --- 00. Initialise cuda + // Instantiate a CudaRuntime at the beginnining of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + const std::string cdinKey = "00 CudaInit"; timermap.start( cdinKey ); - GpuRuntime GpuRuntime( debug ); + CudaRuntime cudaRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -293,7 +292,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -301,7 +300,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -309,7 +308,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -317,7 +316,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -334,7 +333,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -343,7 +342,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -352,7 +351,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -360,7 +359,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -368,7 +367,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -404,7 +403,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -422,7 +421,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( 
"RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -433,7 +432,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -441,7 +440,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -483,7 +482,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -515,7 +514,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -560,7 +559,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -589,7 +588,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( !bridge ) { // --- 3b. CopyDToH MEs @@ -732,19 +731,15 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; -#elif defined __HIPCC__ - rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or HIP or C++? + // -- CUDA or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; -#elif defined __HIPCC__ - wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -769,12 +764,6 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif -#elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL - wrkflwtxt += "CXS:"; -#else - wrkflwtxt += "???:"; // no path to this statement -#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -800,7 +789,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? 
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -856,7 +845,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -877,8 +866,6 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" -#elif defined __HIPCC__ - << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -905,21 +892,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "Complex type = STD::COMPLEX" << std::endl +#endif #else - << "Complex type = ???" << std::endl // no path to this statement... + << "Complex type = STD::COMPLEX" << std::endl #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -950,7 +937,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1046,14 +1033,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "\"STD::COMPLEX\"," << std::endl +#endif #else - << "\"???\"," << std::endl // no path to this statement... + << "\"STD::COMPLEX\"," << std::endl #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1061,7 +1048,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index c828a1d13b..12f74f99ea 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. 
Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -48,7 +49,7 @@ // Process: g d~ > t t~ d~ WEIGHTED<=3 @1 // Process: g s~ > t t~ s~ WEIGHTED<=3 @1 -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +83,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -92,7 +93,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -120,13 +121,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -153,7 +154,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -189,7 +190,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -202,10 +203,8 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop -#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -337,7 +336,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -394,7 +393,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef 
MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -453,7 +452,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -516,8 +515,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, -1 }, { 1, 1, 1, -1, 1 }, { 1, 1, 1, -1, -1 } }; -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -558,9 +557,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); - //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -597,7 +596,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] + // [Use __NVCC__ instead of __CUDACC__ here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -662,12 +661,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -688,7 +687,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -817,9 +816,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); - gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); + checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -843,7 +842,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt 
// input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +863,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,12 +877,9 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - -#include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -911,7 +907,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ +#ifdef __CUDACC__ // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1115,7 +1111,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h index 53bb5ccd94..1d1e130ec2 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h new file mode 120000 index 0000000000..ce9e1a487a --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h @@ -0,0 +1 @@ +../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc index 1bad694d1c..f1e75b9252 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,7 +12,6 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" -#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -64,7 +63,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -78,7 +77,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
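The check_sa.cc hunks below (mirroring the gu_ttxu copy above) pick the default random-number mode from the build configuration. A minimal sketch of that compile-time selection follows, with the logic wrapped in a helper function for clarity; the enum mirrors the diff and MGONGPU_HAS_NO_CURAND is the plugin's own build flag.

#include <iostream>

enum class RandomNumberMode
{
  CommonRandom = 0,
  CurandHost = 1,
  CurandDevice = 2
};

RandomNumberMode defaultRndgen()
{
#ifdef __CUDACC__
  return RandomNumberMode::CurandDevice; // GPU build: generate random numbers on the device
#elif !defined MGONGPU_HAS_NO_CURAND
  return RandomNumberMode::CurandHost; // CPU build linked against curand
#else
  return RandomNumberMode::CommonRandom; // CPU build without curand
#endif
}

int main()
{
  std::cout << "default rndgen = " << static_cast<int>( defaultRndgen() ) << std::endl;
  return 0;
}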
-#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -104,11 +103,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -116,7 +115,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -149,7 +148,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs" ); #endif } else if( arg == "--curhst" ) @@ -166,7 +165,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -240,13 +239,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -264,14 +263,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ - // --- 00. Initialise GPU - // Instantiate a GpuRuntime at the beginnining of the application's main. - // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. - const std::string cdinKey = "00 GpuInit"; + // --- 00. Initialise cuda + // Instantiate a CudaRuntime at the beginnining of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + const std::string cdinKey = "00 CudaInit"; timermap.start( cdinKey ); - GpuRuntime GpuRuntime( debug ); + CudaRuntime cudaRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -293,7 +292,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -301,7 +300,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -309,7 +308,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -317,7 +316,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -334,7 +333,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -343,7 +342,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -352,7 +351,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -360,7 +359,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -368,7 +367,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -404,7 +403,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -422,7 +421,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -433,7 +432,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -441,7 +440,7 
@@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -483,7 +482,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -515,7 +514,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -560,7 +559,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -589,7 +588,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( !bridge ) { // --- 3b. CopyDToH MEs @@ -732,19 +731,15 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; -#elif defined __HIPCC__ - rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or HIP or C++? + // -- CUDA or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; -#elif defined __HIPCC__ - wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -769,12 +764,6 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif -#elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL - wrkflwtxt += "CXS:"; -#else - wrkflwtxt += "???:"; // no path to this statement -#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -800,7 +789,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -856,7 +845,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -877,8 +866,6 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" -#elif defined __HIPCC__ - << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -905,21 +892,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "Complex type = STD::COMPLEX" << std::endl +#endif #else - << "Complex type = ???" 
<< std::endl // no path to this statement... + << "Complex type = STD::COMPLEX" << std::endl #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -950,7 +937,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1046,14 +1033,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "\"STD::COMPLEX\"," << std::endl +#endif #else - << "\"???\"," << std::endl // no path to this statement... + << "\"STD::COMPLEX\"," << std::endl #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1061,7 +1048,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc index 79abbcc4f8..da68aa9255 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
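To make the revert above concrete: the pattern being restored compiles a single translation unit either with nvcc (which defines __CUDACC__, selects the mg5amcGpu namespace, and uses the native <<<blocks, threads>>> launch syntax, as in RamboSamplingKernels.cc just below) or with a plain C++ compiler (mg5amcCpu, host loop over events). The following is a minimal self-contained sketch of that idiom, not code from the patch; the fill kernel and the block/thread sizes are invented for illustration.

    // Minimal single-source sketch: the same file compiles as CUDA or as C++.
    #include <cstdio>
    #ifdef __CUDACC__
    #include <cuda_runtime.h>
    #endif

    #ifdef __CUDACC__
    namespace mg5amcGpu
    #else
    namespace mg5amcCpu
    #endif
    {
    #ifdef __CUDACC__
      __global__ void fill( float value, float* out )
      {
        out[blockDim.x * blockIdx.x + threadIdx.x] = value; // one event per GPU thread
      }
    #else
      void fill( float value, float* out, int nevt )
      {
        for( int i = 0; i < nevt; ++i ) out[i] = value; // all events in a host loop
      }
    #endif
    }

    int main()
    {
      const int gpublocks = 2, gputhreads = 32, nevt = gpublocks * gputhreads;
      float result = 0;
    #ifdef __CUDACC__
      using namespace mg5amcGpu;
      float* d_out = nullptr;
      cudaMalloc( &d_out, nevt * sizeof( float ) );
      fill<<<gpublocks, gputhreads>>>( 1.f, d_out ); // native CUDA launch, as restored below
      cudaMemcpy( &result, d_out, sizeof( float ), cudaMemcpyDeviceToHost ); // implicit sync
      cudaFree( d_out );
    #else
      using namespace mg5amcCpu;
      float h_out[nevt];
      fill( 1.f, h_out, nevt );
      result = h_out[0];
    #endif
      printf( "first event value = %f\n", result );
      return 0;
    }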
#include "RamboSamplingKernels.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaInitial() { - gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); + getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaFinal() { - gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h index 7c214cd74b..184089efd7 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h index 21d63beeac..188a72c2c9 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index bcb73d7f01..a0397e9ecc 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,139 +89,69 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If CUDA_HOME is not set, try to set it from the location of NVCC - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif - - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
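The MADGRAPH_CUDA_ARCHITECTURE default of 70 mentioned here embeds SASS for sm_70 plus PTX for 70+, so a device with a lower compute capability needs a lower value exported explicitly. A quick way to see what the devices on a machine actually report is a hypothetical probe like the following; it is not part of the build system, just a sketch using the standard CUDA runtime API.

    #include <cstdio>
    #include <cuda_runtime.h>

    // Hypothetical helper (not in the patch): print the compute capability of
    // each visible device, to pick MADGRAPH_CUDA_ARCHITECTURE (e.g. 70 for V100).
    int main()
    {
      int ndev = 0;
      if( cudaGetDeviceCount( &ndev ) != cudaSuccess || ndev == 0 )
      {
        fprintf( stderr, "No CUDA device found\n" );
        return 1;
      }
      for( int i = 0; i < ndev; ++i )
      {
        cudaDeviceProp prop;
        cudaGetDeviceProperties( &prop, i );
        printf( "Device %d: %s, compute capability %d%d\n", i, prop.name, prop.major, prop.minor );
      }
      return 0;
    }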
- CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - - CUDATESTFLAGS = -lcuda - - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif - - # Set the host C++ compiler for GPUCC via "-ccbin " - # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) - GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif - -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif - - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 
6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +#=== Configure the CUDA compiler + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) +# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the location of nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + NVCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
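For context on the USE_NVTX flag set just above: NVTX lets the host code mark named time ranges that show up in Nsight Systems profiles, and the check_sa timers use it when the flag is enabled. A minimal sketch of the API, independent of this codebase (compile against the CUDA toolkit headers and link -lnvToolsExt; the range names below are illustrative, not the real timer keys):

    #include <nvToolsExt.h> // NVTX header shipped with the CUDA toolkit

    // Minimal sketch (not from the patch): wrap phases of work in NVTX ranges
    // so they appear as named intervals in an Nsight Systems timeline.
    static double doWork()
    {
      double sum = 0.;
      for( int i = 0; i < 1000000; ++i ) sum += 1e-6 * i;
      return sum;
    }

    int main()
    {
      nvtxRangePushA( "phase 1" ); // open a named range
      volatile double r1 = doWork();
      nvtxRangePop(); // close it
      nvtxRangePushA( "phase 2" );
      volatile double r2 = doWork();
      nvtxRangePop();
      return ( r1 + r2 > 0 ) ? 0 : 1;
    }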
+ CUOPTFLAGS = -lineinfo + CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) +else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) +else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override NVCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= +endif - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif +# Set the host C++ compiler for nvcc via "-ccbin " +# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) +CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) +# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) +ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) +CUFLAGS += -allow-unsupported-compiler endif - #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -233,9 +163,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) - override GPUCC:=ccache $(GPUCC) +ifneq ($(NVCC),) + ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) + override NVCC:=ccache $(NVCC) endif endif @@ -259,7 +189,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - GPUFLAGS+= -Xcompiler -mno-float128 + CUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -269,10 +199,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -323,10 +253,7 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + ifeq ($(NVCC),) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -401,13 +328,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -416,7 +343,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + CUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -425,7 +352,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + CUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -477,11 +404,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -494,7 +421,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) +ifneq ($(NVCC),) 
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -525,16 +452,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -543,14 +469,11 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) -# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifeq ($(findstring nvcc,$(GPUCC)),nvcc) - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math -else - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math +ifneq ($(NVCC),) +$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math endif endif @@ -566,10 +489,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(GPUCC),) -GPUFLAGS += -Wno-deprecated-builtins +ifneq ($(NVCC),) +CUFLAGS += -Xcompiler -Wno-deprecated-builtins endif endif @@ -577,8 +500,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(NVCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -605,7 +528,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -617,11 +540,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -638,16 +561,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
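The FPTYPE=d/f/m switch handled a few hunks above simply forwards MGONGPU_FPTYPE(2)_* defines to both CXXFLAGS and CUFLAGS. A simplified sketch of what those defines select (the real typedefs live in the src headers; the fallback logic here is an assumption of this sketch, with fptype the main precision and fptype2 the color-algebra precision):

    // Simplified sketch of the FPTYPE=d/f/m macro plumbing (not the real header).
    #if !defined MGONGPU_FPTYPE_DOUBLE && !defined MGONGPU_FPTYPE_FLOAT
    #define MGONGPU_FPTYPE_DOUBLE 1 // default for this sketch: mimic FPTYPE=m
    #define MGONGPU_FPTYPE2_FLOAT 1
    #endif

    #ifdef MGONGPU_FPTYPE_DOUBLE
    typedef double fptype;
    #else
    typedef float fptype;
    #endif
    #if defined MGONGPU_FPTYPE2_FLOAT
    typedef float fptype2; // single precision color algebra (mixed mode 'm')
    #elif defined MGONGPU_FPTYPE2_DOUBLE
    typedef double fptype2;
    #else
    typedef fptype fptype2; // fall back to the main precision
    #endif

    #include <cstdio>
    int main()
    {
      printf( "sizeof(fptype)=%zu sizeof(fptype2)=%zu\n", sizeof( fptype ), sizeof( fptype2 ) );
      return 0;
    }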
#------------------------------------------------------------------------------- @@ -673,17 +596,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -695,7 +618,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -708,7 +631,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -720,12 +643,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -749,14 +672,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(GPUCC),) # link only runTest.o +ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -859,9 +782,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo GPUCC=$(GPUCC) -ifneq ($(GPUCC),) - $(GPUCC) --version + @echo NVCC=$(NVCC) +ifneq ($(NVCC),) + $(NVCC) --version endif @echo "" @echo CXX=$(CXX) @@ -880,7 +803,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(GPUCC),) +ifneq ($(NVCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc index 2b956730d4..f93c05b0b3 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::setUp(); +#ifdef __CUDACC__ + CudaRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? 
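The fbridgecreate_/fbridgedelete_ pair above is the usual opaque-handle pattern for driving C++ objects from Fortran: Fortran holds only a pointer-sized token, and the C++ side validates it with dynamic_cast before use. A stripped-down sketch of the same pattern (the names and the missing middle layers are simplified inventions for this sketch):

    #include <cstdio>
    #include <stdexcept>

    // Stripped-down sketch of the fbridge opaque-handle pattern (simplified names).
    struct CppObjectInFortran { virtual ~CppObjectInFortran() {} };
    struct MiniBridge : CppObjectInFortran
    {
      explicit MiniBridge( int nevt ) : m_nevt( nevt ) {}
      int m_nevt;
    };

    extern "C"
    {
      void minibridgecreate_( CppObjectInFortran** ppbridge, const int* pnevt )
      {
        *ppbridge = new MiniBridge( *pnevt ); // Fortran stores this as an opaque token
      }
      void minibridgedelete_( CppObjectInFortran** ppbridge )
      {
        MiniBridge* pbridge = dynamic_cast<MiniBridge*>( *ppbridge );
        if( pbridge == 0 ) throw std::runtime_error( "minibridgedelete_: invalid Bridge address" );
        delete pbridge;
      }
    }

    int main() // exercise the C API exactly as the Fortran caller would
    {
      CppObjectInFortran* handle = nullptr;
      int nevt = 16;
      minibridgecreate_( &handle, &nevt );
      printf( "bridge created for %d events\n", nevt );
      minibridgedelete_( &handle );
      return 0;
    }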
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::tearDown(); +#ifdef __CUDACC__ + CudaRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc index 3743934f41..2fb445372d 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc index 461ec5c3a5..572e28aaea 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
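The fsampler class in the hunk above keeps an iteration counter whose only job is to derive a fresh random seed per call, while the same host buffers are reused across iterations. A toy version of that design, with the buffer and RNG types reduced to standard C++ (the class name and seed offset are inventions of this sketch):

    #include <cstdio>
    #include <random>
    #include <vector>

    // Toy version of the fsampler design: fixed-size host buffers reused on
    // every iteration, with the iteration counter driving the random seed.
    class MiniSampler
    {
    public:
      explicit MiniSampler( int nevt ) : m_nevt( nevt ), m_iiter( 0 ), m_hstRndmom( nevt ) {}
      void samplerSequence()
      {
        std::mt19937 rng( 1337 + m_iiter ); // new seed per iteration, reproducible across runs
        std::uniform_real_distribution<double> flat( 0., 1. );
        for( double& r : m_hstRndmom ) r = flat( rng );
        ++m_iiter;
      }
      double first() const { return m_hstRndmom[0]; }
    private:
      const int m_nevt;                // number of events in each iteration
      int m_iiter;                     // iteration counter (for random number seeding)
      std::vector<double> m_hstRndmom; // stands in for HostBufferRndNumMomenta
    };

    int main()
    {
      MiniSampler s( 8 );
      s.samplerSequence();
      printf( "iter 0, first random = %f\n", s.first() );
      s.samplerSequence();
      printf( "iter 1, first random = %f\n", s.first() );
      return 0;
    }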
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc index 2bd7a9fcf9..989aba1fdc 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc index 6e8657edca..4243e9fcec 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h index 9b0bfb10ee..e15ce959e9 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc index 459dae9e99..7255e49119 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h index db5520aa96..c935779eb3 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h @@ -211,7 +211,7 @@ namespace Parameters_sm_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -238,7 +238,7 @@ namespace Parameters_sm_dependentCouplings // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -254,7 +254,7 @@ namespace Parameters_sm_independentCouplings //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h index d4e37d19b3..6c0c4919e9 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,26 +10,13 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL -// Is this a GPU (CUDA, HIP) or CPU implementation? -#ifdef __CUDACC__ -#define MGONGPUCPP_GPUIMPL cuda -#elif defined __HIPCC__ -#define MGONGPUCPP_GPUIMPL hip -#else -#undef MGONGPUCPP_GPUIMPL -#endif - // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For CUDA, by default, it is supported -// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND -#elif defined __HIPCC__ -#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -65,28 +52,23 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#ifndef __CUDACC__ +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) +#endif + +// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) -#elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#else -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default in CUDA +#undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 -#else -#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -102,21 +84,17 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (CUDA complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA -#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA -#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA +// SANITY CHECKS (c++ complex number implementation) +#ifndef __CUDACC__ +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL #endif #endif -// SANITY CHECKS (C++ complex number implementation) -#ifndef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ +// SANITY CHECKS (cuda complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif 
#endif @@ -153,7 +131,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -164,7 +142,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD +#ifdef __CUDACC__ // CUDA implementation has no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -194,9 +172,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -208,8 +186,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA/HIP declaration specifiers for C++ -#ifndef MGONGPUCPP_GPUIMPL +// Define empty CUDA declaration specifiers for C++ +#ifndef __CUDACC__ #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h index 4e7ab03fa2..0cb2f1db7e 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
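The complex-type macros reshuffled above reduce, per build flavour, to a single cxtype typedef. A condensed, self-contained view of that selection (only the default THRUST and STDCOMPLEX flavours are shown here; the cucomplex and cxsmpl branches are elided so the sketch compiles anywhere):

    #include <cstdio>
    #ifdef __CUDACC__
    #include <thrust/complex.h>
    typedef thrust::complex<double> cxtype; // the MGONGPU_CUCXTYPE_THRUST flavour (CUDA default)
    #else
    #include <complex>
    typedef std::complex<double> cxtype; // the MGONGPU_CPPCXTYPE_STDCOMPLEX flavour
    #endif

    int main()
    {
      const cxtype c( 1., 2. );
      printf( "c = (%f, %f), sizeof(cxtype) = %zu\n", c.real(), c.imag(), sizeof( c ) );
      return 0;
    }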
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h index 6f6cee64d6..a1cde16a67 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
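The cxmake overloads visible throughout the header above form a small factory family: any representation of a complex number (either std::complex precision, or cxtype itself) converts to the active cxtype through one uniform call. Its essence, with cxtype pinned to std::complex<double> so the sketch compiles anywhere:

    #include <complex>
    #include <cstdio>

    // Essence of the cxmake overload family: one factory name, many input types,
    // always returning the active cxtype (fixed here to std::complex<double>).
    typedef std::complex<double> cxtype;

    inline cxtype cxmake( const double r, const double i ) { return cxtype( r, i ); }
    inline cxtype cxmake( const cxtype& c ) { return c; } // pass-through
    inline cxtype cxmake( const std::complex<float>& c )  // float-to-double conversion
    {
      return cxmake( c.real(), c.imag() );
    }

    int main()
    {
      const std::complex<float> cf( 1.f, 2.f );
      const cxtype c = cxmake( cf );
      printf( "c = (%f, %f)\n", c.real(), c.imag() );
      return 0;
    }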
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h index 7904b93c61..9d3e82b1e3 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gq_ttq.sa/src/rambo.h b/epochX/cudacpp/gq_ttq.sa/src/rambo.h index cd7e1008ea..e02ea52496 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/rambo.h +++ b/epochX/cudacpp/gq_ttq.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
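The *_sv ("scalar or vector") typedefs changed above carry the whole SIMD abstraction: CUDA builds process one event per GPU thread, so neppV collapses to 1 and fptype_sv is a plain scalar, while SIMD C++ builds make it a gcc/clang vector-extension type with neppV lanes. A compressed illustration (the neppV=4 AVX2-double case is an assumption of this sketch, and the vector branch requires gcc or clang):

    #include <cstdio>

    // Compressed illustration of the scalar-or-vector typedefs.
    #ifdef __CUDACC__
    const int neppV = 1;
    typedef double fptype_sv; // scalar in CUDA: one event per GPU thread
    #else
    const int neppV = 4; // e.g. 4 doubles per AVX2 register (assumption for this sketch)
    typedef double fptype_v __attribute__( ( vector_size( neppV * sizeof( double ) ) ) );
    typedef fptype_v fptype_sv; // vector in SIMD C++: neppV events per operation
    #endif

    int main()
    {
    #ifdef __CUDACC__
      fptype_sv x = 1.;
      printf( "scalar build: x = %f\n", x );
    #else
      fptype_sv x = { 1., 2., 3., 4. }; // one value per SIMD lane
      x = x + x;                        // operates on all neppV lanes at once
      printf( "vector build: lane0 = %f, lane3 = %f\n", x[0], x[3] );
    #endif
      return 0;
    }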
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT b/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT index 84a883fbb0..a134b5fef9 100644 --- a/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT +++ b/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT @@ -15,7 +15,6 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h index c04628dfd1..4cafe0c997 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
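The rambo.h hunks above (and nearly every file in this series) rely on the same dual-namespace idiom: one source tree compiled twice, into mg5amcGpu by nvcc and into mg5amcCpu by the host C++ compiler. A minimal sketch, with a hypothetical function as filler:

// Sketch of the dual-namespace idiom used throughout these files
#ifdef __CUDACC__
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  // hypothetical placeholder: real files declare kernels, buffers and classes here
  inline int nFinalParticles( const int npar ) { return npar - 2; } // 2 -> N processes
}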
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) mg5amcGpu::DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
     mg5amcCpu::CPPProcess process( /*verbose=*/false );
     m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPUCPP_GPUIMPL
+#endif // __CUDACC__
     process.initProc( "../../Cards/param_card.dat" );
   }

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  template<typename FORTRANFPTYPE>
  void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads )
  {
@@ -268,7 +268,7 @@
  }
#endif

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  template<typename FORTRANFPTYPE>
  void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta,
                                            const FORTRANFPTYPE* gs,
@@ -283,14 +283,14 @@
    constexpr int neppM = MemoryAccessMomenta::neppM;
    if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
    {
-      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
+      checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
    }
    else
    {
-      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
+      checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
      const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
      //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
-      gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+      dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
    }
    if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
    {
@@ -333,7 +333,7 @@
  }
#endif

-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
  template<typename FORTRANFPTYPE>
  void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta,
                                            const FORTRANFPTYPE* gs,
@@ -388,7 +388,7 @@
  // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
  //
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  template<typename Tin, typename Tout>
  __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
  {
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc
index 90c7f2d3b8..cef4cb3c71 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc
@@ -1,11 +1,10 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
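The gpu_sequence hunks above drop the gpuMemcpy/gpuLaunchKernel portability wrappers in favour of raw CUDA runtime calls plus triple-chevron launches. A self-contained sketch of that idiom with a hypothetical scale kernel (the checkCuda/assertCuda pair mirrors the helper that CudaRuntime.h defines later in this patch):

// Sketch: raw CUDA launch + error checking, as restored in Bridge.h above
#include <cassert>
#include <cstdio>
#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
inline void assertCuda( cudaError_t code, const char* file, int line )
{
  if( code != cudaSuccess )
  {
    printf( "ERROR! assertCuda: '%s' in %s:%d\n", cudaGetErrorString( code ), file, line );
    assert( code == cudaSuccess );
  }
}
__global__ void scale( double* data, const double factor, const int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one element per thread
  if( ievt < nevt ) data[ievt] *= factor;
}
int main()
{
  const int nevt = 1024, gputhreads = 256, gpublocks = nevt / gputhreads;
  double* devData = nullptr;
  checkCuda( cudaMalloc( &devData, nevt * sizeof( double ) ) );
  checkCuda( cudaMemset( devData, 0, nevt * sizeof( double ) ) );
  scale<<<gpublocks, gputhreads>>>( devData, 2., nevt );
  checkCuda( cudaPeekAtLastError() );   // surface launch errors early
  checkCuda( cudaDeviceSynchronize() ); // wait for the kernel to finish
  checkCuda( cudaFree( devData ) );
  return 0;
}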
#include "BridgeKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include @@ -15,7 +14,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -46,7 +45,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -97,7 +96,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h index 3efef8ce97..15eb4bff4d 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc index 010bc4cbd0..985b39f576 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,16 +1,15 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" -#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc index c15b39844d..0b355a3c8d 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

#include "CrossSectionKernels.h"

-#include "GpuAbstraction.h"
#include "MemoryAccessMatrixElements.h"
#include "MemoryAccessWeights.h"
#include "MemoryBuffers.h"
@@ -78,7 +77,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL )
//============================================================================
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -186,7 +185,7 @@
//============================================================================
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
namespace mg5amcGpu
{
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h
index 4d9659e04e..7933ca4bbf 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

#ifndef CROSSSECTIONKERNELS_H
#define CROSSSECTIONKERNELS_H 1
@@ -13,7 +13,7 @@
//============================================================================
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -96,7 +96,7 @@ namespace mg5amcCpu
//--------------------------------------------------------------------------
/*
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  // A class encapsulating the calculation of event statistics on a GPU device
  class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents
  {
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h
new file mode 100644
index 0000000000..64ce52f4b3
--- /dev/null
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef MG5AMC_CUDARUNTIME_H
+#define MG5AMC_CUDARUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include <cassert>
+#include <iostream>
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef __CUDACC__ /* clang-format off */
+#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
+inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != cudaSuccess )
+  {
+    printf( "ERROR!
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == cudaSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+namespace mg5amcGpu
+{
+  // Instantiate a CudaRuntime at the beginning of the application's main to
+  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct CudaRuntime final
+  {
+    CudaRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~CudaRuntime() { tearDown( m_debug ); }
+    CudaRuntime( const CudaRuntime& ) = delete;
+    CudaRuntime( CudaRuntime&& ) = delete;
+    CudaRuntime& operator=( const CudaRuntime& ) = delete;
+    CudaRuntime& operator=( CudaRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl;
+      checkCuda( cudaSetDevice( 0 ) ); // SLOW!
+    }
+
+    // Tear down CUDA application (call cudaDeviceReset)
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck
+    // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
+    static void tearDown( const bool debug = true )
+    {
+      if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl;
+      checkCuda( cudaDeviceReset() );
+    }
+  };
+
+}
+#endif
+
+//--------------------------------------------------------------------------
+
+#endif // MG5AMC_CUDARUNTIME_H
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc
index 38c477c17a..eb56333b03 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc
@@ -1,9 +1,9 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A.
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h index b425a5bade..48b51e0a49 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h index d2ff326e20..fd7734ce42 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..30257195b6 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

#include "MatrixElementKernels.h"

#include "CPPProcess.h"
-#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation
+#include "CudaRuntime.h"
#include "MemoryAccessMomenta.h"
#include "MemoryBuffers.h"
@@ -14,7 +14,7 @@
//============================================================================
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
namespace mg5amcCpu
{
@@ -143,7 +143,7 @@ namespace mg5amcCpu
//============================================================================
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
namespace mg5amcGpu
{
@@ -202,13 +202,13 @@
    PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
    DeviceBufferHelicityMask devIsGoodHel( ncomb );
    // ... 0d1. Compute good helicity mask on the device
-    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
+    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
#else
-    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
#endif
-    checkGpu( gpuPeekAtLastError() );
+    checkCuda( cudaPeekAtLastError() );
    // ... 0d2. Copy back good helicity mask to the host
    copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
    // ... 0d3.
Copy back good helicity list to constant memory on the device
@@ -219,19 +219,19 @@
  void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
  {
-    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
+    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
#ifndef MGONGPU_NSIGHT_DEBUG
    constexpr unsigned int sharedMemSize = 0;
#else
    constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
#else
-    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
#endif
-    checkGpu( gpuPeekAtLastError() );
-    checkGpu( gpuDeviceSynchronize() );
+    checkCuda( cudaPeekAtLastError() );
+    checkCuda( cudaDeviceSynchronize() );
  }

//--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h
index 72bd8f195b..23e84757a2 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

#ifndef MATRIXELEMENTKERNELS_H
#define MATRIXELEMENTKERNELS_H 1
@@ -10,7 +10,7 @@

#include "MemoryBuffers.h"

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -81,7 +81,7 @@ namespace mg5amcCpu
//--------------------------------------------------------------------------
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
  // A class encapsulating matrix element calculations on a CPU host
  class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents
  {
@@ -130,7 +130,7 @@ namespace mg5amcCpu
//--------------------------------------------------------------------------
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  // A class encapsulating matrix element calculations on a GPU device
  class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
  {
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h
index db73e4e064..c82a6c7635 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h index 38fade09fb..0ac4faa3c7 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -12,7 +12,7 @@ #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -29,7 +29,7 @@ class MemoryAccessMomentaBase //_AOSOAv1 // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h index 40cb089135..e2988d39f3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
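The KernelAccessHelper and neppM hunks above both concern the AOSOA momenta layout quoted earlier in this series as momenta[npagM][npar][np4][neppM] with nevt = npagM*neppM. A hedged sketch of the index arithmetic behind that layout (function and parameter names are illustrative only):

// Sketch: flat AOSOA index for momenta[npagM][npar][np4][neppM]
inline int aosoaIndex( const int ievt, const int ipar, const int ip4,
                       const int npar, const int np4, const int neppM )
{
  const int ipagM = ievt / neppM; // memory page holding this event
  const int ieppM = ievt % neppM; // position of the event within the page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}

With neppM events per page, consecutive threads (consecutive ievt within one page) read consecutive fptype's, which is what makes the global-memory access coalesced on GPUs.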
#ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h index 08faccff0f..e9b197368e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h index 78004e66cc..8109470148 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
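The MemoryBuffers.h diff that follows restores direct cudaMallocHost/cudaMalloc calls in the pinned-host and device buffer classes. A minimal sketch of why the pinned variant exists (compare the PR #45 note further down: cudaMemcpy from a pageable host array goes through an intermediate copy to pinned memory first); the simplified checkCuda macro is a stand-in for the CudaRuntime.h helper:

// Sketch: pinned (page-locked) host memory enables direct DMA transfers
#include <cassert>
#include <cstdio>
#define checkCuda( code ) { const cudaError_t err = ( code ); assert( err == cudaSuccess ); (void)err; }
int main()
{
  const size_t bytes = 1 << 20;
  double* hstPinned = nullptr;
  double* devData = nullptr;
  checkCuda( cudaMallocHost( &hstPinned, bytes ) ); // pinned host buffer
  checkCuda( cudaMalloc( &devData, bytes ) );       // device buffer
  checkCuda( cudaMemcpy( devData, hstPinned, bytes, cudaMemcpyHostToDevice ) ); // no staging copy
  checkCuda( cudaFreeHost( hstPinned ) );
  checkCuda( cudaFree( devData ) );
  printf( "copied %zu bytes via pinned memory\n", bytes );
  return 0;
}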
#ifndef MemoryBuffers_H
#define MemoryBuffers_H 1
@@ -11,12 +11,12 @@
#include "mgOnGpuCxtypes.h"

#include "CPPProcess.h"
-#include "GpuRuntime.h"
+#include "CudaRuntime.h"
#include "Parameters_heft.h"

#include <sstream>

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -87,7 +87,7 @@ namespace mg5amcCpu
//--------------------------------------------------------------------------
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
  constexpr bool HostBufferALIGNED = false;   // ismisaligned=false
  constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true
@@ -119,7 +119,7 @@ namespace mg5amcCpu
//--------------------------------------------------------------------------
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  // A class encapsulating a CUDA pinned host buffer
  template<typename T>
  class PinnedHostBufferBase : public BufferBase<T>
  {
    PinnedHostBufferBase( const size_t size )
      : BufferBase<T>( size, false )
    {
-      gpuMallocHost( &( this->m_data ), this->bytes() );
+      checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) );
    }
    virtual ~PinnedHostBufferBase()
    {
-      gpuFreeHost( this->m_data );
+      checkCuda( cudaFreeHost( this->m_data ) );
    }
  };
#endif

//--------------------------------------------------------------------------
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  // A class encapsulating a CUDA device buffer
  template<typename T>
  class DeviceBufferBase : public BufferBase<T>
  {
    DeviceBufferBase( const size_t size )
      : BufferBase<T>( size, true )
    {
-      gpuMalloc( &( this->m_data ), this->bytes() );
+      checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) );
    }
    virtual ~DeviceBufferBase()
    {
-      gpuFree( this->m_data );
+      checkCuda( cudaFree( this->m_data ) );
    }
  };
#endif

//--------------------------------------------------------------------------
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
  // A class encapsulating a C++ host buffer for a given number of events
  template<typename T, size_t sizePerEvent, bool ismisaligned>
  class HostBuffer : public HostBufferBase<T, ismisaligned>, virtual private NumberOfEvents
@@ -175,7 +175,7 @@ namespace mg5amcCpu
//--------------------------------------------------------------------------
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  // A class encapsulating a CUDA pinned host buffer for a given number of events
  template<typename T, size_t sizePerEvent>
  class PinnedHostBuffer : public PinnedHostBufferBase<T>, virtual private NumberOfEvents
@@ -191,7 +191,7 @@ namespace mg5amcCpu
//--------------------------------------------------------------------------
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  // A class encapsulating a CUDA device buffer for a given number of events
  template<typename T, size_t sizePerEvent>
  class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents
@@ -213,7 +213,7 @@ namespace mg5amcCpu
  // The size (number of elements) per event in a memory buffer for momenta random numbers
  constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf;
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
  // A class encapsulating a C++ host buffer for momenta random numbers
  typedef HostBuffer<fptype, sizePerEventRndNumMomenta, HostBufferALIGNED> HostBufferRndNumMomenta;
#else
@@ -232,7 +232,7 @@ namespace mg5amcCpu
  // The size (number of elements) per event in a memory buffer with ONE fptype per event
  constexpr size_t sizePerEventOneFp = 1;
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
  // A class encapsulating a C++ host buffer with ONE fptype per event
  typedef HostBuffer<fptype, sizePerEventOneFp, HostBufferALIGNED> HostBufferOneFp;
#else
@@ -257,7 +257,7 @@ namespace mg5amcCpu
  // The size (number of elements) per event in a memory buffer for Gs
  constexpr size_t
sizePerEventGs = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 
+457,7 @@ namespace mg5amcCpu
  // The size (number of elements) per event in a memory buffer for helicity selection
  constexpr size_t sizePerEventSelectedHelicity = 1;
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
  // A class encapsulating a C++ host buffer for helicity selection
  typedef HostBuffer<int, sizePerEventSelectedHelicity, HostBufferALIGNED> HostBufferSelectedHelicity;
#else
@@ -475,7 +475,7 @@ namespace mg5amcCpu
  // The size (number of elements) per event in a memory buffer for color selection
  constexpr size_t sizePerEventSelectedColor = 1;
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
  // A class encapsulating a C++ host buffer for color selection
  typedef HostBuffer<int, sizePerEventSelectedColor, HostBufferALIGNED> HostBufferSelectedColor;
#else
@@ -487,7 +487,7 @@ namespace mg5amcCpu
//--------------------------------------------------------------------------
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  template<class Tdst, class Tsrc>
  void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy
  {
@@ -504,13 +504,13 @@ namespace mg5amcCpu
      throw std::runtime_error( sstr.str() );
    }
    // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is not a pinned host array
-    gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice );
+    checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) );
  }
#endif

//--------------------------------------------------------------------------
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  template<class Tdst, class Tsrc>
  void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy
  {
@@ -527,7 +527,7 @@ namespace mg5amcCpu
      throw std::runtime_error( sstr.str() );
    }
    // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is not a pinned host array
-    gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost );
+    checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) );
  }
#endif
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc
index c3fcba5970..e0c90c3bc7 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc
@@ -4,7 +4,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
//==========================================================================
// This file has been automatically generated for CUDA/C++ standalone by
// MadGraph5_aMC@NLO v.
3.5.0_lo_vect, 2023-06-09 @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "CudaRuntime.h" #include "HelAmps_heft.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -45,7 +46,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1 -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -79,7 +80,7 @@ namespace mg5amcCpu //__device__ const fptype* cIPD = nullptr; // unused as nparam=0 __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //__device__ __constant__ fptype* cIPD = nullptr; // unused as nparam=0 __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -89,7 +90,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -117,13 +118,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -150,7 +151,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -186,7 +187,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -199,10 +200,8 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop -#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -269,7 +268,7 @@ namespace mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 2 } }; // 2-D array[1][1] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -326,7 +325,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined 
MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -385,7 +384,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -420,8 +419,8 @@ namespace mg5amcCpu { -1, 1, 0 }, { 1, -1, 0 }, { 1, 1, 0 } }; -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -460,9 +459,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory //const fptype tIPD[0] = { ... }; // nparam=0 //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef MGONGPUCPP_GPUIMPL - //gpuMemcpyToSymbol( cIPD, tIPD, 0 * sizeof( fptype ) ); // nparam=0 - //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 +#ifdef __CUDACC__ + //checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 0 * sizeof( fptype ) ) ); // nparam=0 + //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 #else //memcpy( cIPD, tIPD, 0 * sizeof( fptype ) ); // nparam=0 //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -496,7 +495,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] + // [Use __NVCC__ instead of __CUDACC__ here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -561,12 +560,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -587,7 +586,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -716,9 +715,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); - gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); + checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -742,7 +741,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -763,7 +762,7 @@ 
namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -777,12 +776,9 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - -#include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -810,7 +806,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ +#ifdef __CUDACC__ // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1014,7 +1010,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h index 3febdd5abe..1210ee05bc 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h new file mode 120000 index 0000000000..ce9e1a487a --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h @@ -0,0 +1 @@ +../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc index 1bad694d1c..f1e75b9252 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,7 +12,6 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" -#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -64,7 +63,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -78,7 +77,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
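One idiom from the CPPProcess.cc hunks above that is easy to miss in the noise: cHel lives in __constant__ device memory and is filled with raw cudaMemcpyToSymbol calls. A minimal sketch of that idiom (the 4x2 table size is illustrative, not the real ncomb x npar; simplified checkCuda stand-in):

// Sketch: filling __constant__ device memory, as for cHel/cNGoodHel above
#include <cassert>
#define checkCuda( code ) { const cudaError_t err = ( code ); assert( err == cudaSuccess ); (void)err; }
__device__ __constant__ short cHelSketch[4][2]; // illustrative sizes only
int main()
{
  const short tHel[4][2] = { { -1, -1 }, { -1, 1 }, { 1, -1 }, { 1, 1 } };
  checkCuda( cudaMemcpyToSymbol( cHelSketch, tHel, 4 * 2 * sizeof( short ) ) );
  return 0;
}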
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  using namespace mg5amcGpu;
#else
  using namespace mg5amcCpu;
@@ -104,11 +103,11 @@ main( int argc, char** argv )
    CurandDevice = 2
  };
#ifdef __CUDACC__
-  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU
+  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU
#elif not defined MGONGPU_HAS_NO_CURAND
  RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
#else
-  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand
#endif
  // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
  enum class RamboSamplingMode
  {
@@ -116,7 +115,7 @@ main( int argc, char** argv )
    RamboHost = 1,
    RamboDevice = 2
  };
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
@@ -149,7 +148,7 @@ main( int argc, char** argv )
#ifdef __CUDACC__
      rndgen = RandomNumberMode::CurandDevice;
#else
-      throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
#endif
    }
    else if( arg == "--curhst" )
    {
@@ -166,7 +165,7 @@ main( int argc, char** argv )
    }
    else if( arg == "--rmbdev" )
    {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
      rmbsmp = RamboSamplingMode::RamboDevice;
#else
      throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -240,13 +239,13 @@ main( int argc, char** argv )
    return usage( argv[0] );
  }

-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
#ifdef _OPENMP
  ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
#endif
#endif

-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
  // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
  // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
  // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -264,14 +263,14 @@ main( int argc, char** argv )
  // === STEP 0 - INITIALISE

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__

-  // --- 00. Initialise GPU
-  // Instantiate a GpuRuntime at the beginnining of the application's main.
-  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
-  const std::string cdinKey = "00 GpuInit";
+  // --- 00. Initialise cuda
+  // Instantiate a CudaRuntime at the beginning of the application's main to
+  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
+  const std::string cdinKey = "00 CudaInit";
  timermap.start( cdinKey );
-  GpuRuntime GpuRuntime( debug );
+  CudaRuntime cudaRuntime( debug );
#endif

  // --- 0a.
Initialise physics process
@@ -293,7 +292,7 @@ main( int argc, char** argv )
 timermap.start( alloKey );
 // Memory buffers for random numbers for momenta
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 HostBufferRndNumMomenta hstRndmom( nevt );
 #else
 PinnedHostBufferRndNumMomenta hstRndmom( nevt );
@@ -301,7 +300,7 @@ main( int argc, char** argv )
 #endif
 // Memory buffers for sampling weights
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 HostBufferWeights hstWeights( nevt );
 #else
 PinnedHostBufferWeights hstWeights( nevt );
@@ -309,7 +308,7 @@ main( int argc, char** argv )
 #endif
 // Memory buffers for momenta
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 HostBufferMomenta hstMomenta( nevt );
 #else
 PinnedHostBufferMomenta hstMomenta( nevt );
@@ -317,7 +316,7 @@ main( int argc, char** argv )
 #endif
 // Memory buffers for Gs
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 HostBufferGs hstGs( nevt );
 #else
 PinnedHostBufferGs hstGs( nevt );
@@ -334,7 +333,7 @@ main( int argc, char** argv )
 }
 // Memory buffers for matrix elements
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 HostBufferMatrixElements hstMatrixElements( nevt );
 #else
 PinnedHostBufferMatrixElements hstMatrixElements( nevt );
@@ -343,7 +342,7 @@ main( int argc, char** argv )
 // Memory buffers for random numbers for helicity selection
 // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) ***
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 HostBufferRndNumHelicity hstRndHel( nevt );
 #else
 PinnedHostBufferRndNumHelicity hstRndHel( nevt );
@@ -352,7 +351,7 @@ main( int argc, char** argv )
 // Memory buffers for random numbers for color selection
 // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) ***
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 HostBufferRndNumColor hstRndCol( nevt );
 #else
 PinnedHostBufferRndNumColor hstRndCol( nevt );
@@ -360,7 +359,7 @@ main( int argc, char** argv )
 #endif
 // Memory buffers for helicity selection
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 HostBufferSelectedHelicity hstSelHel( nevt );
 #else
 PinnedHostBufferSelectedHelicity hstSelHel( nevt );
@@ -368,7 +367,7 @@ main( int argc, char** argv )
 #endif
 // Memory buffers for color selection
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 HostBufferSelectedColor hstSelCol( nevt );
 #else
 PinnedHostBufferSelectedColor hstSelCol( nevt );
@@ -404,7 +403,7 @@ main( int argc, char** argv )
 #else
 else
 {
- throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement)
+ throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
 }
 #endif
 #else
@@ -422,7 +421,7 @@ main( int argc, char** argv )
 }
 else
 {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) );
 #else
 throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
@@ -433,7 +432,7 @@ main( int argc, char** argv )
 std::unique_ptr<MatrixElementKernelBase> pmek;
 if( !bridge )
 {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) );
 #else
 pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -441,7 +440,7
@@ main( int argc, char** argv )
 }
 else
 {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) );
 #else
 pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -483,7 +482,7 @@ main( int argc, char** argv )
 prnk->generateRnarray();
 //std::cout << "Got random numbers" << std::endl;
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
 {
 // --- 1c. Copy rndmom from host to device
@@ -515,7 +514,7 @@ main( int argc, char** argv )
 prsk->getMomentaFinal();
 //std::cout << "Got final momenta" << std::endl;
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 if( rmbsmp == RamboSamplingMode::RamboDevice )
 {
 // --- 2c. CopyDToH Weights
@@ -560,7 +559,7 @@ main( int argc, char** argv )
 dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F();
 }
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 // --- 2d. CopyHToD Momenta
 const std::string gKey = "0.. CpHTDg";
 rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER!
@@ -589,7 +588,7 @@ main( int argc, char** argv )
 wv3atime += timermap.stop(); // calc only
 wavetime += wv3atime; // calc plus copy
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 if( !bridge )
 {
 // --- 3b. CopyDToH MEs
@@ -732,19 +731,15 @@ main( int argc, char** argv )
 rndgentxt = "CURAND DEVICE";
 #ifdef __CUDACC__
 rndgentxt += " (CUDA code)";
-#elif defined __HIPCC__
- rndgentxt += " (HIP code)";
 #else
 rndgentxt += " (C++ code)";
 #endif
 // Workflow description summary
 std::string wrkflwtxt;
- // -- CUDA or HIP or C++?
+ // -- CUDA or C++?
 #ifdef __CUDACC__
 wrkflwtxt += "CUD:";
-#elif defined __HIPCC__
- wrkflwtxt += "HIP:";
 #else
 wrkflwtxt += "CPP:";
 #endif
@@ -769,12 +764,6 @@ main( int argc, char** argv )
 #else
 wrkflwtxt += "???:"; // no path to this statement
 #endif
-#elif defined __HIPCC__
-#if defined MGONGPU_CUCXTYPE_CXSMPL
- wrkflwtxt += "CXS:";
-#else
- wrkflwtxt += "???:"; // no path to this statement
-#endif
 #else
 #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX
 wrkflwtxt += "STX:";
@@ -800,7 +789,7 @@ main( int argc, char** argv )
 wrkflwtxt += "RMBDEV+";
 else
 wrkflwtxt += "??????+"; // no path to this statement
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE?
 if( !bridge )
 wrkflwtxt += "MESDEV";
@@ -856,7 +845,7 @@ main( int argc, char** argv )
 if( perf )
 {
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 #ifdef _OPENMP
 // Get the output of "nproc --all" (https://stackoverflow.com/a/478960)
 std::string nprocall;
@@ -877,8 +866,6 @@ main( int argc, char** argv )
 std::cout << std::string( SEP79, '*' ) << std::endl
 #ifdef __CUDACC__
 << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA"
-#elif defined __HIPCC__
- << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP"
 #else
 << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP"
 #endif
@@ -905,21 +892,21 @@ main( int argc, char** argv )
 #elif defined MGONGPU_FPTYPE_FLOAT
 << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl
 #endif
+#ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
 << "Complex type = CUCOMPLEX" << std::endl
 #elif defined MGONGPU_CUCXTYPE_THRUST
 << "Complex type = THRUST::COMPLEX" << std::endl
-#elif defined MGONGPU_CUCXTYPE_CXSMPL
- << "Complex type = STD::COMPLEX" << std::endl
+#endif
 #else
- << "Complex type = ???"
<< std::endl // no path to this statement... + << "Complex type = STD::COMPLEX" << std::endl #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -950,7 +937,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1046,14 +1033,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "\"STD::COMPLEX\"," << std::endl +#endif #else - << "\"???\"," << std::endl // no path to this statement... + << "\"STD::COMPLEX\"," << std::endl #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1061,7 +1048,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc index 79abbcc4f8..da68aa9255 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
+// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
#include "RamboSamplingKernels.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaInitial() { - gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); + getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaFinal() { - gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h index 7c214cd74b..184089efd7 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h index 21d63beeac..188a72c2c9 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk index bcb73d7f01..a0397e9ecc 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -89,139 +89,69 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If CUDA_HOME is not set, try to set it from the location of NVCC - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif - - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - - CUDATESTFLAGS = -lcuda - - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif - - # Set the host C++ compiler for GPUCC via "-ccbin " - # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) - GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif - -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif - - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 
6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
- ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
- ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
- ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
-
- CUBUILDRULEFLAGS = -fPIC -c
- CCBUILDRULEFLAGS = -fPIC -c
-
- else ifneq ($(origin REQUIRE_HIP),undefined)
- # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443)
- $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
- else
- # No hip. Switch hip compilation off and go to common random numbers in C++
- $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
- override GPUCC=
- override USE_NVTX=
- override CUINC=
- override CURANDLIBFLAGS=
- endif
+#=== Configure the CUDA compiler
+
+# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505)
+# This is because it is impossible to pass this to "CUFLAGS += -ccbin <host-compiler>" below
+ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <compiler>" from outside
+ $(warning CUDA builds are not supported for multi-word CXX "$(CXX)")
+ override CUDA_HOME=disabled
+endif
+
+# If CUDA_HOME is not set, try to set it from the location of nvcc
+ifndef CUDA_HOME
+ CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
+ $(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
+endif
+
+# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists
+ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
+ NVCC = $(CUDA_HOME)/bin/nvcc
+ USE_NVTX ?=-DUSE_NVTX
+ # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
+ # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+ # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster).
+ # Embed device code for 70, and PTX for 70+.
+ # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533).
+ # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity).
+ MADGRAPH_CUDA_ARCHITECTURE ?= 70
+ ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533
+ ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533
+ comma:=,
+ CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch))
+ CUINC = -I$(CUDA_HOME)/include/
+ CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
+ CUOPTFLAGS = -lineinfo
+ CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
+ ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+ ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
+ CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h
+ # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
+ ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+ ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+ ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+ ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+else ifneq ($(origin REQUIRE_CUDA),undefined)
+ # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443)
+ $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH))
+else
+ # No cuda. Switch cuda compilation off and go to common random numbers in C++
+ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
+ override NVCC=
+ override USE_NVTX=
+ override CUINC=
+ override CURANDLIBFLAGS=
+endif
- # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
- ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
- GPUFLAGS += -allow-unsupported-compiler
- endif
+# Set the host C++ compiler for nvcc via "-ccbin <host-compiler>"
+# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
+CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+CUFLAGS += -allow-unsupported-compiler
endif
-
#-------------------------------------------------------------------------------
#=== Configure ccache for C++ and CUDA builds
@@ -233,9 +163,9 @@ endif
#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
# override AR:=ccache $(AR)
#endif
-ifneq ($(GPUCC),)
- ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
- override GPUCC:=ccache $(GPUCC)
+ifneq ($(NVCC),)
+ ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
+ override NVCC:=ccache $(NVCC)
endif
endif
@@ -259,7 +189,7 @@ endif
# PowerPC-specific CUDA compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - GPUFLAGS+= -Xcompiler -mno-float128 + CUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -269,10 +199,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -323,10 +253,7 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + ifeq ($(NVCC),) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -401,13 +328,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -416,7 +343,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + CUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -425,7 +352,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + CUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -477,11 +404,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -494,7 +421,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) +ifneq ($(NVCC),) 
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -525,16 +452,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -543,14 +469,11 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) -# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifeq ($(findstring nvcc,$(GPUCC)),nvcc) - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math -else - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math +ifneq ($(NVCC),) +$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math endif endif @@ -566,10 +489,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(GPUCC),) -GPUFLAGS += -Wno-deprecated-builtins +ifneq ($(NVCC),) +CUFLAGS += -Xcompiler -Wno-deprecated-builtins endif endif @@ -577,8 +500,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(NVCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -605,7 +528,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -617,11 +540,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -638,16 +561,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
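A note on the link rules above: cudacpp.mk builds every kernel source twice, once with $(CXX) into the C++ library objects and once with $(NVCC) (using -c -x cu) into the g*.o CUDA objects, before linking check.exe and gcheck.exe separately. A minimal sketch of the single-source idiom this relies on is shown here (illustrative only, not a file from this patch; the empty-qualifier fallback mirrors the one defined in mgOnGpuConfig.h):

  // single_source.cc - sketch: compile once with "g++ -c" (CPU build)
  // and once with "nvcc -c -x cu" (GPU build)
  #ifndef __CUDACC__
  #define __global__ // as in mgOnGpuConfig.h: CUDA specifiers become no-ops in C++ builds
  #define __device__
  #define __host__
  #endif

  #ifdef __CUDACC__
  namespace mg5amcGpu // nvcc predefines __CUDACC__, so this TU lands in the GPU namespace
  #else
  namespace mg5amcCpu // plain C++ compilers take this branch
  #endif
  {
    __global__ void dummyKernel() {} // a real CUDA kernel under nvcc, an ordinary function otherwise
  }

Because the two compilations see different preprocessor branches of identical source, the makefile can drive both the CPU and the GPU build from the same .cc inputs.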
#------------------------------------------------------------------------------- @@ -673,17 +596,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -695,7 +618,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -708,7 +631,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -720,12 +643,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -749,14 +672,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(GPUCC),) # link only runTest.o
+ifeq ($(NVCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
- $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
+ $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
endif
# Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215
@@ -859,9 +782,9 @@ ifeq ($(USECCACHE),1)
ccache --version | head -1
endif
@echo ""
- @echo GPUCC=$(GPUCC)
-ifneq ($(GPUCC),)
- $(GPUCC) --version
+ @echo NVCC=$(NVCC)
+ifneq ($(NVCC),)
+ $(NVCC) --version
endif
@echo ""
@echo CXX=$(CXX)
@@ -880,7 +803,7 @@ endif
# Target: check (run the C++ test executable)
# [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(GPUCC),)
+ifneq ($(NVCC),)
check: runTest cmpFcheck cmpFGcheck
else
check: runTest cmpFcheck
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc
index 2b956730d4..f93c05b0b3 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
#include "Bridge.h"
#include "CPPProcess.h"
-#include "GpuRuntime.h"
+#include "CudaRuntime.h"
extern "C"
{
@@ -22,7 +22,7 @@ extern "C"
* Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations.
* The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
using namespace mg5amcGpu;
#else
using namespace mg5amcCpu;
#endif
@@ -46,8 +46,8 @@ extern "C"
*/
void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
{
-#ifdef MGONGPUCPP_GPUIMPL
- GpuRuntime::setUp();
+#ifdef __CUDACC__
+ CudaRuntime::setUp();
#endif
// Create a process object, read parm card and set parameters
// FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
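A note on the pattern above: fbridgecreate_ calls CudaRuntime::setUp() and fbridgedelete_ calls CudaRuntime::tearDown(), while check_sa.cc instead creates a CudaRuntime object on the stack, so that cudaSetDevice(0) runs in its constructor and cudaDeviceReset() in its destructor. A minimal sketch of that RAII idiom follows (RuntimeGuard is a hypothetical name used for illustration, not the CudaRuntime class from this patch):

  #include <cuda_runtime.h>
  #include <stdexcept>

  struct RuntimeGuard // hypothetical illustration of the CudaRuntime idiom
  {
    RuntimeGuard()
    {
      if( cudaSetDevice( 0 ) != cudaSuccess ) // claim the GPU once, at the start of main
        throw std::runtime_error( "cudaSetDevice(0) failed" );
    }
    ~RuntimeGuard()
    {
      cudaDeviceReset(); // booked teardown: needed e.g. by cuda-memcheck --leak-check full
    }
  };

  int main()
  {
    RuntimeGuard guard; // every exit path from main now resets the device
    // ... allocate buffers, launch kernels ...
    return 0;
  }

Tying the reset to a destructor guarantees the device is released on every exit path, including exceptions; the explicit setUp()/tearDown() pair plays the same role across the Fortran bridge, where no single C++ scope spans the whole run.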
@@ -69,8 +69,8 @@ extern "C"
 Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
 if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" );
 delete pbridge;
-#ifdef MGONGPUCPP_GPUIMPL
- GpuRuntime::tearDown();
+#ifdef __CUDACC__
+ CudaRuntime::tearDown();
#endif
}
@@ -100,7 +100,7 @@ extern "C"
{
 Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
 if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 // Use the device/GPU implementation in the CUDA library
 // (there is also a host implementation in this library)
 pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol );
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc
index 3743934f41..2fb445372d 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#include "mgOnGpuConfig.h"
@@ -13,7 +13,7 @@
//--------------------------------------------------------------------------
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -40,7 +40,7 @@ namespace mg5amcCpu
private:
 const int m_nevt; // The number of events in each iteration
 int m_iiter; // The iteration counter (for random number seeding)
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
 HostBufferMomenta m_hstMomenta; // Memory buffers for momenta
 HostBufferWeights m_hstWeights; // Memory buffers for sampling weights
@@ -105,7 +105,7 @@ namespace mg5amcCpu
extern "C"
{
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 using namespace mg5amcGpu;
#else
 using namespace mg5amcCpu;
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc
index 461ec5c3a5..572e28aaea 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#include "mgOnGpuConfig.h" @@ -15,7 +15,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -32,7 +32,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -114,7 +114,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -123,7 +123,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -249,7 +249,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc index 2bd7a9fcf9..989aba1fdc 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only this test //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc index ac9c95d539..a66e595176 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -20,7 +20,7 @@ #include #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -41,7 +41,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -228,7 +228,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h index c8cb1ed767..5e79643e1c 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -26,7 +26,7 @@ //#include //#include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc index 3741400a1c..fcf1b455b9 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h index 689ae12a60..6d56738204 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h @@ -222,7 +222,7 @@ namespace Parameters_heft_dependentCouplings #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -273,7 +273,7 @@ namespace Parameters_heft_dependentCouplings // End non-SM (e.g. 
EFT) implementation - special handling of vectors of floats (#439) return out; } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -289,7 +289,7 @@ namespace Parameters_heft_independentCouplings //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h index d4e37d19b3..6c0c4919e9 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,26 +10,13 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL -// Is this a GPU (CUDA, HIP) or CPU implementation? -#ifdef __CUDACC__ -#define MGONGPUCPP_GPUIMPL cuda -#elif defined __HIPCC__ -#define MGONGPUCPP_GPUIMPL hip -#else -#undef MGONGPUCPP_GPUIMPL -#endif - // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For CUDA, by default, it is supported -// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND -#elif defined __HIPCC__ -#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -65,28 +52,23 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#ifndef __CUDACC__ +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) +#endif + +// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) -#elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#else -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default in CUDA +#undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 -#else -#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -102,21 +84,17 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (CUDA complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA -#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA -#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA +// SANITY CHECKS (c++ complex number implementation) +#ifndef __CUDACC__ +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL #endif #endif -// SANITY CHECKS (C++ complex number implementation) -#ifndef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ +// SANITY CHECKS (cuda complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif 
#endif @@ -153,7 +131,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -164,7 +142,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD +#ifdef __CUDACC__ // CUDA implementation has no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -194,9 +172,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -208,8 +186,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA/HIP declaration specifiers for C++ -#ifndef MGONGPUCPP_GPUIMPL +// Define empty CUDA declaration specifiers for C++ +#ifndef __CUDACC__ #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h index 4e7ab03fa2..0cb2f1db7e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
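// --------------------------------------------------------------------------
// For context: a minimal single-source sketch of what the empty __host__ /
// __device__ specifier #defines above enable (sketch_square is a hypothetical
// function, not part of this codebase). Under nvcc the specifiers mark the
// function for both host and device compilation; under a plain C++ compiler
// the macros expand to nothing and the same line compiles as an ordinary
// inline function.
__host__ __device__ inline double sketch_square( const double x ) { return x * x; }
// --------------------------------------------------------------------------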
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -201,7 +201,7 @@ namespace mgOnGpu { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -281,7 +281,7 @@ cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-flo //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -317,11 +317,11 @@ cxmake( const cxtype& c ) return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -536,11 +536,11 @@ cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-f return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -584,7 +584,7 @@ cxmake( const std::complex& c ) // std::complex to std::complex (cast do } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h index 6f6cee64d6..a1cde16a67 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -13,7 +13,7 @@ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda //------------------------------ // Floating point types - Cuda @@ -57,11 +57,11 @@ fpsqrt( const fptype& f ) #endif } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //------------------------------ // Floating point types - C++ @@ -85,7 +85,7 @@ fpsqrt( const fptype& f ) return std::sqrt( f ); } -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h index 7904b93c61..9d3e82b1e3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h @@ -108,7 +108,7 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) const int neppV = 1; @@ -129,7 +129,7 @@ using mgOnGpu::bool_v; //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Printout to stream for user defined types @@ -744,11 +744,11 @@ fpvsplit1( const fptype2_v& v ) #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //------------------------------ // Vector types - CUDA @@ -786,12 +786,12 @@ cxternary( const bool& mask, const cxtype& a, const cxtype& b ) return ( mask ? a : b ); } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -812,7 +812,7 @@ typedef mgOnGpu::cxtype_ref cxtype_sv_ref; #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/heft_gg_h.sa/src/rambo.h b/epochX/cudacpp/heft_gg_h.sa/src/rambo.h index cd7e1008ea..e02ea52496 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/rambo.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
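// --------------------------------------------------------------------------
// For context: the scalar-or-vector ("_sv") types shown above let identical
// kernel code process one event per call in CUDA (where fptype_sv is fptype)
// and neppV events per call in vectorized C++ (where fptype_sv is a SIMD
// vector of neppV fptypes). A minimal sketch with a hypothetical helper,
// assuming mgOnGpuVectors.h and mgOnGpuConfig.h are in scope:
__host__ __device__ inline fptype_sv
sketch_sum_sv( const fptype_sv& a, const fptype_sv& b )
{
  return a + b; // one scalar add in CUDA, neppV element-wise adds in SIMD C++
}
// --------------------------------------------------------------------------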
//========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) From ef615f54bfb6b3b82649416a1f9556e2a72ff414 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 25 Jul 2023 16:39:32 +0200 Subject: [PATCH 393/509] [jthip] regenerate ggttgg.mad - all looks ok! so CODEGEN is in sync with ggttgg.mad But building the latter fails, e.g. ccache /usr/local/cuda-12.0/bin/nvcc -o fgcheck.exe ./fcheck_sa.o ./fsampler_cu.o -L../../lib -lmg5amc_common -Xlinker -rpath='$ORIGIN/../../lib' -lgfortran -L../../lib -lmg5amc_gg_ttxgg_cuda ./gCommonRandomNumberKernel.o ./gRamboSamplingKernels.o /cvmfs/sft.cern.ch/lcg/releases/binutils/2.37-4177a/x86_64-centos8/bin/ld: ./gCPPProcess.o: in function `mg5amcGpu::CPPProcess::initProc(std::__cxx11::basic_string, std::allocator > const&)': tmpxft_0035b9d4_00000000-6_gCPPProcess.cudafe1.cpp:(.text+0x9c8): undefined reference to `mg5amcGpu::Parameters_sm::getInstance()' --- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 36 +++++++++---------- .../gg_ttgg.mad/Source/DHELAS/aloha_file.inc | 2 +- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index c8f29ccb7d..895de5b9bb 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005085468292236328  +DEBUG: model prefixing takes 0.00452733039855957  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxgg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -192,15 +192,15 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [model_handling.py at line 1163]  DEBUG: multi_channel =  {1: [1], 2: [2], 3: [3], 4: [4], 5: [5], 6: [6], 7: [7], 8: [8], 9: [9], 10: [10], 11: [11], 12: [12], 13: [13], 14: [14], 15: [15], 16: [16], 17: [17], 18: [18], 19: [19], 20: [20], 21: [21], 22: [22], 23: [23], 24: [24], 25: [25], 26: [26], 27: [27], 28: [28], 29: [29], 30: [30], 31: [32], 32: [33], 33: [34], 34: [35], 35: [36], 36: [37], 37: [38], 38: [39], 39: [40], 40: [41], 41: [42], 42: [43], 43: [44], 44: [45], 45: [46], 46: [48], 47: [49], 48: [50], 49: [51], 50: [52], 51: [53], 52: [54], 53: [55], 54: [56], 55: [58], 56: [59], 57: [60], 58: [61], 59: [62], 60: [63], 61: [64], 62: [65], 63: [66], 64: [67], 65: [68], 66: [69], 67: [70], 68: [71], 69: [72], 70: [74], 71: [75], 72: [76], 73: [77], 74: [78], 75: [79], 76: [80], 77: [81], 78: [82], 79: [83], 80: [84], 81: [85], 82: [86], 83: [87], 84: [88], 85: [89], 86: [90], 87: [91], 88: [93], 89: [94], 90: [95], 91: [96], 92: [97], 93: [98], 94: [100], 95: [101], 96: [102], 97: [103], 98: [104], 99: [105], 100: [107], 101: [108], 102: [109], 103: [110], 104: [111], 105: [112]} [model_handling.py at line 1169]  DEBUG: multi_channel_map =  {1: [1], 2: [2], 3: [3], 4: [4], 5: [5], 6: [6], 7: [7], 8: [8], 9: [9], 10: [10], 11: [11], 12: [12], 13: [13], 14: [14], 15: [15], 16: [16], 17: [17], 18: [18], 19: [19], 20: [20], 21: [21], 22: [22], 23: [23], 24: [24], 25: [25], 26: [26], 27: [27], 28: [28], 29: [29], 30: [30], 31: [32], 32: [33], 33: [34], 34: [35], 35: [36], 36: [37], 37: [38], 38: [39], 39: [40], 40: [41], 41: [42], 42: [43], 43: [44], 44: [45], 45: [46], 46: [48], 47: [49], 48: [50], 49: [51], 50: [52], 51: [53], 52: [54], 53: [55], 54: [56], 55: [58], 56: [59], 57: [60], 58: [61], 59: [62], 60: [63], 61: [64], 62: [65], 63: [66], 64: [67], 65: [68], 66: [69], 67: [70], 68: [71], 69: [72], 70: [74], 71: [75], 72: [76], 73: [77], 74: [78], 75: [79], 76: [80], 77: [81], 78: [82], 79: [83], 80: [84], 81: [85], 82: [86], 83: [87], 84: [88], 85: [89], 86: [90], 87: [91], 88: [93], 89: [94], 90: [95], 91: [96], 92: [97], 93: [98], 94: [100], 95: [101], 96: [102], 97: [103], 98: [104], 99: [105], 100: [107], 101: [108], 102: [109], 103: [110], 104: [111], 105: [112]} [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, 11: 8, 12: 9, 13: 10, 14: 11, 15: 12, 16: 13, 17: 14, 18: 15, 19: 16, 20: 17, 21: 18, 22: 19, 23: 20, 24: 21, 25: 22, 26: 23, 27: 24, 28: 25, 29: 26, 30: 27, 31: 28, 32: 29, 33: 30, 37: 31, 38: 32, 39: 33, 40: 34, 41: 35, 42: 36, 43: 37, 44: 38, 45: 39, 46: 40, 47: 41, 48: 42, 49: 43, 50: 44, 51: 45, 55: 46, 56: 47, 57: 48, 58: 49, 59: 50, 60: 51, 61: 52, 62: 53, 63: 54, 67: 55, 68: 56, 69: 57, 70: 58, 71: 59, 72: 60, 73: 
61, 74: 62, 75: 63, 76: 64, 77: 65, 78: 66, 79: 67, 80: 68, 81: 69, 85: 70, 86: 71, 87: 72, 88: 73, 89: 74, 90: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 100: 85, 101: 86, 102: 87, 106: 88, 107: 89, 108: 90, 109: 91, 110: 92, 111: 93, 115: 94, 116: 95, 117: 96, 118: 97, 119: 98, 120: 99, 124: 100, 125: 101, 126: 102, 127: 103, 128: 104, 129: 105} [model_handling.py at line 1709]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1822]  +DEBUG: diag_to_config =  {4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, 11: 8, 12: 9, 13: 10, 14: 11, 15: 12, 16: 13, 17: 14, 18: 15, 19: 16, 20: 17, 21: 18, 22: 19, 23: 20, 24: 21, 25: 22, 26: 23, 27: 24, 28: 25, 29: 26, 30: 27, 31: 28, 32: 29, 33: 30, 37: 31, 38: 32, 39: 33, 40: 34, 41: 35, 42: 36, 43: 37, 44: 38, 45: 39, 46: 40, 47: 41, 48: 42, 49: 43, 50: 44, 51: 45, 55: 46, 56: 47, 57: 48, 58: 49, 59: 50, 60: 51, 61: 52, 62: 53, 63: 54, 67: 55, 68: 56, 69: 57, 70: 58, 71: 59, 72: 60, 73: 61, 74: 62, 75: 63, 76: 64, 77: 65, 78: 66, 79: 67, 80: 68, 81: 69, 85: 70, 86: 71, 87: 72, 88: 73, 89: 74, 90: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 100: 85, 101: 86, 102: 87, 106: 88, 107: 89, 108: 90, 109: 91, 110: 92, 111: 93, 115: 94, 116: 95, 117: 96, 118: 97, 119: 98, 120: 99, 124: 100, 125: 101, 126: 102, 127: 103, 128: 104, 129: 105} [model_handling.py at line 1711]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1824]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  @@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.384 s -Wrote files for 222 helas calls in 0.659 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.381 s +Wrote files for 222 helas calls in 0.657 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.282 s +ALOHA: aloha creates 5 routines in 0.280 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.271 s +ALOHA: aloha creates 10 routines in 0.265 s VVV1 VVV1 FFV1 @@ -272,6 +272,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.022s -user 0m2.797s -sys 0m0.209s +real 0m3.139s +user 0m2.758s +sys 0m0.263s diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc index ec923afd6d..cf4ec946f8 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV1_1.o VVVV4_0.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3_0.o VVVV1_0.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o +ALOHARoutine = VVVV3_0.o VVVV4P0_1.o VVVV3P0_1.o VVVV1P0_1.o FFV1_1.o FFV1_2.o VVV1P0_1.o VVV1_0.o FFV1_0.o FFV1P0_3.o VVVV1_0.o VVVV4_0.o From 70c884ae3129e087c86f7504e2cc40efed95bac6 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 25 Jul 2023 16:44:37 +0200 Subject: [PATCH 394/509] [jthip] in ggttgg.mad cudacpp_src.mk, use GPUCC instead of NVCC as implemented by Jorgen in Subprocesses The build now succeeds --- epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk index 554d7a704c..f73ff6fa03 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk @@ -38,13 +38,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -246,20 +246,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi From a912fc542472dc128b517bc8cadd226d99478d59 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 25 Jul 2023 16:54:15 +0200 Subject: [PATCH 395/509] [jthip/namespace] in ggttgg.mad, manually check and fix the pending __CUDACC__ (added in the namespace branch) that must be changed to MGONGPUCPP_GPUIMPL --- .../gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc | 4 ++-- .../gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h | 2 +- .../gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h | 2 +- epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h | 2 +- .../gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h | 2 +- .../gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h | 2 +- epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc | 4 ++-- epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc | 6 +++--- epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h | 6 +++--- epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h | 8 ++++---- epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h | 2 +- epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h | 4 ++-- 16 files changed, 26 insertions(+), 26 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc index 38c477c17a..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for 
CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h +++ 
b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc index 8f3480c45f..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc index 99045901db..8d18f8d0fd 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc index 41fb6e4c0f..01e7d9bcf2 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h index af0876f367..b44537e599 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef 
__CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h index 687d449117..6ae0c42ecb 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -209,7 +209,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -249,7 +249,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -627,7 +627,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h index e71fa26aec..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h index 1a1530af9d..1d5722cb30 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -153,7 +153,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace 
mg5amcCpu From af0f0d4458fd5089ff47188b5631e6aa8e1014f3 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 25 Jul 2023 16:59:59 +0200 Subject: [PATCH 396/509] [jthip/namespace] backport latest changes from ggttgg.mad to CODEGEN --- .../iolibs/template_files/cpp_model_parameters_cc.inc | 2 +- .../iolibs/template_files/cpp_model_parameters_h.inc | 6 +++--- .../iolibs/template_files/gpu/CurandRandomNumberKernel.cc | 4 ++-- .../iolibs/template_files/gpu/MemoryAccessAmplitudes.h | 2 +- .../iolibs/template_files/gpu/MemoryAccessCouplings.h | 2 +- .../template_files/gpu/MemoryAccessCouplingsFixed.h | 2 +- .../iolibs/template_files/gpu/MemoryAccessDenominators.h | 2 +- .../madgraph/iolibs/template_files/gpu/MemoryAccessGs.h | 2 +- .../template_files/gpu/MemoryAccessMatrixElements.h | 2 +- .../iolibs/template_files/gpu/MemoryAccessNumerators.h | 2 +- .../iolibs/template_files/gpu/MemoryAccessWavefunctions.h | 2 +- .../madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h | 8 ++++---- .../madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h | 2 +- .../madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h | 4 ++-- .../madgraph/iolibs/template_files/gpu/testmisc.cc | 4 ++-- .../madgraph/iolibs/template_files/gpu/testxxx.cc | 6 +++--- 16 files changed, 26 insertions(+), 26 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc index 05b664981d..54ce4c64cf 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc @@ -15,7 +15,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc index e0b7c27131..94b8dd6444 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc @@ -25,7 +25,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -85,7 +85,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -155,7 +155,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc index 
38c477c17a..08a16f6f2c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h index 1afc589b11..b4b76f3842 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h +++ 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu 
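// --------------------------------------------------------------------------
// For context: the check being renamed throughout this patch selects one of
// two namespaces for the same source, so that CPU and GPU builds of identical
// code produce distinct, non-colliding symbols (see #318 and #725):
#ifdef MGONGPUCPP_GPUIMPL // any GPU build (CUDA or HIP), not only nvcc
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  // ... identical implementation, compiled once per backend ...
}
// Testing __CUDACC__ directly would classify a HIP build as a CPU build and
// place its symbols in the wrong namespace; presumably this is the cause of
// the "undefined reference to mg5amcGpu::Parameters_sm::getInstance()" link
// error noted a few commits earlier.
// --------------------------------------------------------------------------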
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h index 687d449117..6ae0c42ecb 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -209,7 +209,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -249,7 +249,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -627,7 +627,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h index e71fa26aec..fa3a02664b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h index 1a1530af9d..1d5722cb30 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -153,7 +153,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git 
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc index 8f3480c45f..ba9e59a8a3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc index 9828c108e8..68a7bf876e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; From 1eb284af9dc999c3d0eb17a558f534c3ae307d72 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 25 Jul 2023 17:06:31 +0200 Subject: [PATCH 397/509] [jthip] in CODEGEN, backport also cudacpp_src.mk using GPUCC instead of NVCC --- .../iolibs/template_files/gpu/cudacpp_src.mk | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk index dac2e47d1d..f3a26552db 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk @@ -38,13 +38,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep 
ccache),1) @@ -246,20 +246,20 @@ $(BUILDDIR)/%%.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%%_cu.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_%(model)s.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_%(model)s_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi From 8206a29bafd8d68e40f567c52df4e49319aeecc5 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 25 Jul 2023 17:07:09 +0200 Subject: [PATCH 398/509] [jthip] regenerate ggttgg.mad, all ok --- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 895de5b9bb..94169c12b5 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00452733039855957  +DEBUG: model prefixing takes 0.0047986507415771484  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.143 s +1 processes with 123 diagrams generated in 0.144 s Total: 1 processes with 123 diagrams output madevent CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxgg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.381 s -Wrote files for 222 helas calls in 0.657 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.385 s +Wrote files for 222 helas calls in 0.661 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.280 s +ALOHA: aloha creates 5 routines in 0.394 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.265 s +ALOHA: aloha creates 10 routines in 0.299 s VVV1 VVV1 FFV1 @@ -272,6 +272,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.139s -user 0m2.758s -sys 0m0.263s +real 0m3.405s +user 0m2.768s +sys 0m0.245s From e32e8ba9742ff7cb897cde205348fd4349034adf Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 26 Jul 2023 12:51:48 +0200 Subject: [PATCH 399/509] [jthip] go back to origin/fpe ggttgg.mad codegen log for easier merging of upstream/master --- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 94169c12b5..c8f29ccb7d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0047986507415771484  +DEBUG: model prefixing takes 0.005085468292236328  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.144 s +1 processes with 123 diagrams generated in 0.143 s Total: 1 processes with 123 diagrams output madevent CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxgg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -192,15 +192,15 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [model_handling.py at line 1163]  DEBUG: multi_channel =  {1: [1], 2: [2], 3: [3], 4: [4], 5: [5], 6: [6], 7: [7], 8: [8], 9: [9], 10: [10], 11: [11], 12: [12], 13: [13], 14: [14], 15: [15], 16: [16], 17: [17], 18: [18], 19: [19], 20: [20], 21: [21], 22: [22], 23: [23], 24: [24], 25: [25], 26: [26], 27: [27], 28: [28], 29: [29], 30: [30], 31: [32], 32: [33], 33: [34], 34: [35], 35: [36], 36: [37], 37: [38], 38: [39], 39: [40], 40: [41], 41: [42], 42: [43], 43: [44], 44: [45], 45: [46], 46: [48], 47: [49], 48: [50], 49: [51], 50: [52], 51: [53], 52: [54], 53: [55], 54: [56], 55: [58], 56: [59], 57: [60], 58: [61], 59: [62], 60: [63], 61: [64], 62: [65], 63: [66], 64: [67], 65: [68], 66: [69], 67: [70], 68: [71], 69: [72], 70: [74], 71: [75], 72: [76], 73: [77], 74: [78], 75: [79], 76: [80], 77: [81], 78: [82], 79: [83], 80: [84], 81: [85], 82: [86], 83: [87], 84: [88], 85: [89], 86: [90], 87: [91], 88: [93], 89: [94], 90: [95], 91: [96], 92: [97], 93: [98], 94: [100], 95: [101], 96: [102], 97: [103], 98: [104], 99: [105], 100: [107], 101: [108], 102: [109], 103: [110], 104: [111], 105: [112]} [model_handling.py at line 1169]  DEBUG: multi_channel_map =  {1: [1], 2: [2], 3: [3], 4: [4], 5: [5], 6: [6], 7: [7], 8: [8], 9: [9], 10: [10], 11: [11], 12: [12], 13: [13], 14: [14], 15: [15], 16: [16], 17: [17], 18: [18], 19: [19], 20: [20], 21: [21], 22: [22], 23: [23], 24: [24], 25: [25], 26: [26], 27: [27], 28: [28], 29: [29], 30: [30], 31: [32], 32: [33], 33: [34], 34: [35], 35: [36], 36: [37], 37: [38], 38: [39], 39: [40], 40: [41], 41: [42], 42: [43], 43: [44], 44: [45], 45: [46], 46: [48], 47: [49], 48: [50], 49: [51], 50: [52], 51: [53], 52: [54], 53: [55], 54: [56], 55: [58], 56: [59], 57: [60], 58: [61], 59: [62], 60: [63], 61: [64], 62: [65], 63: [66], 64: [67], 65: [68], 66: [69], 67: [70], 68: [71], 69: [72], 70: [74], 71: [75], 72: [76], 73: [77], 74: [78], 75: [79], 76: [80], 77: [81], 78: [82], 79: [83], 80: [84], 81: [85], 82: [86], 83: [87], 84: [88], 85: [89], 86: [90], 87: [91], 88: [93], 89: [94], 90: [95], 91: [96], 92: [97], 93: [98], 94: [100], 95: [101], 96: [102], 97: [103], 98: [104], 99: [105], 100: [107], 101: [108], 102: [109], 103: [110], 104: [111], 105: [112]} [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, 11: 8, 12: 9, 13: 10, 14: 11, 15: 12, 16: 13, 17: 14, 18: 15, 19: 16, 20: 17, 21: 18, 22: 19, 23: 20, 24: 21, 25: 22, 26: 23, 27: 24, 28: 25, 29: 26, 30: 27, 31: 28, 32: 29, 33: 30, 37: 31, 38: 32, 39: 33, 40: 34, 41: 35, 42: 36, 43: 37, 44: 38, 45: 39, 46: 40, 47: 41, 48: 42, 49: 43, 50: 44, 51: 45, 55: 46, 56: 47, 57: 48, 58: 49, 59: 50, 60: 51, 61: 52, 62: 53, 63: 54, 67: 55, 68: 56, 69: 57, 70: 58, 71: 59, 72: 60, 73: 
61, 74: 62, 75: 63, 76: 64, 77: 65, 78: 66, 79: 67, 80: 68, 81: 69, 85: 70, 86: 71, 87: 72, 88: 73, 89: 74, 90: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 100: 85, 101: 86, 102: 87, 106: 88, 107: 89, 108: 90, 109: 91, 110: 92, 111: 93, 115: 94, 116: 95, 117: 96, 118: 97, 119: 98, 120: 99, 124: 100, 125: 101, 126: 102, 127: 103, 128: 104, 129: 105} [model_handling.py at line 1711]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1824]  +DEBUG: diag_to_config =  {4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, 11: 8, 12: 9, 13: 10, 14: 11, 15: 12, 16: 13, 17: 14, 18: 15, 19: 16, 20: 17, 21: 18, 22: 19, 23: 20, 24: 21, 25: 22, 26: 23, 27: 24, 28: 25, 29: 26, 30: 27, 31: 28, 32: 29, 33: 30, 37: 31, 38: 32, 39: 33, 40: 34, 41: 35, 42: 36, 43: 37, 44: 38, 45: 39, 46: 40, 47: 41, 48: 42, 49: 43, 50: 44, 51: 45, 55: 46, 56: 47, 57: 48, 58: 49, 59: 50, 60: 51, 61: 52, 62: 53, 63: 54, 67: 55, 68: 56, 69: 57, 70: 58, 71: 59, 72: 60, 73: 61, 74: 62, 75: 63, 76: 64, 77: 65, 78: 66, 79: 67, 80: 68, 81: 69, 85: 70, 86: 71, 87: 72, 88: 73, 89: 74, 90: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 100: 85, 101: 86, 102: 87, 106: 88, 107: 89, 108: 90, 109: 91, 110: 92, 111: 93, 115: 94, 116: 95, 117: 96, 118: 97, 119: 98, 120: 99, 124: 100, 125: 101, 126: 102, 127: 103, 128: 104, 129: 105} [model_handling.py at line 1709]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1822]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1822]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1822]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  +DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1822]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  @@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.385 s -Wrote files for 222 helas calls in 0.661 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.384 s +Wrote files for 222 helas calls in 0.659 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.394 s +ALOHA: aloha creates 5 routines in 0.282 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.299 s +ALOHA: aloha creates 10 routines in 0.271 s VVV1 VVV1 FFV1 @@ -272,6 +272,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.405s -user 0m2.768s -sys 0m0.245s +real 0m3.022s +user 0m2.797s +sys 0m0.209s From 9d5b6d9f42867250b997b312ef2815286ee2a76c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 26 Jul 2023 12:53:41 +0200 Subject: [PATCH 400/509] [jthip] regenerate ggttgg.mad after merging upstream/master - all ok (will revert the log) --- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 8c2a3bf79e..859c60f07e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0053708553314208984  +DEBUG: model prefixing takes 0.005190134048461914  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.144 s +1 processes with 123 diagrams generated in 0.153 s Total: 1 processes with 123 diagrams output madevent CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxgg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
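For reference, the near-identical CODEGEN_mad_gg_ttgg_log.txt diffs in this stretch of commits differ only in timings and in model_handling.py line numbers: each regeneration reruns the same two MadGraph commands recorded at the top of the log. A sketch of reproducing it by hand, assuming the usual ./bin/mg5_aMC launcher location in a local checkout:

    # Sketch: regenerate the gg_ttgg.mad process; the commands are the ones shown in the log.
    cat > codegen_gg_ttgg.mg5 << 'EOF'
    generate g g > t t~ g g
    output madevent CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
    EOF
    ./bin/mg5_aMC codegen_gg_ttgg.mg5   # launcher path assumed; timings differ run to run

The real/user/sys figures at the bottom of the log change on every run, which is why these log diffs keep being regenerated and then reverted for easier merging.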
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -192,15 +192,15 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [model_handling.py at line 1163]  DEBUG: multi_channel =  {1: [1], 2: [2], 3: [3], 4: [4], 5: [5], 6: [6], 7: [7], 8: [8], 9: [9], 10: [10], 11: [11], 12: [12], 13: [13], 14: [14], 15: [15], 16: [16], 17: [17], 18: [18], 19: [19], 20: [20], 21: [21], 22: [22], 23: [23], 24: [24], 25: [25], 26: [26], 27: [27], 28: [28], 29: [29], 30: [30], 31: [32], 32: [33], 33: [34], 34: [35], 35: [36], 36: [37], 37: [38], 38: [39], 39: [40], 40: [41], 41: [42], 42: [43], 43: [44], 44: [45], 45: [46], 46: [48], 47: [49], 48: [50], 49: [51], 50: [52], 51: [53], 52: [54], 53: [55], 54: [56], 55: [58], 56: [59], 57: [60], 58: [61], 59: [62], 60: [63], 61: [64], 62: [65], 63: [66], 64: [67], 65: [68], 66: [69], 67: [70], 68: [71], 69: [72], 70: [74], 71: [75], 72: [76], 73: [77], 74: [78], 75: [79], 76: [80], 77: [81], 78: [82], 79: [83], 80: [84], 81: [85], 82: [86], 83: [87], 84: [88], 85: [89], 86: [90], 87: [91], 88: [93], 89: [94], 90: [95], 91: [96], 92: [97], 93: [98], 94: [100], 95: [101], 96: [102], 97: [103], 98: [104], 99: [105], 100: [107], 101: [108], 102: [109], 103: [110], 104: [111], 105: [112]} [model_handling.py at line 1169]  DEBUG: multi_channel_map =  {1: [1], 2: [2], 3: [3], 4: [4], 5: [5], 6: [6], 7: [7], 8: [8], 9: [9], 10: [10], 11: [11], 12: [12], 13: [13], 14: [14], 15: [15], 16: [16], 17: [17], 18: [18], 19: [19], 20: [20], 21: [21], 22: [22], 23: [23], 24: [24], 25: [25], 26: [26], 27: [27], 28: [28], 29: [29], 30: [30], 31: [32], 32: [33], 33: [34], 34: [35], 35: [36], 36: [37], 37: [38], 38: [39], 39: [40], 40: [41], 41: [42], 42: [43], 43: [44], 44: [45], 45: [46], 46: [48], 47: [49], 48: [50], 49: [51], 50: [52], 51: [53], 52: [54], 53: [55], 54: [56], 55: [58], 56: [59], 57: [60], 58: [61], 59: [62], 60: [63], 61: [64], 62: [65], 63: [66], 64: [67], 65: [68], 66: [69], 67: [70], 68: [71], 69: [72], 70: [74], 71: [75], 72: [76], 73: [77], 74: [78], 75: [79], 76: [80], 77: [81], 78: [82], 79: [83], 80: [84], 81: [85], 82: [86], 83: [87], 84: [88], 85: [89], 86: [90], 87: [91], 88: [93], 89: [94], 90: [95], 91: [96], 92: [97], 93: [98], 94: [100], 95: [101], 96: [102], 97: [103], 98: [104], 99: [105], 100: [107], 101: [108], 102: [109], 103: [110], 104: [111], 105: [112]} [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, 11: 8, 12: 9, 13: 10, 14: 11, 15: 12, 16: 13, 17: 14, 18: 15, 19: 16, 20: 17, 21: 18, 22: 19, 23: 20, 24: 21, 25: 22, 26: 23, 27: 24, 28: 25, 29: 26, 30: 27, 31: 28, 32: 29, 33: 30, 37: 31, 38: 32, 39: 33, 40: 34, 41: 35, 42: 36, 43: 37, 44: 38, 45: 39, 46: 40, 47: 41, 48: 42, 49: 43, 50: 44, 51: 45, 55: 46, 56: 47, 57: 48, 58: 49, 59: 50, 60: 51, 61: 52, 62: 53, 63: 54, 67: 55, 68: 56, 69: 57, 70: 58, 71: 59, 72: 60, 73: 
61, 74: 62, 75: 63, 76: 64, 77: 65, 78: 66, 79: 67, 80: 68, 81: 69, 85: 70, 86: 71, 87: 72, 88: 73, 89: 74, 90: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 100: 85, 101: 86, 102: 87, 106: 88, 107: 89, 108: 90, 109: 91, 110: 92, 111: 93, 115: 94, 116: 95, 117: 96, 118: 97, 119: 98, 120: 99, 124: 100, 125: 101, 126: 102, 127: 103, 128: 104, 129: 105} [model_handling.py at line 1709]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1822]  +DEBUG: diag_to_config =  {4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, 11: 8, 12: 9, 13: 10, 14: 11, 15: 12, 16: 13, 17: 14, 18: 15, 19: 16, 20: 17, 21: 18, 22: 19, 23: 20, 24: 21, 25: 22, 26: 23, 27: 24, 28: 25, 29: 26, 30: 27, 31: 28, 32: 29, 33: 30, 37: 31, 38: 32, 39: 33, 40: 34, 41: 35, 42: 36, 43: 37, 44: 38, 45: 39, 46: 40, 47: 41, 48: 42, 49: 43, 50: 44, 51: 45, 55: 46, 56: 47, 57: 48, 58: 49, 59: 50, 60: 51, 61: 52, 62: 53, 63: 54, 67: 55, 68: 56, 69: 57, 70: 58, 71: 59, 72: 60, 73: 61, 74: 62, 75: 63, 76: 64, 77: 65, 78: 66, 79: 67, 80: 68, 81: 69, 85: 70, 86: 71, 87: 72, 88: 73, 89: 74, 90: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 100: 85, 101: 86, 102: 87, 106: 88, 107: 89, 108: 90, 109: 91, 110: 92, 111: 93, 115: 94, 116: 95, 117: 96, 118: 97, 119: 98, 120: 99, 124: 100, 125: 101, 126: 102, 127: 103, 128: 104, 129: 105} [model_handling.py at line 1711]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1824]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  @@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.385 s -Wrote files for 222 helas calls in 0.669 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.409 s +Wrote files for 222 helas calls in 0.708 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.277 s +ALOHA: aloha creates 5 routines in 0.310 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.268 s +ALOHA: aloha creates 10 routines in 0.296 s VVV1 VVV1 FFV1 @@ -272,6 +272,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.653s -user 0m3.071s -sys 0m0.363s +real 0m3.274s +user 0m2.978s +sys 0m0.228s From e55d57b3e69afe83b08aa39f9c55022e18330386 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 26 Jul 2023 12:54:11 +0200 Subject: [PATCH 401/509] [jthip] revert the ggttgg codegen log for easier merging later on Revert "[jthip] regenerate ggttgg.mad after merging upstream/master - all ok (will revert the log)" This reverts commit 9d5b6d9f42867250b997b312ef2815286ee2a76c. --- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 859c60f07e..8c2a3bf79e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005190134048461914  +DEBUG: model prefixing takes 0.0053708553314208984  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.153 s +1 processes with 123 diagrams generated in 0.144 s Total: 1 processes with 123 diagrams output madevent CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxgg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -192,15 +192,15 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [model_handling.py at line 1163]  DEBUG: multi_channel =  {1: [1], 2: [2], 3: [3], 4: [4], 5: [5], 6: [6], 7: [7], 8: [8], 9: [9], 10: [10], 11: [11], 12: [12], 13: [13], 14: [14], 15: [15], 16: [16], 17: [17], 18: [18], 19: [19], 20: [20], 21: [21], 22: [22], 23: [23], 24: [24], 25: [25], 26: [26], 27: [27], 28: [28], 29: [29], 30: [30], 31: [32], 32: [33], 33: [34], 34: [35], 35: [36], 36: [37], 37: [38], 38: [39], 39: [40], 40: [41], 41: [42], 42: [43], 43: [44], 44: [45], 45: [46], 46: [48], 47: [49], 48: [50], 49: [51], 50: [52], 51: [53], 52: [54], 53: [55], 54: [56], 55: [58], 56: [59], 57: [60], 58: [61], 59: [62], 60: [63], 61: [64], 62: [65], 63: [66], 64: [67], 65: [68], 66: [69], 67: [70], 68: [71], 69: [72], 70: [74], 71: [75], 72: [76], 73: [77], 74: [78], 75: [79], 76: [80], 77: [81], 78: [82], 79: [83], 80: [84], 81: [85], 82: [86], 83: [87], 84: [88], 85: [89], 86: [90], 87: [91], 88: [93], 89: [94], 90: [95], 91: [96], 92: [97], 93: [98], 94: [100], 95: [101], 96: [102], 97: [103], 98: [104], 99: [105], 100: [107], 101: [108], 102: [109], 103: [110], 104: [111], 105: [112]} [model_handling.py at line 1169]  DEBUG: multi_channel_map =  {1: [1], 2: [2], 3: [3], 4: [4], 5: [5], 6: [6], 7: [7], 8: [8], 9: [9], 10: [10], 11: [11], 12: [12], 13: [13], 14: [14], 15: [15], 16: [16], 17: [17], 18: [18], 19: [19], 20: [20], 21: [21], 22: [22], 23: [23], 24: [24], 25: [25], 26: [26], 27: [27], 28: [28], 29: [29], 30: [30], 31: [32], 32: [33], 33: [34], 34: [35], 35: [36], 36: [37], 37: [38], 38: [39], 39: [40], 40: [41], 41: [42], 42: [43], 43: [44], 44: [45], 45: [46], 46: [48], 47: [49], 48: [50], 49: [51], 50: [52], 51: [53], 52: [54], 53: [55], 54: [56], 55: [58], 56: [59], 57: [60], 58: [61], 59: [62], 60: [63], 61: [64], 62: [65], 63: [66], 64: [67], 65: [68], 66: [69], 67: [70], 68: [71], 69: [72], 70: [74], 71: [75], 72: [76], 73: [77], 74: [78], 75: [79], 76: [80], 77: [81], 78: [82], 79: [83], 80: [84], 81: [85], 82: [86], 83: [87], 84: [88], 85: [89], 86: [90], 87: [91], 88: [93], 89: [94], 90: [95], 91: [96], 92: [97], 93: [98], 94: [100], 95: [101], 96: [102], 97: [103], 98: [104], 99: [105], 100: [107], 101: [108], 102: [109], 103: [110], 104: [111], 105: [112]} [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, 11: 8, 12: 9, 13: 10, 14: 11, 15: 12, 16: 13, 17: 14, 18: 15, 19: 16, 20: 17, 21: 18, 22: 19, 23: 20, 24: 21, 25: 22, 26: 23, 27: 24, 28: 25, 29: 26, 30: 27, 31: 28, 32: 29, 33: 30, 37: 31, 38: 32, 39: 33, 40: 34, 41: 35, 42: 36, 43: 37, 44: 38, 45: 39, 46: 40, 47: 41, 48: 42, 49: 43, 50: 44, 51: 45, 55: 46, 56: 47, 57: 48, 58: 49, 59: 50, 60: 51, 61: 52, 62: 53, 63: 54, 67: 55, 68: 56, 69: 57, 70: 58, 71: 59, 72: 60, 73: 
61, 74: 62, 75: 63, 76: 64, 77: 65, 78: 66, 79: 67, 80: 68, 81: 69, 85: 70, 86: 71, 87: 72, 88: 73, 89: 74, 90: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 100: 85, 101: 86, 102: 87, 106: 88, 107: 89, 108: 90, 109: 91, 110: 92, 111: 93, 115: 94, 116: 95, 117: 96, 118: 97, 119: 98, 120: 99, 124: 100, 125: 101, 126: 102, 127: 103, 128: 104, 129: 105} [model_handling.py at line 1711]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1824]  +DEBUG: diag_to_config =  {4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, 11: 8, 12: 9, 13: 10, 14: 11, 15: 12, 16: 13, 17: 14, 18: 15, 19: 16, 20: 17, 21: 18, 22: 19, 23: 20, 24: 21, 25: 22, 26: 23, 27: 24, 28: 25, 29: 26, 30: 27, 31: 28, 32: 29, 33: 30, 37: 31, 38: 32, 39: 33, 40: 34, 41: 35, 42: 36, 43: 37, 44: 38, 45: 39, 46: 40, 47: 41, 48: 42, 49: 43, 50: 44, 51: 45, 55: 46, 56: 47, 57: 48, 58: 49, 59: 50, 60: 51, 61: 52, 62: 53, 63: 54, 67: 55, 68: 56, 69: 57, 70: 58, 71: 59, 72: 60, 73: 61, 74: 62, 75: 63, 76: 64, 77: 65, 78: 66, 79: 67, 80: 68, 81: 69, 85: 70, 86: 71, 87: 72, 88: 73, 89: 74, 90: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 100: 85, 101: 86, 102: 87, 106: 88, 107: 89, 108: 90, 109: 91, 110: 92, 111: 93, 115: 94, 116: 95, 117: 96, 118: 97, 119: 98, 120: 99, 124: 100, 125: 101, 126: 102, 127: 103, 128: 104, 129: 105} [model_handling.py at line 1709]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1822]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1822]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1822]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  +DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1822]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  @@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.409 s -Wrote files for 222 helas calls in 0.708 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.385 s +Wrote files for 222 helas calls in 0.669 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.310 s +ALOHA: aloha creates 5 routines in 0.277 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.296 s +ALOHA: aloha creates 10 routines in 0.268 s VVV1 VVV1 FFV1 @@ -272,6 +272,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.274s -user 0m2.978s -sys 0m0.228s +real 0m3.653s +user 0m3.071s +sys 0m0.363s From 980ccbc03d30a85c6cac137cdcadafdad88e83b6 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 31 Jul 2023 14:50:07 +0200 Subject: [PATCH 402/509] Added Github Artifacts to transfer JSON files --- .github/workflows/mi250x_profiler.yml | 40 +++++++++++++++++++++------ 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index e69f810c4f..f291a9d3ff 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -5,8 +5,8 @@ on: - cron: '00 00 * * *' jobs: - Container Setup: - runs-on: [self-hosted, linux] + Container_Setup: + runs-on: [self-hosted, linux, a100] name: Container Setup steps: - name: Generate runner token @@ -19,27 +19,49 @@ jobs: echo "${{ secrets.SSH_KEY }}" > id_rsa chmod 600 id_rsa ssh -o StrictHostKeyChecking=no -i id_rsa ${{ secrets.MI250X_PROFILING_HOST }} "\ - singularity pull ghcr.io/${{ github.repository_owner }}/github_runner:latest && \ - singularity run -d --name my_container --rm \ + singularity pull ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ + srun --pty --account=${{ secrets.HPC_ACCOUNT }} -p ${{ secrets.HPC_PROJECT }} --time=03:00:00 singularity run -d --name github_runner_mi250x --rm \ -e GITHUB_TOKEN=${{ steps.generate_token.outputs.token }} \ -e REPO_URL=https://github.com/${{ github.repository }} \ - -e RUNNER_NAME=github_runner \ + -e RUNNER_NAME=github_runner_mi250x \ -e GITHUB_RUNNER_TAGS='Linux,x64,mi250x' \ -e RUNNER_URL=https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz \ - ghcr.io/${{ github.repository_owner }}/github_runner:latest" + ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest" HIP_MI250X_Profiling: needs: Container Setup + runs-on: [self-hosted, linux, mi250x] name: HIP MI250X Profiling env: - CUDA_NAME_PREFIX: sycl_AMD-Epyc-7A53_MI250X_gcc-11.3_cuda-12.0.1 + CUDA_NAME_PREFIX: sycl_AMD-Epyc-7A53_MI250X_gcc-11.2.1_cuda-12.0.1 ENABLE_CI_PROFILER: 1 MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} - runs-on: [self-hosted, linux, 
mi250x] steps: - uses: actions/checkout@v2 - name: Runs SYCL performanceProfiler.py script run: cd tools/profiling/; python3 performanceProfiler.py -l 'HIP' -b 'master' + + - name: Uploads workplace_mg4gpu directory as an artifact + uses: actions/upload-artifact@v3 + with: + name: profiling-results + path: tools/profiling/workplace_mg4gpu + + Upload_JSON_files: + needs: HIP_MI250X_Profiling + runs-on: [self-hosted, linux] + name: Upload JSON files to DB + env: + CUDA_NAME_PREFIX: sycl_AMD-Epyc-7A53_MI250X_gcc-11.2.1_cuda-12.0.1 + ENABLE_CI_PROFILER: 1 + MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} + steps: + - uses: actions/checkout@v2 + - name: Download artifact containing profiling data + uses: actions/download-artifact@v3 + with: + name: profiling-results + path: tools/profiling - name: Uploads HIP JSON files to DB - run: cd tools/profiling/; python3 sendData.py --absLayer HIP --profiler 1 --branch master \ No newline at end of file + run: cd tools/profiling; python3 sendData.py --absLayer HIP --profiler 1 --branch master \ No newline at end of file From 5712fa99119f559842f57325deb960ccdfd09a2e Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 31 Jul 2023 15:08:31 +0200 Subject: [PATCH 403/509] Testing on push in workflow --- .github/workflows/mi250x_profiler.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index f291a9d3ff..4da5e21740 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -1,8 +1,8 @@ name: MI250X Performance Profiler on: - schedule: - - cron: '00 00 * * *' + push: + branches: [ gpu_abstraction ] jobs: Container_Setup: From 7bbfadb1a07c92dc51d62ad0bf5138ae6ff66a1c Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 31 Jul 2023 15:12:45 +0200 Subject: [PATCH 404/509] Fixed typo --- .github/workflows/mi250x_profiler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index 4da5e21740..9976052c3d 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -29,7 +29,7 @@ jobs: ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest" HIP_MI250X_Profiling: - needs: Container Setup + needs: Container_Setup runs-on: [self-hosted, linux, mi250x] name: HIP MI250X Profiling env: From e57f4ffa08db203ef7a5ac35e483ac7883be63d8 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 31 Jul 2023 15:15:46 +0200 Subject: [PATCH 405/509] Made runner token not dependent on jq command --- .github/workflows/mi250x_profiler.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index 9976052c3d..88e949a1a7 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -12,7 +12,11 @@ jobs: - name: Generate runner token id: generate_token run: | - TOKEN=$(curl -XPOST -fsSL -H "Authorization: token ${{ secrets.PAT }}" -H "Accept: application/vnd.github.v3+json" "https://api.github.com/repos/${{ github.repository }}/actions/runners/registration-token" | jq -r '.token') + TOKEN=$(curl -XPOST -fsSL \ + -H "Authorization: token ${{ secrets.PAT }}" \ + -H "Accept: application/vnd.github.v3+json" \ + "https://api.github.com/repos/${{ github.repository }}/actions/runners/registration-token" \ + | grep -o '"token": *"[^"]*"' | cut -d '"' -f 4) echo "::set-output 
name=token::$TOKEN" - name: SSH and run Docker container run: | From 6629f6a2e90646a8946245abb4ac83d2e5eb7e69 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 31 Jul 2023 15:37:34 +0200 Subject: [PATCH 406/509] Added correct prefix to singularity pull command --- .github/workflows/mi250x_profiler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index 88e949a1a7..90c78aa208 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -23,7 +23,7 @@ jobs: echo "${{ secrets.SSH_KEY }}" > id_rsa chmod 600 id_rsa ssh -o StrictHostKeyChecking=no -i id_rsa ${{ secrets.MI250X_PROFILING_HOST }} "\ - singularity pull ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ + singularity pull oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ srun --pty --account=${{ secrets.HPC_ACCOUNT }} -p ${{ secrets.HPC_PROJECT }} --time=03:00:00 singularity run -d --name github_runner_mi250x --rm \ -e GITHUB_TOKEN=${{ steps.generate_token.outputs.token }} \ -e REPO_URL=https://github.com/${{ github.repository }} \ From e51378bb2a1a109eaa7236f89b39c61dec103eee Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 31 Jul 2023 16:18:57 +0200 Subject: [PATCH 407/509] Fixed srun command to start container --- .github/workflows/mi250x_profiler.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index 90c78aa208..24fdb9d8b2 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -18,19 +18,19 @@ jobs: "https://api.github.com/repos/${{ github.repository }}/actions/runners/registration-token" \ | grep -o '"token": *"[^"]*"' | cut -d '"' -f 4) echo "::set-output name=token::$TOKEN" + echo "{token}={$TOKEN}" >> $GITHUB_OUTPUT - name: SSH and run Docker container run: | echo "${{ secrets.SSH_KEY }}" > id_rsa chmod 600 id_rsa ssh -o StrictHostKeyChecking=no -i id_rsa ${{ secrets.MI250X_PROFILING_HOST }} "\ singularity pull oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ - srun --pty --account=${{ secrets.HPC_ACCOUNT }} -p ${{ secrets.HPC_PROJECT }} --time=03:00:00 singularity run -d --name github_runner_mi250x --rm \ - -e GITHUB_TOKEN=${{ steps.generate_token.outputs.token }} \ - -e REPO_URL=https://github.com/${{ github.repository }} \ - -e RUNNER_NAME=github_runner_mi250x \ - -e GITHUB_RUNNER_TAGS='Linux,x64,mi250x' \ - -e RUNNER_URL=https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz \ - ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest" + srun --pty --account=${{ secrets.HPC_ACCOUNT }} -p ${{ secrets.HPC_PROJECT }} --time=03:00:00 singularity exec github_runner_mi250x.sif \ + --env=GITHUB_TOKEN=${{ steps.generate_token.outputs.token }} \ + --env=REPO_URL=https://github.com/${{ github.repository }} \ + --env=RUNNER_NAME=github_runner_mi250x \ + --env=GITHUB_RUNNER_TAGS='Linux,x64,mi250x' \ + --env=RUNNER_URL=https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz HIP_MI250X_Profiling: needs: Container_Setup From 2e53e3a98508b9e21d504989bdd2d903d412d27d Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 31 Jul 2023 16:20:25 +0200 Subject: [PATCH 408/509] Fixed syntax error in srun command --- .github/workflows/mi250x_profiler.yml | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index 24fdb9d8b2..8fe9a9c8c0 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -30,7 +30,7 @@ jobs: --env=REPO_URL=https://github.com/${{ github.repository }} \ --env=RUNNER_NAME=github_runner_mi250x \ --env=GITHUB_RUNNER_TAGS='Linux,x64,mi250x' \ - --env=RUNNER_URL=https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz + --env=RUNNER_URL=https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz" HIP_MI250X_Profiling: needs: Container_Setup From 0b284031bcfdba3b633856a874a05b477aca2309 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 31 Jul 2023 16:28:03 +0200 Subject: [PATCH 409/509] Added a --force to the singularity pull command if the image ever gets updated --- .github/workflows/mi250x_profiler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index 8fe9a9c8c0..0342e89af1 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -24,7 +24,7 @@ jobs: echo "${{ secrets.SSH_KEY }}" > id_rsa chmod 600 id_rsa ssh -o StrictHostKeyChecking=no -i id_rsa ${{ secrets.MI250X_PROFILING_HOST }} "\ - singularity pull oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ + singularity pull --force oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ srun --pty --account=${{ secrets.HPC_ACCOUNT }} -p ${{ secrets.HPC_PROJECT }} --time=03:00:00 singularity exec github_runner_mi250x.sif \ --env=GITHUB_TOKEN=${{ steps.generate_token.outputs.token }} \ --env=REPO_URL=https://github.com/${{ github.repository }} \ From 1bd4880a0a098bebae3be545589d32fd96cada8f Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 31 Jul 2023 16:30:48 +0200 Subject: [PATCH 410/509] Added tag in container execution --- .github/workflows/mi250x_profiler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index 0342e89af1..61d45a6843 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -25,7 +25,7 @@ jobs: chmod 600 id_rsa ssh -o StrictHostKeyChecking=no -i id_rsa ${{ secrets.MI250X_PROFILING_HOST }} "\ singularity pull --force oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ - srun --pty --account=${{ secrets.HPC_ACCOUNT }} -p ${{ secrets.HPC_PROJECT }} --time=03:00:00 singularity exec github_runner_mi250x.sif \ + srun --pty --account=${{ secrets.HPC_ACCOUNT }} -p ${{ secrets.HPC_PROJECT }} --time=03:00:00 singularity exec github_runner_mi250x_latest.sif \ --env=GITHUB_TOKEN=${{ steps.generate_token.outputs.token }} \ --env=REPO_URL=https://github.com/${{ github.repository }} \ --env=RUNNER_NAME=github_runner_mi250x \ From 75afd3ab20ba6ba47ee897576edc442b263324c4 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 31 Jul 2023 16:34:21 +0200 Subject: [PATCH 411/509] Changed the syntax of the srun command --- .github/workflows/mi250x_profiler.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index 61d45a6843..d6a781716a 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ 
-26,11 +26,11 @@ jobs: ssh -o StrictHostKeyChecking=no -i id_rsa ${{ secrets.MI250X_PROFILING_HOST }} "\ singularity pull --force oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ srun --pty --account=${{ secrets.HPC_ACCOUNT }} -p ${{ secrets.HPC_PROJECT }} --time=03:00:00 singularity exec github_runner_mi250x_latest.sif \ - --env=GITHUB_TOKEN=${{ steps.generate_token.outputs.token }} \ - --env=REPO_URL=https://github.com/${{ github.repository }} \ - --env=RUNNER_NAME=github_runner_mi250x \ - --env=GITHUB_RUNNER_TAGS='Linux,x64,mi250x' \ - --env=RUNNER_URL=https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz" + --env GITHUB_TOKEN='${{ steps.generate_token.outputs.token }}' \ + --env REPO_URL='https://github.com/${{ github.repository }}' \ + --env RUNNER_NAME='github_runner_mi250x' \ + --env GITHUB_RUNNER_TAGS='Linux,x64,mi250x' \ + --env RUNNER_URL='https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz'" HIP_MI250X_Profiling: needs: Container_Setup From 9a2368642c4f87c096aa534c919dc31e6a9d4c4f Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 31 Jul 2023 16:39:58 +0200 Subject: [PATCH 412/509] Fixed srun syntax again --- .github/workflows/mi250x_profiler.yml | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index d6a781716a..e95958c4bb 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -11,6 +11,11 @@ jobs: steps: - name: Generate runner token id: generate_token + env: + SSH_PRIVATE_KEY: ${{ secrets.SSH_KEY }} + MI250X_PROFILING_HOST: ${{ secrets.MI250X_PROFILING_HOST }} + HPC_ACCOUNT: ${{ secrets.HPC_ACCOUNT }} + HPC_PROJECT: ${{ secrets.HPC_PROJECT }} run: | TOKEN=$(curl -XPOST -fsSL \ -H "Authorization: token ${{ secrets.PAT }}" \ @@ -21,16 +26,16 @@ jobs: echo "{token}={$TOKEN}" >> $GITHUB_OUTPUT - name: SSH and run Docker container run: | - echo "${{ secrets.SSH_KEY }}" > id_rsa + echo "$SSH_PRIVATE_KEY" > id_rsa chmod 600 id_rsa - ssh -o StrictHostKeyChecking=no -i id_rsa ${{ secrets.MI250X_PROFILING_HOST }} "\ + ssh -o StrictHostKeyChecking=no -i id_rsa $MI250X_PROFILING_HOST "\ singularity pull --force oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ - srun --pty --account=${{ secrets.HPC_ACCOUNT }} -p ${{ secrets.HPC_PROJECT }} --time=03:00:00 singularity exec github_runner_mi250x_latest.sif \ - --env GITHUB_TOKEN='${{ steps.generate_token.outputs.token }}' \ - --env REPO_URL='https://github.com/${{ github.repository }}' \ - --env RUNNER_NAME='github_runner_mi250x' \ - --env GITHUB_RUNNER_TAGS='Linux,x64,mi250x' \ - --env RUNNER_URL='https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz'" + srun --account=$HPC_ACCOUNT -p $HPC_PROJECT --time=03:00:00 singularity exec github_runner_mi250x_latest.sif \ + --env GITHUB_TOKEN='${{ steps.generate_token.outputs.token }}', \ + REPO_URL='https://github.com/${{ github.repository }}', \ + RUNNER_NAME='github_runner_mi250x', \ + GITHUB_RUNNER_TAGS='Linux,x64,mi250x', \ + RUNNER_URL='https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz'" HIP_MI250X_Profiling: needs: Container_Setup From d46d4f882d33e76fcd5e4b5b8b6fcb2f91be4720 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 31 Jul 2023 16:43:30 +0200 Subject: [PATCH 413/509] Fixed 
deprecated github feature and fixed env variables --- .github/workflows/mi250x_profiler.yml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index e95958c4bb..c2cee20fd2 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -11,20 +11,19 @@ jobs: steps: - name: Generate runner token id: generate_token - env: - SSH_PRIVATE_KEY: ${{ secrets.SSH_KEY }} - MI250X_PROFILING_HOST: ${{ secrets.MI250X_PROFILING_HOST }} - HPC_ACCOUNT: ${{ secrets.HPC_ACCOUNT }} - HPC_PROJECT: ${{ secrets.HPC_PROJECT }} run: | TOKEN=$(curl -XPOST -fsSL \ -H "Authorization: token ${{ secrets.PAT }}" \ -H "Accept: application/vnd.github.v3+json" \ "https://api.github.com/repos/${{ github.repository }}/actions/runners/registration-token" \ | grep -o '"token": *"[^"]*"' | cut -d '"' -f 4) - echo "::set-output name=token::$TOKEN" - echo "{token}={$TOKEN}" >> $GITHUB_OUTPUT + echo "token=$TOKEN" >> $GITHUB_OUTPUT - name: SSH and run Docker container + env: + SSH_PRIVATE_KEY: ${{ secrets.SSH_KEY }} + MI250X_PROFILING_HOST: ${{ secrets.MI250X_PROFILING_HOST }} + HPC_ACCOUNT: ${{ secrets.HPC_ACCOUNT }} + HPC_PROJECT: ${{ secrets.HPC_PROJECT }} run: | echo "$SSH_PRIVATE_KEY" > id_rsa chmod 600 id_rsa From 61b3ad8bc8872b3fe85777aef27ee7522453dc89 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 31 Jul 2023 16:46:20 +0200 Subject: [PATCH 414/509] Put sif file to be executed last in srun command --- .github/workflows/mi250x_profiler.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index c2cee20fd2..590b0f83e1 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -29,12 +29,13 @@ jobs: chmod 600 id_rsa ssh -o StrictHostKeyChecking=no -i id_rsa $MI250X_PROFILING_HOST "\ singularity pull --force oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ - srun --account=$HPC_ACCOUNT -p $HPC_PROJECT --time=03:00:00 singularity exec github_runner_mi250x_latest.sif \ + srun --account=$HPC_ACCOUNT -p $HPC_PROJECT --time=03:00:00 singularity exec \ --env GITHUB_TOKEN='${{ steps.generate_token.outputs.token }}', \ REPO_URL='https://github.com/${{ github.repository }}', \ RUNNER_NAME='github_runner_mi250x', \ GITHUB_RUNNER_TAGS='Linux,x64,mi250x', \ - RUNNER_URL='https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz'" + RUNNER_URL='https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz' + github_runner_mi250x_latest.sif" HIP_MI250X_Profiling: needs: Container_Setup From 395bae3c41fc87bea9c5f1ada3068f499d09d07d Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 31 Jul 2023 16:47:39 +0200 Subject: [PATCH 415/509] Forgot backslash after next last line in srun --- .github/workflows/mi250x_profiler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index 590b0f83e1..f0db63b62e 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -34,7 +34,7 @@ jobs: REPO_URL='https://github.com/${{ github.repository }}', \ RUNNER_NAME='github_runner_mi250x', \ GITHUB_RUNNER_TAGS='Linux,x64,mi250x', \ - RUNNER_URL='https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz' 
+ RUNNER_URL='https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz' \ github_runner_mi250x_latest.sif" HIP_MI250X_Profiling: From d7cf2dee293731625d5031009283f814a8c95093 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 31 Jul 2023 16:49:13 +0200 Subject: [PATCH 416/509] Changed srun syntax again --- .github/workflows/mi250x_profiler.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index f0db63b62e..654742c58c 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -30,11 +30,7 @@ jobs: ssh -o StrictHostKeyChecking=no -i id_rsa $MI250X_PROFILING_HOST "\ singularity pull --force oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ srun --account=$HPC_ACCOUNT -p $HPC_PROJECT --time=03:00:00 singularity exec \ - --env GITHUB_TOKEN='${{ steps.generate_token.outputs.token }}', \ - REPO_URL='https://github.com/${{ github.repository }}', \ - RUNNER_NAME='github_runner_mi250x', \ - GITHUB_RUNNER_TAGS='Linux,x64,mi250x', \ - RUNNER_URL='https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz' \ + --env GITHUB_TOKEN='${{ steps.generate_token.outputs.token }}',REPO_URL='https://github.com/${{ github.repository }}',RUNNER_NAME='github_runner_mi250x',GITHUB_RUNNER_TAGS='Linux,x64,mi250x',RUNNER_URL='https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz' \ github_runner_mi250x_latest.sif" HIP_MI250X_Profiling: From c7fe97499d2b1a7ca0274e7f6fcebbea233e4d5b Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 31 Jul 2023 16:59:28 +0200 Subject: [PATCH 417/509] Added the environment variables to the workflow instead of the command --- .github/workflows/mi250x_profiler.yml | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index 654742c58c..a5116a6b3a 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -20,18 +20,21 @@ jobs: echo "token=$TOKEN" >> $GITHUB_OUTPUT - name: SSH and run Docker container env: - SSH_PRIVATE_KEY: ${{ secrets.SSH_KEY }} + SSH_PRIVATE_KEY: ${{ secrets.SSH_KEY }} MI250X_PROFILING_HOST: ${{ secrets.MI250X_PROFILING_HOST }} - HPC_ACCOUNT: ${{ secrets.HPC_ACCOUNT }} - HPC_PROJECT: ${{ secrets.HPC_PROJECT }} + HPC_ACCOUNT: ${{ secrets.HPC_ACCOUNT }} + HPC_PROJECT: ${{ secrets.HPC_PROJECT }} + GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }} + REPO_URL: https://github.com/${{ github.repository }} + RUNNER_NAME: github_runner_mi250x + GITHUB_RUNNER_TAGS: Linux,x64,mi250x + RUNNER_URL: https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz run: | echo "$SSH_PRIVATE_KEY" > id_rsa chmod 600 id_rsa ssh -o StrictHostKeyChecking=no -i id_rsa $MI250X_PROFILING_HOST "\ singularity pull --force oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ -
srun --account=$HPC_ACCOUNT -p $HPC_PROJECT --time=03:00:00 singularity exec github_runner_mi250x_latest.sif" HIP_MI250X_Profiling: needs: Container_Setup From ec7d843e8e9102a4396ed3dfe005202452e9aea2 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 31 Jul 2023 17:19:09 +0200 Subject: [PATCH 418/509] Switched back to run command instead of exec --- .github/workflows/mi250x_profiler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index a5116a6b3a..4a4d4c3fbc 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -34,7 +34,7 @@ jobs: chmod 600 id_rsa ssh -o StrictHostKeyChecking=no -i id_rsa $MI250X_PROFILING_HOST "\ singularity pull --force oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ - srun --account=$HPC_ACCOUNT -p $HPC_PROJECT --time=03:00:00 singularity exec github_runner_mi250x_latest.sif" + srun --account=$HPC_ACCOUNT -p $HPC_PROJECT --time=03:00:00 singularity run github_runner_mi250x_latest.sif" HIP_MI250X_Profiling: needs: Container_Setup From 95bcc39df17c780ded715c819d8716700d02f2a0 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 31 Jul 2023 17:24:17 +0200 Subject: [PATCH 419/509] Added back env variables in the command because they were not picked up when run --- .github/workflows/mi250x_profiler.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index 4a4d4c3fbc..c025ab61be 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -24,17 +24,18 @@ jobs: MI250X_PROFILING_HOST: ${{ secrets.MI250X_PROFILING_HOST }} HPC_ACCOUNT: ${{ secrets.HPC_ACCOUNT }} HPC_PROJECT: ${{ secrets.HPC_PROJECT }} - GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }} - REPO_URL: https://github.com/${{ github.repository }} - RUNNER_NAME: github_runner_mi250x - GITHUB_RUNNER_TAGS: Linux,x64,mi250x - RUNNER_URL: https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz run: | echo "$SSH_PRIVATE_KEY" > id_rsa chmod 600 id_rsa ssh -o StrictHostKeyChecking=no -i id_rsa $MI250X_PROFILING_HOST "\ singularity pull --force oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ - srun --account=$HPC_ACCOUNT -p $HPC_PROJECT --time=03:00:00 singularity run github_runner_mi250x_latest.sif" + srun --account=$HPC_ACCOUNT -p $HPC_PROJECT --time=03:00:00 singularity run \ + --env GITHUB_TOKEN=${{ steps.generate_token.outputs.token }} \ + --env REPO_URL=https://github.com/${{ github.repository }} \ + --env RUNNER_NAME=github_runner_mi250x \ + --env GITHUB_RUNNER_TAGS='Linux,x64,mi250x' \ + --env RUNNER_URL=https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz \ + github_runner_mi250x_latest.sif" HIP_MI250X_Profiling: needs: Container_Setup From 4eb1393d9d1ae64436d94f344d2765eede290ff4 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 31 Jul 2023 17:49:15 +0200 Subject: [PATCH 420/509] Reworked workflow file for mi250x profiling --- .github/workflows/mi250x_profiler.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index c025ab61be..22e1855fa2 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -5,9 +5,9 @@ on: branches: [ 
gpu_abstraction ]

 jobs:
-  Container_Setup:
+  Container_Setup_&_Execution:
     runs-on: [self-hosted, linux, a100]
-    name: Container Setup
+    name: Container Setup & Execution
     steps:
     - name: Generate runner token
       id: generate_token
@@ -24,6 +24,7 @@ jobs:
           MI250X_PROFILING_HOST: ${{ secrets.MI250X_PROFILING_HOST }}
           HPC_ACCOUNT: ${{ secrets.HPC_ACCOUNT }}
           HPC_PROJECT: ${{ secrets.HPC_PROJECT }}
+        continue-on-error: true
         run: |
           echo "$SSH_PRIVATE_KEY" > id_rsa
           chmod 600 id_rsa
@@ -38,7 +39,6 @@ jobs:
           github_runner_mi250x_latest.sif"

   HIP_MI250X_Profiling:
-    needs: Container_Setup
     runs-on: [self-hosted, linux, mi250x]
     name: HIP MI250X Profiling
     env:

From e70adb5241f9700dfb4225bc9325ec04f482fa0b Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Mon, 31 Jul 2023 17:50:00 +0200
Subject: [PATCH 421/509] Fixed syntax error

---
 .github/workflows/mi250x_profiler.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml
index 22e1855fa2..835d58ac97 100644
--- a/.github/workflows/mi250x_profiler.yml
+++ b/.github/workflows/mi250x_profiler.yml
@@ -5,9 +5,9 @@ on:
     branches: [ gpu_abstraction ]

 jobs:
-  Container_Setup_&_Execution:
+  Container_Setup_and_Execution:
     runs-on: [self-hosted, linux, a100]
-    name: Container Setup & Execution
+    name: Container Setup and Execution
     steps:
     - name: Generate runner token
       id: generate_token

From 4305bbc20dedbeebc6ed0b36de4539d453306d65 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Mon, 31 Jul 2023 17:59:54 +0200
Subject: [PATCH 422/509] Fixed silly error in performanceProfiler script and
 changed CUDAVersion to GPUVersion in sendData script

---
 tools/profiling/performanceProfiler.py | 2 +-
 tools/profiling/sendData.py            | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py
index 8ee234309c..5f8a9ae867 100644
--- a/tools/profiling/performanceProfiler.py
+++ b/tools/profiling/performanceProfiler.py
@@ -45,7 +45,7 @@
 # How many runs in total the program made
 count = 0

-for process in args.p:
+for process in pyArgs.p:
     for TPB in THREADS_PER_BLOCK:
         for BPG in BLOCKS_PER_GRID:
             if TPB * BPG > DOUBLE_PRECISION_CONSTANT:
diff --git a/tools/profiling/sendData.py b/tools/profiling/sendData.py
index 207d7d2e7f..7d7151bf29 100644
--- a/tools/profiling/sendData.py
+++ b/tools/profiling/sendData.py
@@ -123,11 +123,11 @@

         GCCVersion = fileNameParts[6].split('-')[1]

-        CUDAVersion = fileNameParts[7].split('-')[1]
+        GPUVersion = fileNameParts[7].split('-')[1]

         gridsize = data[0]["NumThreadsPerBlock"] * data[0]["NumBlocksPerGrid"]

-        DBdata = f'{physicsProcess},CPU={CPU},GPU={GPU},AbstractionLayer={args.absLayer},GCCVersion={GCCVersion},CUDAVersion={CUDAVersion},NumThreadsPerBlock={data[0]["NumThreadsPerBlock"]},NumBlocksPerGrid={data[0]["NumBlocksPerGrid"]},NumIterations={data[0]["NumIterations"]} Gridsize={gridsize}'
+        DBdata = f'{physicsProcess},CPU={CPU},GPU={GPU},AbstractionLayer={args.absLayer},GCCVersion={GCCVersion},GPUVersion={GPUVersion},NumThreadsPerBlock={data[0]["NumThreadsPerBlock"]},NumBlocksPerGrid={data[0]["NumBlocksPerGrid"]},NumIterations={data[0]["NumIterations"]} Gridsize={gridsize}'

         for field in FIELDS:
             value = float(re.findall(r'[\d.]+',data[0][field])[0])

From eeb1d9ba5228c016eb28d172b250049aaa8b2da5 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Mon, 31 Jul 2023 18:02:04 +0200
Subject: [PATCH 423/509] Added elif to HIP building as well in
 performanceProfiler

---
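Note: with this change the HIP abstraction layer reuses the CUDA build
script, so both backends go through one entry point. A minimal sketch of an
invocation (flag names -n/-b/-r are taken from the diff below; the process
name and values are illustrative and other flags are omitted):

    # A HIP profiling run now dispatches to buildCUDAProcess.sh; whether
    # hipcc or nvcc is used is resolved later via HIP_HOME/CUDA_HOME.
    python3 performanceProfiler.py -l 'HIP' -b 'master'
    # ...which ends up invoking something like:
    ./buildCUDAProcess.sh -n gg_tt -b 1024 -r master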
 tools/profiling/performanceProfiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/profiling/performanceProfiler.py b/tools/profiling/performanceProfiler.py
index 5f8a9ae867..f1b5fcf33f 100755
--- a/tools/profiling/performanceProfiler.py
+++ b/tools/profiling/performanceProfiler.py
@@ -65,7 +65,7 @@
                         "-b", str(BPG),
                         "-r", str(pyArgs.b).lower()]

-            elif pyArgs.l.upper() == 'CUDA':
+            elif pyArgs.l.upper() == 'CUDA' or pyArgs.l.upper() == 'HIP':

                 bashArgs = ["./buildCUDAProcess.sh",
                             "-n", process,

From b41b33cdeda5f9f4947c4d11dc6a65d78373c71e Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Mon, 31 Jul 2023 18:05:25 +0200
Subject: [PATCH 424/509] Added correct modules for compilation

---
 .github/workflows/mi250x_profiler.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml
index 835d58ac97..071ef27a9a 100644
--- a/.github/workflows/mi250x_profiler.yml
+++ b/.github/workflows/mi250x_profiler.yml
@@ -30,6 +30,8 @@ jobs:
           chmod 600 id_rsa
           ssh -o StrictHostKeyChecking=no -i id_rsa $MI250X_PROFILING_HOST "\
           singularity pull --force oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \
+          module load gcc/11.2.0 && \
+          export FC=`which gfortran` && \
           srun --account=$HPC_ACCOUNT -p $HPC_PROJECT --time=03:00:00 singularity run \
           --env GITHUB_TOKEN=${{ steps.generate_token.outputs.token }} \

From c4532c61186574153bfb1054a3d3dfa3a9e833ac Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Mon, 31 Jul 2023 18:06:22 +0200
Subject: [PATCH 425/509] Corrected name prefix for DB JSON file uploads

---
 .github/workflows/mi250x_profiler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml
index 071ef27a9a..ce490b1f77 100644
--- a/.github/workflows/mi250x_profiler.yml
+++ b/.github/workflows/mi250x_profiler.yml
@@ -44,7 +44,7 @@ jobs:
     runs-on: [self-hosted, linux, mi250x]
     name: HIP MI250X Profiling
     env:
-      CUDA_NAME_PREFIX: sycl_AMD-Epyc-7A53_MI250X_gcc-11.2.1_cuda-12.0.1
+      CUDA_NAME_PREFIX: sycl_AMD-Epyc-7A53_MI250X_gcc-11.2.0_rocm-5.2.3
       ENABLE_CI_PROFILER: 1
       MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }}
     steps:

From f1371e94dfc75d0e8e54c467075502333e1b7f6e Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Mon, 31 Jul 2023 18:12:46 +0200
Subject: [PATCH 426/509] Change directory to one with more space

---
 .github/workflows/mi250x_profiler.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml
index ce490b1f77..40b72d4ae6 100644
--- a/.github/workflows/mi250x_profiler.yml
+++ b/.github/workflows/mi250x_profiler.yml
@@ -22,13 +22,15 @@ jobs:
         env:
           SSH_PRIVATE_KEY: ${{ secrets.SSH_KEY }}
           MI250X_PROFILING_HOST: ${{ secrets.MI250X_PROFILING_HOST }}
+          MI250X_PROFILING_USER: ${{ secrets.MI250X_PROFILING_USER }}
           HPC_ACCOUNT: ${{ secrets.HPC_ACCOUNT }}
           HPC_PROJECT: ${{ secrets.HPC_PROJECT }}
         continue-on-error: true
         run: |
           echo "$SSH_PRIVATE_KEY" > id_rsa
           chmod 600 id_rsa
-          ssh -o StrictHostKeyChecking=no -i id_rsa $MI250X_PROFILING_HOST "\
+          ssh -o StrictHostKeyChecking=no -i id_rsa $MI250X_PROFILING_USER@$MI250X_PROFILING_HOST "\
+          cd /scratch/$HPC_ACCOUNT/$MI250X_PROFILING_USER/ && \
           singularity pull --force oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \
           module load gcc/11.2.0 && \
          export FC=`which gfortran` && 
\ From d1a8d31d0443ec900fd66e9617df4f722aac4196 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Tue, 1 Aug 2023 11:34:33 +0200 Subject: [PATCH 427/509] Added singularity cachedir env variable and exported correct CXX variable --- .github/workflows/mi250x_profiler.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index 40b72d4ae6..e4dbcbc39f 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -25,6 +25,7 @@ jobs: MI250X_PROFILING_USER: ${{ secrets.MI250X_PROFILING_USER }} HPC_ACCOUNT: ${{ secrets.HPC_ACCOUNT }} HPC_PROJECT: ${{ secrets.HPC_PROJECT }} + SINGULARITY_CACHEDIR: /scratch/$HPC_ACCOUNT/$MI250X_PROFILING_USER/ continue-on-error: true run: | echo "$SSH_PRIVATE_KEY" > id_rsa @@ -33,6 +34,7 @@ jobs: cd /scratch/$HPC_ACCOUNT/$MI250X_PROFILING_USER/ && \ singularity pull --force oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ module load gcc/11.2.0 && \ + export CXX=`which g++` && \ export FC=`which gfortran` && \ srun --account=$HPC_ACCOUNT -p $HPC_PROJECT --time=03:00:00 singularity run \ --env GITHUB_TOKEN=${{ steps.generate_token.outputs.token }} \ From 217b752b01021dec9885ae74886daa8dfd19d2e3 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Tue, 1 Aug 2023 14:37:54 +0200 Subject: [PATCH 428/509] Moved the slurm command to the profiler --- .github/workflows/mi250x_profiler.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index e4dbcbc39f..b7e10a1086 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -33,9 +33,6 @@ jobs: ssh -o StrictHostKeyChecking=no -i id_rsa $MI250X_PROFILING_USER@$MI250X_PROFILING_HOST "\ cd /scratch/$HPC_ACCOUNT/$MI250X_PROFILING_USER/ && \ singularity pull --force oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ - module load gcc/11.2.0 && \ - export CXX=`which g++` && \ - export FC=`which gfortran` && \ srun --account=$HPC_ACCOUNT -p $HPC_PROJECT --time=03:00:00 singularity run \ --env GITHUB_TOKEN=${{ steps.generate_token.outputs.token }} \ --env REPO_URL=https://github.com/${{ github.repository }} \ @@ -55,6 +52,9 @@ jobs: - uses: actions/checkout@v2 - name: Runs SYCL performanceProfiler.py script run: cd tools/profiling/; + module load gcc/11.2.0 + export CXX=`which g++` + export FC=`which gfortran` python3 performanceProfiler.py -l 'HIP' -b 'master' - name: Uploads workplace_mg4gpu directory as an artifact From 412b30d288c3068de1ee9e7c837f81567b523b32 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Tue, 1 Aug 2023 14:39:01 +0200 Subject: [PATCH 429/509] Added linebreaks for correct execution --- .github/workflows/mi250x_profiler.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index b7e10a1086..7c01113df3 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -52,9 +52,9 @@ jobs: - uses: actions/checkout@v2 - name: Runs SYCL performanceProfiler.py script run: cd tools/profiling/; - module load gcc/11.2.0 - export CXX=`which g++` - export FC=`which gfortran` + module load gcc/11.2.0; + export CXX=`which g++`; + export FC=`which gfortran`; python3 performanceProfiler.py -l 'HIP' -b 'master' - name: Uploads workplace_mg4gpu directory as an artifact From 
1aff7a426cec8ff2f9fb4d00ef3e839577231a20 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Tue, 1 Aug 2023 16:02:12 +0200
Subject: [PATCH 430/509] Remove unnecessary things in workflow

---
 .github/workflows/mi250x_profiler.yml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml
index 7c01113df3..7940d0677c 100644
--- a/.github/workflows/mi250x_profiler.yml
+++ b/.github/workflows/mi250x_profiler.yml
@@ -52,9 +52,6 @@ jobs:
     - uses: actions/checkout@v2
     - name: Runs SYCL performanceProfiler.py script
       run: cd tools/profiling/;
-        module load gcc/11.2.0;
-        export CXX=`which g++`;
-        export FC=`which gfortran`;
         python3 performanceProfiler.py -l 'HIP' -b 'master'

     - name: Uploads workplace_mg4gpu directory as an artifact

From 9b5ba925e54af6ed448877af571e9bdb222dd69a Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Tue, 1 Aug 2023 16:17:16 +0200
Subject: [PATCH 431/509] Updated name prefix for profiling

---
 .github/workflows/mi250x_profiler.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml
index 7940d0677c..5d3006fab6 100644
--- a/.github/workflows/mi250x_profiler.yml
+++ b/.github/workflows/mi250x_profiler.yml
@@ -45,7 +45,7 @@ jobs:
     runs-on: [self-hosted, linux, mi250x]
     name: HIP MI250X Profiling
     env:
-      CUDA_NAME_PREFIX: sycl_AMD-Epyc-7A53_MI250X_gcc-11.2.0_rocm-5.2.3
+      CUDA_NAME_PREFIX: sycl_AMD-Epyc-7A53_MI250X_gcc-11.2.1_rocm-5.2.3
       ENABLE_CI_PROFILER: 1
       MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }}
     steps:
@@ -65,7 +65,7 @@ jobs:
     runs-on: [self-hosted, linux]
     name: Upload JSON files to DB
     env:
-      CUDA_NAME_PREFIX: sycl_AMD-Epyc-7A53_MI250X_gcc-11.2.1_cuda-12.0.1
+      CUDA_NAME_PREFIX: sycl_AMD-Epyc-7A53_MI250X_gcc-11.2.1_rocm-5.2.3
       ENABLE_CI_PROFILER: 1
       MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }}
     steps:

From a97f9ff0d794925c60a279aae1e689a1d18f6145 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Tue, 1 Aug 2023 16:30:11 +0200
Subject: [PATCH 432/509] Added singularity tmpdir variable

---
 .github/workflows/mi250x_profiler.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml
index 5d3006fab6..29967dcd04 100644
--- a/.github/workflows/mi250x_profiler.yml
+++ b/.github/workflows/mi250x_profiler.yml
@@ -26,6 +26,7 @@ jobs:
           HPC_ACCOUNT: ${{ secrets.HPC_ACCOUNT }}
           HPC_PROJECT: ${{ secrets.HPC_PROJECT }}
           SINGULARITY_CACHEDIR: /scratch/$HPC_ACCOUNT/$MI250X_PROFILING_USER/
+          SINGULARITY_TMPDIR: /scratch/$HPC_ACCOUNT/$MI250X_PROFILING_USER/
         continue-on-error: true
         run: |
           echo "$SSH_PRIVATE_KEY" > id_rsa

From 7fe49f709a6a813fb7d48ce6de174cdfbe0b9ee2 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Thu, 3 Aug 2023 15:40:07 +0200
Subject: [PATCH 433/509] Added HIP include in mgOnGpuConfig.h for correct
 compilation on HIP

---
 epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h  | 2 --
 epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h            | 1 +
 epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h   | 2 --
 epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h             | 1 +
 epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h    | 2 --
 epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h              | 1 +
 epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h     | 2 --
 epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h               | 1 +
 epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h | 2 --
 epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h           | 1 +
epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h | 2 -- epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h | 1 + epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h | 2 -- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h | 1 + epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h | 2 -- epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h | 1 + epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h | 2 -- epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h | 1 + epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h | 2 -- epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h | 1 + epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h | 2 -- epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h | 1 + 22 files changed, 11 insertions(+), 22 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 390766116b..f9b5faee54 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h index d4e37d19b3..f3b96742ae 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index 390766116b..f9b5faee54 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h 
b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h index d4e37d19b3..f3b96742ae 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h index 390766116b..f9b5faee54 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h index 390766116b..f9b5faee54 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h index d4e37d19b3..f3b96742ae 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif diff --git 
a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h index d4e37d19b3..f3b96742ae 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h index 390766116b..f9b5faee54 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h index d4e37d19b3..f3b96742ae 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h index 390766116b..f9b5faee54 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip 
+#include "hip/hip_runtime.h"
 #else
 #undef MGONGPUCPP_GPUIMPL
 #endif

From 61d944fe8067f5be51ab3b0eb8061312c1703fff Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Thu, 3 Aug 2023 15:45:04 +0200
Subject: [PATCH 434/509] Changed references to CUDA in HIP profiler

---
 .github/workflows/mi250x_profiler.yml |  6 +++---
 tools/profiling/sendData.py           | 14 ++++++++++++++
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml
index 29967dcd04..0ea5ac7b5f 100644
--- a/.github/workflows/mi250x_profiler.yml
+++ b/.github/workflows/mi250x_profiler.yml
@@ -46,12 +46,12 @@ jobs:
     runs-on: [self-hosted, linux, mi250x]
     name: HIP MI250X Profiling
     env:
-      CUDA_NAME_PREFIX: sycl_AMD-Epyc-7A53_MI250X_gcc-11.2.1_rocm-5.2.3
+      HIP_NAME_PREFIX: hip_AMD-Epyc-7A53_MI250X_gcc-11.2.1_rocm-5.2.3
       ENABLE_CI_PROFILER: 1
       MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }}
     steps:
     - uses: actions/checkout@v2
-    - name: Runs SYCL performanceProfiler.py script
+    - name: Runs HIP performanceProfiler.py script
       run: cd tools/profiling/;
         python3 performanceProfiler.py -l 'HIP' -b 'master'
@@ -66,7 +66,7 @@ jobs:
     runs-on: [self-hosted, linux]
     name: Upload JSON files to DB
     env:
-      CUDA_NAME_PREFIX: sycl_AMD-Epyc-7A53_MI250X_gcc-11.2.1_rocm-5.2.3
+      HIP_NAME_PREFIX: hip_AMD-Epyc-7A53_MI250X_gcc-11.2.1_rocm-5.2.3
       ENABLE_CI_PROFILER: 1
       MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }}
     steps:
diff --git a/tools/profiling/sendData.py b/tools/profiling/sendData.py
index 7d7151bf29..7d7da7d9b7 100644
--- a/tools/profiling/sendData.py
+++ b/tools/profiling/sendData.py
@@ -85,6 +85,20 @@
         logging.error('CUDA report path does not exist!')
         sys.exit(1)

+    elif args.absLayer.upper() == "HIP":
+
+        hipNamePrefix = os.getenv('HIP_NAME_PREFIX')
+
+        if hipNamePrefix is None:
+            logging.error('HIP name prefix has not been set!')
+            sys.exit(1)
+
+        reportfolder = "workspace_mg4gpu/" + datetime.datetime.now().strftime('%y-%m-%d') + '_' + hipNamePrefix + '_' + args.branch
+
+        if not os.path.exists(reportfolder):
+            logging.error('HIP report path does not exist!')
+            sys.exit(1)
+
     else:
         logging.error('No abstraction layer that is supported has been selected!')
         sys.exit(1)

From 240b52c72221fa11a8e56bbe35f196568f5619b6 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Mon, 7 Aug 2023 10:48:46 +0200
Subject: [PATCH 435/509] Added back ROCm option for testing in srun command

---
 .github/workflows/mi250x_profiler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml
index 0ea5ac7b5f..12fd6627a2 100644
--- a/.github/workflows/mi250x_profiler.yml
+++ b/.github/workflows/mi250x_profiler.yml
@@ -34,7 +34,7 @@ jobs:
           ssh -o StrictHostKeyChecking=no -i id_rsa $MI250X_PROFILING_USER@$MI250X_PROFILING_HOST "\
           cd /scratch/$HPC_ACCOUNT/$MI250X_PROFILING_USER/ && \
           singularity pull --force oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \
-          srun --account=$HPC_ACCOUNT -p $HPC_PROJECT --time=03:00:00 singularity run \
+          srun --account=$HPC_ACCOUNT -p $HPC_PROJECT --rocm --time=03:00:00 singularity run \
           --env GITHUB_TOKEN=${{ steps.generate_token.outputs.token }} \
           --env REPO_URL=https://github.com/${{ github.repository }} \
           --env RUNNER_NAME=github_runner_mi250x \

From 4bf8d9807f45f330948f5ddf0c2b49c17982d0be Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Mon, 7 Aug 2023 10:50:13 +0200
Subject: [PATCH 436/509] Changed position of ROCm option

---
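Note: the churn in this and the surrounding commits comes down to --rocm
being an option of `singularity run`, not of `srun`, so it only takes effect
once it follows the singularity subcommand (as the next commit arranges). A
minimal sketch, with account/partition values as placeholders:

    # wrong: srun does not understand --rocm
    srun --rocm --account=ACCT -p PART --time=03:00:00 singularity run image.sif
    # right: --rocm binds the host ROCm stack into the container
    srun --account=ACCT -p PART --time=03:00:00 singularity run --rocm image.sif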
.github/workflows/mi250x_profiler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index 12fd6627a2..62c5eaa5cc 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -34,7 +34,7 @@ jobs: ssh -o StrictHostKeyChecking=no -i id_rsa $MI250X_PROFILING_USER@$MI250X_PROFILING_HOST "\ cd /scratch/$HPC_ACCOUNT/$MI250X_PROFILING_USER/ && \ singularity pull --force oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ - srun --account=$HPC_ACCOUNT -p $HPC_PROJECT --rocm --time=03:00:00 singularity run \ + srun --rocm --account=$HPC_ACCOUNT -p $HPC_PROJECT --time=03:00:00 singularity run \ --env GITHUB_TOKEN=${{ steps.generate_token.outputs.token }} \ --env REPO_URL=https://github.com/${{ github.repository }} \ --env RUNNER_NAME=github_runner_mi250x \ From c41e2d2f4b4e9792cd988cc1d3fb0b9229bb3fed Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Mon, 7 Aug 2023 10:56:59 +0200 Subject: [PATCH 437/509] Changed position of ROCm option again --- .github/workflows/mi250x_profiler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index 62c5eaa5cc..65e73529cb 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -34,7 +34,7 @@ jobs: ssh -o StrictHostKeyChecking=no -i id_rsa $MI250X_PROFILING_USER@$MI250X_PROFILING_HOST "\ cd /scratch/$HPC_ACCOUNT/$MI250X_PROFILING_USER/ && \ singularity pull --force oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ - srun --rocm --account=$HPC_ACCOUNT -p $HPC_PROJECT --time=03:00:00 singularity run \ + srun --account=$HPC_ACCOUNT -p $HPC_PROJECT --time=03:00:00 singularity run --rocm \ --env GITHUB_TOKEN=${{ steps.generate_token.outputs.token }} \ --env REPO_URL=https://github.com/${{ github.repository }} \ --env RUNNER_NAME=github_runner_mi250x \ From 2861007cdc78f1aa34a8cc83baebabbf6cfafcfc Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Tue, 8 Aug 2023 09:44:51 +0200 Subject: [PATCH 438/509] Added gpu option to srun command in MI250x profiling --- .github/workflows/mi250x_profiler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml index 65e73529cb..2a408e203c 100644 --- a/.github/workflows/mi250x_profiler.yml +++ b/.github/workflows/mi250x_profiler.yml @@ -34,7 +34,7 @@ jobs: ssh -o StrictHostKeyChecking=no -i id_rsa $MI250X_PROFILING_USER@$MI250X_PROFILING_HOST "\ cd /scratch/$HPC_ACCOUNT/$MI250X_PROFILING_USER/ && \ singularity pull --force oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ - srun --account=$HPC_ACCOUNT -p $HPC_PROJECT --time=03:00:00 singularity run --rocm \ + srun --account=$HPC_ACCOUNT -p $HPC_PROJECT --gpus=1 --time=03:00:00 singularity run --rocm \ --env GITHUB_TOKEN=${{ steps.generate_token.outputs.token }} \ --env REPO_URL=https://github.com/${{ github.repository }} \ --env RUNNER_NAME=github_runner_mi250x \ From c5eb9390ccb569fd12e3fae4a2e415bed7dfa002 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Tue, 8 Aug 2023 16:05:33 +0200 Subject: [PATCH 439/509] Fixed bug in buildCUDAProcess script --- tools/profiling/buildCUDAProcess.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh index 
b7e7947446..cdbc30225c 100755 --- a/tools/profiling/buildCUDAProcess.sh +++ b/tools/profiling/buildCUDAProcess.sh @@ -57,7 +57,7 @@ fi # CUDA # Check if CUDA_HOME has not been set from the outside, usefull in CI/CD if [[ -z "$CUDA_HOME" ]]; then - export COMPILER_PATH="`which nvcc 2>/dev/null`" && while [ -L "$compiler" ]; do compiler=`readlink "$compiler"`; done && echo "$$compiler" + export COMPILER_PATH="`which nvcc 2>/dev/null`" && while [ -L "$compiler" ]; do compiler=`readlink "$compiler"`; done if [[ "$COMPILER_PATH" ]]; then export CUDA_HOME=$(dirname $(dirname $COMPILER_PATH)) @@ -68,7 +68,7 @@ fi # HIP # Check if HIP_HOME has not been set from the outside, usefull in CI/CD if [[ -z "$HIP_HOME" ]]; then - export COMPILER_PATH="`which hipcc 2>/dev/null`" && while [ -L "$compiler" ]; do compiler=`readlink "$compiler"`; done && echo "$$compiler" + export COMPILER_PATH="`which hipcc 2>/dev/null`" && while [ -L "$compiler" ]; do compiler=`readlink "$compiler"`; done if [[ "$COMPILER_PATH" ]]; then export HIP_HOME=$(dirname $(dirname $COMPILER_PATH)) From 4bfc6faf1261969f1f3b166ecb981f94c0e2401f Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Tue, 8 Aug 2023 16:37:37 +0200 Subject: [PATCH 440/509] Improved HIP_HOME/CUDA_HOME assignment --- tools/profiling/buildCUDAProcess.sh | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh index cdbc30225c..2dcc7f2ef4 100755 --- a/tools/profiling/buildCUDAProcess.sh +++ b/tools/profiling/buildCUDAProcess.sh @@ -57,22 +57,30 @@ fi # CUDA # Check if CUDA_HOME has not been set from the outside, usefull in CI/CD if [[ -z "$CUDA_HOME" ]]; then - export COMPILER_PATH="`which nvcc 2>/dev/null`" && while [ -L "$compiler" ]; do compiler=`readlink "$compiler"`; done + COMPILER=$(which nvcc 2>/dev/null) + while [ -L "$COMPILER" ]; do + COMPILER=$(readlink "$COMPILER") + done + export COMPILER_PATH=$COMPILER if [[ "$COMPILER_PATH" ]]; then - export CUDA_HOME=$(dirname $(dirname $COMPILER_PATH)) - export PATH=$CUDA_HOME${PATH:+:${PATH}} + export CUDA_HOME=$(dirname $(dirname $COMPILER_PATH)) + export PATH=$CUDA_HOME${PATH:+:${PATH}} fi fi # HIP # Check if HIP_HOME has not been set from the outside, usefull in CI/CD if [[ -z "$HIP_HOME" ]]; then - export COMPILER_PATH="`which hipcc 2>/dev/null`" && while [ -L "$compiler" ]; do compiler=`readlink "$compiler"`; done + COMPILER=$(which hipcc 2>/dev/null) + while [ -L "$COMPILER" ]; do + COMPILER=$(readlink "$COMPILER") + done + export COMPILER_PATH=$COMPILER if [[ "$COMPILER_PATH" ]]; then - export HIP_HOME=$(dirname $(dirname $COMPILER_PATH)) - export PATH=$HIP_HOME${PATH:+:${PATH}} + export HIP_HOME=$(dirname $(dirname $COMPILER_PATH)) + export PATH=$HIP_HOME${PATH:+:${PATH}} fi fi From a0658ecad09d15843388efacc2d1576757496ace Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 9 Aug 2023 09:54:16 +0200 Subject: [PATCH 441/509] Removed CVMFS for CI tests --- .github/workflows/c-cpp.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 5170e057d6..8337a41337 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -65,8 +65,7 @@ jobs: GPU: runs-on: [self-hosted, linux, a100] env: - CUDA_HOME: /usr/local/cuda/ - FC: gfortran + CUDA_HOME: /usr/local/cuda-12.0/ REQUIRE_CUDA: 1 strategy: matrix: @@ -80,11 +79,11 @@ jobs: steps: - uses: actions/checkout@v2 - name: make info - run: source 
/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; + run: FC=`which gfortran`; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info - name: make - run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; + run: FC=`which gfortran`; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} - name: make check - run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; + run: FC=`which gfortran`; make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check From 441bd79d41221f2f83254151a5a7791175c57e16 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 9 Aug 2023 10:09:24 +0200 Subject: [PATCH 442/509] Revert changes to CUDA_HOME --- .github/workflows/c-cpp.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 8337a41337..fe26071c99 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -65,7 +65,7 @@ jobs: GPU: runs-on: [self-hosted, linux, a100] env: - CUDA_HOME: /usr/local/cuda-12.0/ + CUDA_HOME: /usr/local/cuda/ REQUIRE_CUDA: 1 strategy: matrix: From e89ae42dfc2a05089c1efcc628e98e82fae4d5d1 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 9 Aug 2023 14:17:35 +0200 Subject: [PATCH 443/509] [CODEGEN] Regenerated all .sa and .mad processes with new HIP compilation --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 50 +-- epochX/cudacpp/ee_mumu.mad/COPYRIGHT | 1 + .../ee_mumu.mad/Cards/me5_configuration.txt | 4 +- .../ee_mumu.mad/Source/DHELAS/aloha_file.inc | 2 +- .../cudacpp/ee_mumu.mad/SubProcesses/Bridge.h | 32 +- .../ee_mumu.mad/SubProcesses/BridgeKernels.cc | 9 +- .../ee_mumu.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../ee_mumu.mad/SubProcesses/CudaRuntime.h | 85 ----- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../SubProcesses/EventStatistics.h | 4 +- .../ee_mumu.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../ee_mumu.mad/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../ee_mumu.mad/SubProcesses/MemoryBuffers.h | 64 ++-- .../SubProcesses/P1_epem_mupmum/CPPProcess.cc | 64 ++-- .../SubProcesses/P1_epem_mupmum/CPPProcess.h | 10 +- .../SubProcesses/P1_epem_mupmum/CudaRuntime.h | 1 - .../SubProcesses/P1_epem_mupmum/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../ee_mumu.mad/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../ee_mumu.mad/SubProcesses/fbridge.cc | 16 +- .../ee_mumu.mad/SubProcesses/fsampler.cc | 8 +- .../ee_mumu.mad/SubProcesses/runTest.cc | 12 +- .../ee_mumu.mad/SubProcesses/testmisc.cc | 8 +- .../ee_mumu.mad/SubProcesses/testxxx.cc | 14 +- .../bin/internal/ufomodel/py3_model.pkl | Bin 
42822 -> 42813 bytes epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/ee_mumu.mad/src/Parameters_sm.cc | 4 +- .../cudacpp/ee_mumu.mad/src/Parameters_sm.h | 10 +- epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk | 14 +- .../cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 61 ++-- .../cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h | 26 +- .../cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/ee_mumu.mad/src/rambo.h | 8 +- .../CODEGEN_cudacpp_ee_mumu_log.txt | 36 +-- epochX/cudacpp/ee_mumu.sa/COPYRIGHT | 1 + .../cudacpp/ee_mumu.sa/SubProcesses/Bridge.h | 32 +- .../ee_mumu.sa/SubProcesses/BridgeKernels.cc | 9 +- .../ee_mumu.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../ee_mumu.sa/SubProcesses/CudaRuntime.h | 85 ----- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../ee_mumu.sa/SubProcesses/EventStatistics.h | 4 +- .../ee_mumu.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../ee_mumu.sa/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../ee_mumu.sa/SubProcesses/MemoryBuffers.h | 64 ++-- .../P1_Sigma_sm_epem_mupmum/CPPProcess.cc | 64 ++-- .../P1_Sigma_sm_epem_mupmum/CPPProcess.h | 10 +- .../P1_Sigma_sm_epem_mupmum/CudaRuntime.h | 1 - .../P1_Sigma_sm_epem_mupmum/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../ee_mumu.sa/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../ee_mumu.sa/SubProcesses/fbridge.cc | 16 +- .../ee_mumu.sa/SubProcesses/fsampler.cc | 8 +- .../ee_mumu.sa/SubProcesses/runTest.cc | 12 +- .../ee_mumu.sa/SubProcesses/testmisc.cc | 8 +- .../ee_mumu.sa/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h | 4 +- .../cudacpp/ee_mumu.sa/src/Parameters_sm.cc | 4 +- epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h | 10 +- epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk | 14 +- epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h | 71 +++-- .../cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h | 26 +- .../cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/ee_mumu.sa/src/rambo.h | 8 +- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 56 ++-- epochX/cudacpp/gg_tt.mad/COPYRIGHT | 1 + .../gg_tt.mad/Cards/me5_configuration.txt | 4 +- .../gg_tt.mad/Source/DHELAS/aloha_file.inc | 2 +- .../cudacpp/gg_tt.mad/SubProcesses/Bridge.h | 32 +- .../gg_tt.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gg_tt.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_tt.mad/SubProcesses/CudaRuntime.h | 85 ----- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- 
.../gg_tt.mad/SubProcesses/EventStatistics.h | 4 +- .../gg_tt.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_tt.mad/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_tt.mad/SubProcesses/MemoryBuffers.h | 64 ++-- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 62 ++-- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttx/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttx/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../cudacpp/gg_tt.mad/SubProcesses/fbridge.cc | 16 +- .../gg_tt.mad/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gg_tt.mad/SubProcesses/runTest.cc | 12 +- .../gg_tt.mad/SubProcesses/testmisc.cc | 8 +- .../cudacpp/gg_tt.mad/SubProcesses/testxxx.cc | 14 +- .../bin/internal/ufomodel/py3_model.pkl | Bin 42822 -> 42813 bytes epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h | 10 +- epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk | 14 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h | 71 +++-- epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h | 26 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h | 12 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_tt.mad/src/rambo.h | 8 +- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 44 +-- epochX/cudacpp/gg_tt.sa/COPYRIGHT | 1 + epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h | 32 +- .../gg_tt.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gg_tt.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_tt.sa/SubProcesses/CudaRuntime.h | 85 ----- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gg_tt.sa/SubProcesses/EventStatistics.h | 4 +- .../gg_tt.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_tt.sa/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_tt.sa/SubProcesses/MemoryBuffers.h | 64 ++-- .../P1_Sigma_sm_gg_ttx/CPPProcess.cc | 62 ++-- .../P1_Sigma_sm_gg_ttx/CPPProcess.h | 10 +- .../P1_Sigma_sm_gg_ttx/CudaRuntime.h | 1 - .../P1_Sigma_sm_gg_ttx/check_sa.cc | 
103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../cudacpp/gg_tt.sa/SubProcesses/fbridge.cc | 16 +- .../cudacpp/gg_tt.sa/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gg_tt.sa/SubProcesses/runTest.cc | 12 +- .../cudacpp/gg_tt.sa/SubProcesses/testmisc.cc | 8 +- .../cudacpp/gg_tt.sa/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h | 10 +- epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk | 14 +- epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h | 71 +++-- epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h | 26 +- epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h | 12 +- epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_tt.sa/src/rambo.h | 8 +- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 62 ++-- epochX/cudacpp/gg_ttg.mad/COPYRIGHT | 1 + .../gg_ttg.mad/Cards/me5_configuration.txt | 4 +- .../gg_ttg.mad/Source/DHELAS/aloha_file.inc | 2 +- .../cudacpp/gg_ttg.mad/SubProcesses/Bridge.h | 32 +- .../gg_ttg.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttg.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttg.mad/SubProcesses/CudaRuntime.h | 85 ----- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gg_ttg.mad/SubProcesses/EventStatistics.h | 4 +- .../gg_ttg.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_ttg.mad/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_ttg.mad/SubProcesses/MemoryBuffers.h | 64 ++-- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 62 ++-- .../SubProcesses/P1_gg_ttxg/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttxg/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttxg/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttg.mad/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../gg_ttg.mad/SubProcesses/fbridge.cc | 16 +- .../gg_ttg.mad/SubProcesses/fsampler.cc | 8 +- .../gg_ttg.mad/SubProcesses/runTest.cc | 12 +- .../gg_ttg.mad/SubProcesses/testmisc.cc | 8 +- .../gg_ttg.mad/SubProcesses/testxxx.cc | 14 +- .../bin/internal/ufomodel/py3_model.pkl | Bin 42822 -> 42813 bytes epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_ttg.mad/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h | 10 +- epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk | 14 +- epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h | 71 +++-- .../cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h | 26 +- .../cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h | 18 +- 
epochX/cudacpp/gg_ttg.mad/src/rambo.h | 8 +- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 52 +-- epochX/cudacpp/gg_ttg.sa/COPYRIGHT | 1 + .../cudacpp/gg_ttg.sa/SubProcesses/Bridge.h | 32 +- .../gg_ttg.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttg.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttg.sa/SubProcesses/CudaRuntime.h | 85 ----- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gg_ttg.sa/SubProcesses/EventStatistics.h | 4 +- .../gg_ttg.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_ttg.sa/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_ttg.sa/SubProcesses/MemoryBuffers.h | 64 ++-- .../P1_Sigma_sm_gg_ttxg/CPPProcess.cc | 62 ++-- .../P1_Sigma_sm_gg_ttxg/CPPProcess.h | 10 +- .../P1_Sigma_sm_gg_ttxg/CudaRuntime.h | 1 - .../P1_Sigma_sm_gg_ttxg/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc | 16 +- .../gg_ttg.sa/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gg_ttg.sa/SubProcesses/runTest.cc | 12 +- .../gg_ttg.sa/SubProcesses/testmisc.cc | 8 +- .../cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h | 10 +- epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk | 14 +- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h | 71 +++-- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h | 26 +- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h | 12 +- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_ttg.sa/src/rambo.h | 8 +- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 66 ++-- .../gg_ttgg.mad/Cards/me5_configuration.txt | 4 +- .../gg_ttgg.mad/Source/DHELAS/aloha_file.inc | 2 +- .../bin/internal/ufomodel/py3_model.pkl | Bin 42822 -> 42813 bytes .../CODEGEN_cudacpp_gg_ttgg_log.txt | 56 ++-- epochX/cudacpp/gg_ttgg.sa/COPYRIGHT | 1 + .../cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h | 32 +- .../gg_ttgg.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttgg.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttgg.sa/SubProcesses/CudaRuntime.h | 85 ----- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gg_ttgg.sa/SubProcesses/EventStatistics.h | 4 +- .../gg_ttgg.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- 
.../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_ttgg.sa/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_ttgg.sa/SubProcesses/MemoryBuffers.h | 64 ++-- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.cc | 62 ++-- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.h | 10 +- .../P1_Sigma_sm_gg_ttxgg/CudaRuntime.h | 1 - .../P1_Sigma_sm_gg_ttxgg/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttgg.sa/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../gg_ttgg.sa/SubProcesses/fbridge.cc | 16 +- .../gg_ttgg.sa/SubProcesses/fsampler.cc | 8 +- .../gg_ttgg.sa/SubProcesses/runTest.cc | 12 +- .../gg_ttgg.sa/SubProcesses/testmisc.cc | 8 +- .../gg_ttgg.sa/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_ttgg.sa/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h | 10 +- epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk | 14 +- epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h | 71 +++-- .../cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h | 26 +- .../cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_ttgg.sa/src/rambo.h | 8 +- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 72 ++--- epochX/cudacpp/gg_ttggg.mad/COPYRIGHT | 1 + .../gg_ttggg.mad/Cards/me5_configuration.txt | 4 +- .../gg_ttggg.mad/Source/DHELAS/aloha_file.inc | 2 +- .../gg_ttggg.mad/SubProcesses/Bridge.h | 32 +- .../SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttggg.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttggg.mad/SubProcesses/CudaRuntime.h | 85 ----- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../SubProcesses/EventStatistics.h | 4 +- .../gg_ttggg.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_ttggg.mad/SubProcesses/MemoryBuffers.h | 64 ++-- .../SubProcesses/P1_gg_ttxggg/CPPProcess.cc | 62 ++-- .../SubProcesses/P1_gg_ttxggg/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttxggg/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttxggg/check_sa.cc | 103 +++--- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttggg.mad/SubProcesses/cudacpp.mk | 297 +++++++++++------- .../gg_ttggg.mad/SubProcesses/fbridge.cc | 16 +- 
 .../gg_ttggg.mad/SubProcesses/fsampler.cc | 8 +-
 .../gg_ttggg.mad/SubProcesses/runTest.cc | 12 +-
 .../gg_ttggg.mad/SubProcesses/testmisc.cc | 8 +-
 .../gg_ttggg.mad/SubProcesses/testxxx.cc | 14 +-
 .../bin/internal/ufomodel/py3_model.pkl | Bin 42822 -> 42813 bytes
 epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h | 4 +-
 .../cudacpp/gg_ttggg.mad/src/Parameters_sm.cc | 4 +-
 .../cudacpp/gg_ttggg.mad/src/Parameters_sm.h | 10 +-
 .../cudacpp/gg_ttggg.mad/src/cudacpp_src.mk | 14 +-
 .../cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h | 71 +++--
 .../cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h | 26 +-
 .../cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h | 12 +-
 .../cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h | 18 +-
 epochX/cudacpp/gg_ttggg.mad/src/rambo.h | 8 +-
 .../CODEGEN_cudacpp_gg_ttggg_log.txt | 60 ++--
 epochX/cudacpp/gg_ttggg.sa/COPYRIGHT | 1 +
 .../cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h | 32 +-
 .../gg_ttggg.sa/SubProcesses/BridgeKernels.cc | 9 +-
 .../gg_ttggg.sa/SubProcesses/BridgeKernels.h | 8 +-
 .../SubProcesses/CommonRandomNumberKernel.cc | 5 +-
 .../SubProcesses/CrossSectionKernels.cc | 7 +-
 .../SubProcesses/CrossSectionKernels.h | 6 +-
 .../gg_ttggg.sa/SubProcesses/CudaRuntime.h | 85 -----
 .../SubProcesses/CurandRandomNumberKernel.cc | 12 +-
 .../SubProcesses/EventStatistics.h | 4 +-
 .../gg_ttggg.sa/SubProcesses/MadgraphTest.h | 8 +-
 .../SubProcesses/MatrixElementKernels.cc | 26 +-
 .../SubProcesses/MatrixElementKernels.h | 8 +-
 .../SubProcesses/MemoryAccessAmplitudes.h | 2 +-
 .../SubProcesses/MemoryAccessCouplings.h | 2 +-
 .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +-
 .../SubProcesses/MemoryAccessDenominators.h | 2 +-
 .../gg_ttggg.sa/SubProcesses/MemoryAccessGs.h | 2 +-
 .../SubProcesses/MemoryAccessHelpers.h | 4 +-
 .../SubProcesses/MemoryAccessMatrixElements.h | 2 +-
 .../SubProcesses/MemoryAccessMomenta.h | 6 +-
 .../SubProcesses/MemoryAccessNumerators.h | 2 +-
 .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +-
 .../SubProcesses/MemoryAccessVectors.h | 4 +-
 .../SubProcesses/MemoryAccessWavefunctions.h | 2 +-
 .../gg_ttggg.sa/SubProcesses/MemoryBuffers.h | 64 ++--
 .../P1_Sigma_sm_gg_ttxggg/CPPProcess.cc | 62 ++--
 .../P1_Sigma_sm_gg_ttxggg/CPPProcess.h | 10 +-
 .../P1_Sigma_sm_gg_ttxggg/CudaRuntime.h | 1 -
 .../P1_Sigma_sm_gg_ttxggg/check_sa.cc | 103 +++---
 .../SubProcesses/RamboSamplingKernels.cc | 20 +-
 .../SubProcesses/RamboSamplingKernels.h | 6 +-
 .../SubProcesses/RandomNumberKernels.h | 6 +-
 .../gg_ttggg.sa/SubProcesses/cudacpp.mk | 297 +++++++++++-------
 .../gg_ttggg.sa/SubProcesses/fbridge.cc | 16 +-
 .../gg_ttggg.sa/SubProcesses/fsampler.cc | 8 +-
 .../gg_ttggg.sa/SubProcesses/runTest.cc | 12 +-
 .../gg_ttggg.sa/SubProcesses/testmisc.cc | 8 +-
 .../gg_ttggg.sa/SubProcesses/testxxx.cc | 14 +-
 epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h | 4 +-
 .../cudacpp/gg_ttggg.sa/src/Parameters_sm.cc | 4 +-
 .../cudacpp/gg_ttggg.sa/src/Parameters_sm.h | 10 +-
 epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk | 14 +-
 .../cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h | 71 +++--
 .../cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h | 26 +-
 .../cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h | 12 +-
 .../cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h | 18 +-
 epochX/cudacpp/gg_ttggg.sa/src/rambo.h | 8 +-
 457 files changed, 5099 insertions(+), 4830 deletions(-)
 delete mode 100644 epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h
 delete mode 120000 epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h
 delete mode 100644 epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h
 delete mode 120000 epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h
 delete mode 100644 epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h
 delete mode 120000 epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h
 delete mode 100644 epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h
 delete mode 120000 epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h
 delete mode 100644 epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h
 delete mode 120000 epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h
 delete mode 100644 epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h
 delete mode 120000 epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h
 delete mode 100644 epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h
 delete mode 120000 epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h
 delete mode 100644 epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h
 delete mode 120000 epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h
 delete mode 100644 epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h
 delete mode 120000 epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h

diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
index 759bbd80d8..50f3467303 100644
--- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
+++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
@@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 No valid web browser found. Please set in ./input/mg5_configuration.txt
-import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu.mg
+import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -62,7 +62,7 @@ generate e+ e- > mu+ mu-
 No model currently active, so we import the Standard Model
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.005298137664794922 
+DEBUG: model prefixing takes 0.0032465457916259766 
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -154,7 +154,7 @@ INFO: Checking for minimal orders which gives processes.
 INFO: Please specify coupling orders to bypass this step.
 INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1
 INFO: Process has 2 diagrams
-1 processes with 2 diagrams generated in 0.004 s
+1 processes with 2 diagrams generated in 0.003 s
 Total: 1 processes with 2 diagrams
 output madevent CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
 Load PLUGIN.CUDACPP_SA_OUTPUT
@@ -165,10 +165,10 @@ Load PLUGIN.CUDACPP_SA_OUTPUT
 INFO: initialize a new directory: CODEGEN_mad_ee_mumu
 INFO: remove old information in CODEGEN_mad_ee_mumu
 DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157] 
-WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu 
-INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu
-WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/Cards 
-WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/SubProcesses 
+WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu 
+INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu
+WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/Cards 
+WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/SubProcesses 
 INFO: Organizing processes into subprocess groups
 INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1
 INFO: Processing color information for process: e+ e- > mu+ mu- @1
@@ -176,7 +176,7 @@ INFO: Creating files in directory P1_epem_mupmum
 DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039] 
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040] 
 DEBUG: proc_id =  1 [model_handling.py at line 1045] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6174] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6174] 
 INFO: Creating files in directory .
 DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297] 
 DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299] 
@@ -191,7 +191,7 @@ FileWriter fo
 DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2] [model_handling.py at line 1163] 
 DEBUG: multi_channel =  {1: [0], 2: [1]} [model_handling.py at line 1169] 
 DEBUG: multi_channel_map =  {1: [0], 2: [1]} [model_handling.py at line 1654] 
-DEBUG: diag_to_config =  {1: 1, 2: 2} [model_handling.py at line 1709] 
+DEBUG: diag_to_config =  {1: 1, 2: 2} [model_handling.py at line 1711] 
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343] 
 DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352] 
@@ -201,7 +201,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419] 
 DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430] 
 DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441] 
-DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_epem_mupmum.txt [model_handling.py at line 1335] 
+DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_epem_mupmum.txt [model_handling.py at line 1335] 
 DEBUG: proc_id =  1 [export_cpp.py at line 710] 
 DEBUG: config_map =  [1, 2] [export_cpp.py at line 711] 
 DEBUG: subproc_number =  0 [export_cpp.py at line 712] 
@@ -209,20 +209,20 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 WARNING: vector code for lepton pdf not implemented. We removed the option to run dressed lepton 
 INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1
 INFO: Finding symmetric diagrams for subprocess group epem_mupmum
-Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s
-Wrote files for 8 helas calls in 0.092 s
+Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s
+Wrote files for 8 helas calls in 0.202 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates FFV2 routines
 ALOHA: aloha creates FFV4 routines
-ALOHA: aloha creates 3 routines in 0.167 s
+ALOHA: aloha creates 3 routines in 0.139 s
 DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates FFV2 routines
 ALOHA: aloha creates FFV4 routines
 ALOHA: aloha creates FFV2_4 routines
-ALOHA: aloha creates 7 routines in 0.217 s
+ALOHA: aloha creates 7 routines in 0.172 s
 FFV1
 FFV1
 FFV2
@@ -231,29 +231,29 @@ ALOHA: aloha creates 7 routines in 0.217 s
 FFV4
 FFV2_4
 FFV2_4
-FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h
-INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/src/.
+FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h
+INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/src/.
 super_write_set_parameters_onlyfixMajorana (hardcoded=False)
 DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729] 
 super_write_set_parameters_onlyfixMajorana (hardcoded=True)
 DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729] 
 DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729] 
-FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h
-FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc
+FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h
+FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
-INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/src/.
+INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/src/. and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/src/.
 The option zerowidth_tchannel is modified [True] but will not be written in the configuration files.
 If you want to make this value the default for future session, you can run 'save options --all'
-save configuration file to /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt
+save configuration file to /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt
 INFO: Use Fortran compiler gfortran
 INFO: Use c++ compiler g++
 INFO: Generate web pages
-Output to directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu done.
+Output to directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu done.
 Type "launch" to generate events from this process, or see
-/data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/README
+/afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/README
 Run "open index.html" to see more information about this process.
 quit
-real 0m2.155s
-user 0m1.726s
-sys 0m0.362s
+real 0m4.503s
+user 0m1.166s
+sys 0m1.531s
diff --git a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT
index a134b5fef9..84a883fbb0 100644
--- a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT
+++ b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT
@@ -15,6 +15,7 @@ The full development team currently includes the following authors :
 Stephan Hageboeck (CERN)
 Olivier Mattelaer (Universite Catholique de Louvain, original author)
 Stefan Roiser (CERN, original author)
+ Joergen Teig (CERN)
 Andrea Valassi (CERN, original author)
 Zenny Wettersten (CERN)
 See https://github.com/madgraph5/madgraph4gpu for more details.
 For the full
diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt
index 00d7c6f8d6..9e9ed9d752 100644
--- a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt
+++ b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt
@@ -234,7 +234,7 @@
 # pineappl = pineappl
-#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo
+#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo
 # MG5 MAIN DIRECTORY
-#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo
+#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo
diff --git a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc
index 738db319fd..e58e08d7bd 100644
--- a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc
+++ b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc
@@ -1 +1 @@
-ALOHARoutine = FFV1_0.o FFV4_3.o FFV1P0_3.o FFV2_0.o FFV4_0.o FFV2_3.o
+ALOHARoutine = FFV1_0.o FFV1P0_3.o FFV2_0.o FFV2_3.o FFV4_0.o FFV4_3.o
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h
index d7e629cacd..f37c972b24 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 #ifndef BRIDGE_H
 #define BRIDGE_H 1
@@ -22,7 +22,7 @@
 #include
 #include
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -82,7 +82,7 @@ namespace mg5amcCpu
 Bridge& operator=( const Bridge& ) = delete;
 Bridge& operator=( Bridge&& ) = delete;
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 /**
 * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads
 * (this is needed for BridgeKernel tests rather than for actual production use in Fortran)
@@ -149,7 +149,7 @@
 unsigned int m_nevt; // number of events
 int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 int m_gputhreads; // number of gpu threads (default set from number of events, can be modified)
 int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified)
 DeviceBuffer m_devMomentaF;
@@ -186,12 +186,12 @@
 // Forward declare transposition methods
 //
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 template
 __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
 template
 void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );
@@ -208,7 +208,7 @@
 Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F )
 : m_nevt( nevtF )
 , m_nGoodHel( -1 )
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 , m_gputhreads( 256 ) // default number of gpu threads
 , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads
 , m_devMomentaF( m_nevt )
@@ -232,7 +232,7 @@
 {
 if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" );
 if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) )
 throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) );
 while( m_nevt != m_gpublocks * m_gputhreads )
@@ -250,11 +250,11 @@
 std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
 CPPProcess process( /*verbose=*/false );
 m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
 process.initProc( "../../Cards/param_card.dat" );
 }
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 template
 void Bridge::set_gpugrid( const int gpublocks, const int gputhreads )
 {
@@ -268,7 +268,7 @@
 }
 #endif
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 template
 void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta,
 const FORTRANFPTYPE* gs,
@@ -283,14 +283,14 @@
 constexpr int neppM = MemoryAccessMomenta::neppM;
 if constexpr( neppM == 1 && std::is_same_v )
 {
-      checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
 }
 else
 {
-      checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
 const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
 //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
-      dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+      gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
 }
 if constexpr( std::is_same_v )
 {
@@ -333,7 +333,7 @@
 }
 #endif
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 template
 void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta,
 const FORTRANFPTYPE* gs,
@@ -388,7 +388,7 @@
 // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
 //
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 template
 __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
 {
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc
index d58066c9c1..eaf4037a24 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc
@@ -1,17 +1,18 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h index ffe3b84d53..176338151a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
 /// This uses an implementation of TestDriverBase to run a madgraph workflow,
@@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME )
 }
 }
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
 #endif /* MADGRAPHTEST_H_ */
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc
index 30257195b6..d6d6c4f179 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc
@@ -1,12 +1,12 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 #include "MatrixElementKernels.h"
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation
 #include "MemoryAccessMomenta.h"
 #include "MemoryBuffers.h"
@@ -14,7 +14,7 @@
 //============================================================================
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu
 {
@@ -143,7 +143,7 @@ namespace mg5amcCpu
 //============================================================================
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
@@ -202,13 +202,13 @@ namespace mg5amcGpu
 PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
 DeviceBufferHelicityMask devIsGoodHel( ncomb );
 // ... 0d1. Compute good helicity mask on the device
-    computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
+    checkGpu( gpuPeekAtLastError() );
 // ... 0d2. Copy back good helicity mask to the host
 copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
 // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -219,19 +219,19 @@ namespace mg5amcGpu
 void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
 {
-    computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
 constexpr unsigned int sharedMemSize = 0;
 #else
 constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
-    checkCuda( cudaDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() );
+    checkGpu( gpuDeviceSynchronize() );
 }
 //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h
index 23e84757a2..72bd8f195b 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 #ifndef MATRIXELEMENTKERNELS_H
 #define MATRIXELEMENTKERNELS_H 1
@@ -10,7 +10,7 @@
 #include "MemoryBuffers.h"
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -81,7 +81,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating matrix element calculations on a CPU host
 class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents
 {
@@ -130,7 +130,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 // A class encapsulating matrix element calculations on a GPU device
 class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
 {
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h
index 573b3bbbc9..ffb76e93de 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h
@@ -15,7 +15,7 @@
 #define MGONGPU_TRIVIAL_AMPLITUDES 1
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h
index 35a3af42e0..3afdf3e554 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h
@@ -15,7 +15,7 @@
 #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h
index dc0d93afff..ffcdf4dbef 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h
@@ -14,7 +14,7 @@
 //#include "MemoryAccessHelpers.h"
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h
index 3bce635718..66f2d32a6b 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h
@@ -10,7 +10,7 @@
 #include "MemoryAccessGs.h"
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h
index 31311aa375..4c726b30f3 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h
@@ -13,7 +13,7 @@
 #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h
index c82a6c7635..db73e4e064 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 #ifndef MemoryAccessHelpers_H
 #define MemoryAccessHelpers_H 1
@@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper
 }
 else
 {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
 //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x );
 return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h
index f32e6fea5b..3741011971 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h
@@ -13,7 +13,7 @@
 #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h
index 29266de32c..3be229d392 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 #ifndef MemoryAccessMomenta_H
 #define MemoryAccessMomenta_H 1
@@ -13,7 +13,7 @@
 #include "MemoryAccessVectors.h"
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -30,7 +30,7 @@ namespace mg5amcCpu
 // Number of Events Per Page in the momenta AOSOA memory buffer layout
 // (these are all best kept as a compile-time constants: see issue #23)
-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
 // -----------------------------------------------------------------------------------------------
 // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline
 // --- This is relevant to ensure coalesced access to momenta in global memory
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h
index b152183b28..18991f4fa6 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h
@@ -10,7 +10,7 @@
 #include "MemoryAccessGs.h"
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h
index e2988d39f3..40cb089135 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 #ifndef MemoryAccessRandomNumbers_H
 #define MemoryAccessRandomNumbers_H 1
@@ -11,7 +11,7 @@
 #include "CPPProcess.h"
 #include "MemoryAccessHelpers.h"
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using mg5amcGpu::CPPProcess;
 #else
 using mg5amcCpu::CPPProcess;
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h
index e9b197368e..08faccff0f 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 #ifndef MemoryAccessVectors_H
 #define MemoryAccessVectors_H 1
@@ -10,7 +10,7 @@
 #include "mgOnGpuVectors.h"
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu // this is only needed for CPU SIMD vectorization
 {
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h
index 5428aaf933..33bef4559e 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h
@@ -15,7 +15,7 @@
 #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h
index 3093e6ed18..7756a71621 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 #ifndef MemoryBuffers_H
 #define MemoryBuffers_H 1
@@ -11,12 +11,12 @@
 #include "mgOnGpuCxtypes.h"
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
 #include "Parameters_sm.h"
 #include
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -87,7 +87,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 constexpr bool HostBufferALIGNED = false; // ismisaligned=false
 constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true
@@ -119,7 +119,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 // A class encapsulating a CUDA pinned host buffer
 template
 class PinnedHostBufferBase : public BufferBase
@@ -128,18 +128,18 @@ namespace mg5amcCpu
 PinnedHostBufferBase( const size_t size )
 : BufferBase( size, false )
 {
-      checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) );
+      gpuMallocHost( &( this->m_data ), this->bytes() );
 }
 virtual ~PinnedHostBufferBase()
 {
-      checkCuda( cudaFreeHost( this->m_data ) );
+      gpuFreeHost( this->m_data );
 }
 };
 #endif
 //--------------------------------------------------------------------------
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 // A class encapsulating a CUDA device buffer
 template
 class DeviceBufferBase : public BufferBase
@@ -148,18 +148,18 @@ namespace mg5amcCpu
 DeviceBufferBase( const size_t size )
 : BufferBase( size, true )
 {
-      checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) );
+      gpuMalloc( &( this->m_data ), this->bytes() );
 }
 virtual ~DeviceBufferBase()
 {
-      checkCuda( cudaFree( this->m_data ) );
+      gpuFree( this->m_data );
 }
 };
 #endif
 //--------------------------------------------------------------------------
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for a given number of events
 template
 class HostBuffer : public HostBufferBase, virtual private NumberOfEvents
@@ -175,7 +175,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 // A class encapsulating a CUDA pinned host buffer for a given number of events
 template
 class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents
@@ -191,7 +191,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 // A class encapsulating a CUDA device buffer for a given number of events
 template
 class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents
@@ -213,7 +213,7 @@ namespace mg5amcCpu
 // The size (number of elements) per event in a memory buffer for momenta random numbers
 constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf;
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for momenta random numbers
 typedef HostBuffer HostBufferRndNumMomenta;
 #else
@@ -232,7 +232,7 @@ namespace mg5amcCpu
 // The size (number of elements) per event in a memory buffer with ONE fptype per event
 constexpr size_t sizePerEventOneFp = 1;
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer with ONE fptype per event
 typedef HostBuffer HostBufferOneFp;
 #else
@@ -257,7 +257,7 @@ namespace mg5amcCpu
 // The size (number of elements) per event in a memory buffer for Gs
 constexpr size_t sizePerEventGs = 1;
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for gs
 typedef HostBuffer HostBufferGs;
 #else
@@ -276,7 +276,7 @@ namespace mg5amcCpu
 // The size (number of elements) per event in a memory buffer for numerators
 constexpr size_t sizePerEventNumerators = 1;
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for gs
 typedef HostBuffer HostBufferNumerators;
 #else
@@ -296,7 +296,7 @@ namespace mg5amcCpu
 // The size (number of elements) per event in a memory buffer for denominators
 constexpr size_t sizePerEventDenominators = 1;
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for gs
 typedef HostBuffer HostBufferDenominators;
 #else
@@ -315,7 +315,7 @@ namespace mg5amcCpu
 // The size (number of elements) per event in a memory buffer for random numbers
 constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2;
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for gs
 typedef HostBuffer HostBufferCouplings;
 #else
@@ -333,7 +333,7 @@ namespace mg5amcCpu
 // The size (number of elements) per event in a memory buffer for momenta
 constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar;
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for momenta
 typedef HostBuffer HostBufferMomenta;
 //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT!
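[Note on the pattern behind the MemoryBuffers.h hunks above: the gpu* calls introduced by this patch series (gpuMalloc, gpuMallocHost, gpuFree, gpuMemcpy, gpuLaunchKernel, checkGpu, ...) are thin vendor-neutral wrappers over the CUDA or HIP runtime APIs. The actual GpuAbstraction.h/GpuRuntime.h headers are not included in this excerpt, so the following is only a minimal sketch of what the CUDA side of such a layer could look like, reusing the assert-based error check of the deleted CudaRuntime.h; every name other than the gpu* calls visible in the diff is illustrative, not the plugin's real implementation.]

// Hypothetical sketch of a vendor-neutral GPU abstraction (CUDA backend only).
// Assumes the gpu* names used in this patch; all other details are illustrative.
#include <cassert>
#include <cstdio>
#include <cuda_runtime.h>
#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
inline void assertGpu( cudaError_t code, const char* file, int line, bool abort = true )
{
  if( code != cudaSuccess )
  {
    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
    if( abort ) assert( code == cudaSuccess );
  }
}
// Memory management and transfers map one-to-one onto the CUDA runtime API,
// with the error check folded in so call sites stay single-line.
#define gpuMalloc( ptr, bytes ) checkGpu( cudaMalloc( ptr, bytes ) )
#define gpuMallocHost( ptr, bytes ) checkGpu( cudaMallocHost( ptr, bytes ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( dst, src, bytes ) checkGpu( cudaMemcpyToSymbol( dst, src, bytes ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuPeekAtLastError() cudaPeekAtLastError()
#define gpuDeviceSynchronize() cudaDeviceSynchronize()
// Kernel launch helpers hide the CUDA-only <<<blocks, threads>>> syntax,
// matching the gpuLaunchKernel( kernel, blocks, threads, args... ) call sites.
template<typename Kernel, typename... Args>
void gpuLaunchKernel( Kernel kernel, int blocks, int threads, Args... args )
{
  kernel<<<blocks, threads>>>( args... );
}
template<typename Kernel, typename... Args>
void gpuLaunchKernelSharedMem( Kernel kernel, int blocks, int threads, unsigned int sharedMem, Args... args )
{
  kernel<<<blocks, threads, sharedMem>>>( args... );
}

[With such a layer in place, only these macros and helpers need a HIP counterpart (hipMalloc, hipMemcpy, hipLaunchKernelGGL, ...) for the same sources to build on AMD GPUs, which is the stated motivation for replacing __CUDACC__ with MGONGPUCPP_GPUIMPL throughout this patch.]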
@@ -352,7 +352,7 @@ namespace mg5amcCpu
 // The size (number of elements) per event in a memory buffer for sampling weights
 constexpr size_t sizePerEventWeights = 1;
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for sampling weights
 typedef HostBuffer HostBufferWeights;
 #else
@@ -370,7 +370,7 @@ namespace mg5amcCpu
 // The size (number of elements) per event in a memory buffer for matrix elements
 constexpr size_t sizePerEventMatrixElements = 1;
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for matrix elements
 typedef HostBuffer HostBufferMatrixElements;
 #else
@@ -385,7 +385,7 @@ namespace mg5amcCpu
 // A base class encapsulating a memory buffer for the helicity mask
 typedef BufferBase BufferHelicityMask;
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for the helicity mask
 typedef HostBufferBase HostBufferHelicityMask;
 #else
@@ -403,7 +403,7 @@ namespace mg5amcCpu
 // The size (number of elements) per event in a memory buffer for wavefunctions
 constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2;
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for wavefunctions
 typedef HostBuffer HostBufferWavefunctions;
 #else
@@ -421,7 +421,7 @@ namespace mg5amcCpu
 // The size (number of elements) per event in a memory buffer for helicity random numbers
 constexpr size_t sizePerEventRndNumHelicity = 1;
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for helicity random numbers
 typedef HostBuffer HostBufferRndNumHelicity;
 #else
@@ -439,7 +439,7 @@ namespace mg5amcCpu
 // The size (number of elements) per event in a memory buffer for color random numbers
 constexpr size_t sizePerEventRndNumColor = 1;
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for color random numbers
 typedef HostBuffer HostBufferRndNumColor;
 #else
@@ -457,7 +457,7 @@ namespace mg5amcCpu
 // The size (number of elements) per event in a memory buffer for helicity selection
 constexpr size_t sizePerEventSelectedHelicity = 1;
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for helicity selection
 typedef HostBuffer HostBufferSelectedHelicity;
 #else
@@ -475,7 +475,7 @@ namespace mg5amcCpu
 // The size (number of elements) per event in a memory buffer for color selection
 constexpr size_t sizePerEventSelectedColor = 1;
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for color selection
 typedef HostBuffer HostBufferSelectedColor;
 #else
@@ -487,7 +487,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 template
 void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy
 {
@@ -504,13 +504,13 @@
 throw std::runtime_error( sstr.str() );
 }
 // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array
-    checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) );
+    gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice );
 }
 #endif
 //--------------------------------------------------------------------------
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 template
 void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy
 {
@@ -527,7 +527,7 @@
 throw std::runtime_error( sstr.str() );
 }
 // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array
-    checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) );
+    gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost );
 }
 #endif
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
index 1d0299e4e6..9a16d0301d 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
@@ -4,7 +4,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
 // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09
@@ -16,7 +16,6 @@
 #include "mgOnGpuConfig.h"
-#include "CudaRuntime.h"
 #include "HelAmps_sm.h"
 #include "MemoryAccessAmplitudes.h"
 #include "MemoryAccessCouplings.h"
@@ -46,7 +45,7 @@
 // Class member functions for calculating the matrix elements for
 // Process: e+ e- > mu+ mu- WEIGHTED<=4 @1
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -80,7 +79,7 @@
 __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MZ, (fptype)Parameters_sm::mdl_WZ };
 __device__ const fptype cIPC[6] = { (fptype)Parameters_sm::GC_3.real(), (fptype)Parameters_sm::GC_3.imag(), (fptype)Parameters_sm::GC_50.real(), (fptype)Parameters_sm::GC_50.imag(), (fptype)Parameters_sm::GC_59.real(), (fptype)Parameters_sm::GC_59.imag() };
 #else
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 __device__ __constant__ fptype cIPD[2];
 __device__ __constant__ fptype cIPC[6];
 #else
@@ -90,7 +89,7 @@
 #endif
 // Helicity combinations (and filtering of "good" helicity combinations)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 __device__ __constant__ short cHel[ncomb][npar];
 __device__ __constant__ int cNGoodHel;
 __device__ __constant__ int cGoodHel[ncomb];
@@ -118,13 +117,13 @@
 fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
 #endif
 fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
 #endif
 ) //ALWAYS_INLINE // attributes are not permitted in a function definition
 {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using namespace mg5amcGpu;
 using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
 using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events
@@ -151,7 +150,7 @@
 #endif /* clang-format on */
 mgDebug( 0, __FUNCTION__ );
 //printf( "calculate_wavefunctions: ihel=%2d\n", ihel );
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 );
 #endif
@@ -187,7 +186,7 @@
 #endif
 for( int iParity = 0; iParity < nParity; ++iParity )
 { // START LOOP ON IPARITY
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 const int ievt0 = ievt00 + iParity * neppV;
 #endif
 constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings
@@ -200,8 +199,10 @@
 allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event
 for( size_t iicoup = 0; iicoup < nicoup; iicoup++ )
 allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events
+#ifdef MGONGPUCPP_GPUIMPL
 #ifdef __CUDACC__
 #pragma nv_diagnostic pop
+#endif
 // CUDA kernels take input/output buffers with momenta/MEs for all events
 const fptype* momenta = allmomenta;
 const fptype* COUPs[nxcoup];
@@ -238,7 +239,7 @@
 // *** DIAGRAM 1 OF 2 ***
 // Wavefunction(s) for diagram number 1
-#if not( defined __CUDACC__ and defined MGONGPU_TEST_DIVERGENCE )
+#if not( defined MGONGPUCPP_GPUIMPL and defined MGONGPU_TEST_DIVERGENCE )
 opzxxx( momenta, cHel[ihel][0], -1, w_fp[0], 0 ); // NB: opzxxx only uses pz
 #else
 if( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 == 0 )
@@ -293,7 +294,7 @@
 // [NB do keep 'static' for these constexpr arrays, see issue #283]
 static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1]
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // Pre-compute a constexpr triangular color matrix properly normalized #475
 struct TriangularNormalizedColorMatrix
 {
@@ -350,7 +351,7 @@
 #endif
 for( int icol = 0; icol < ncolor; icol++ )
 {
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // === C++ START ===
 // Diagonal terms
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
@@ -409,7 +410,7 @@
 MEs_sv_previous += deltaMEs_previous;
 #endif
 /*
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv );
 #else
 #ifdef MGONGPU_CPPSIMD
@@ -456,8 +457,8 @@
 { 1, -1, 1, 1 },
 { 1, -1, -1, -1 },
 { 1, -1, -1, 1 } };
-#ifdef __CUDACC__
-    checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) );
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) );
 #else
 memcpy( cHel, tHel, ncomb * npar * sizeof( short ) );
 #endif
@@ -497,9 +498,9 @@
 // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory
 const fptype tIPD[2] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ };
 const cxtype tIPC[3] = { cxmake( m_pars->GC_3 ), cxmake( m_pars->GC_50 ), cxmake( m_pars->GC_59 ) };
-#ifdef __CUDACC__
-    checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) );
-    checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ) );
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) );
+    gpuMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) );
 #else
 memcpy( cIPD, tIPD, 2 * sizeof( fptype ) );
 memcpy( cIPC, tIPC, 3 * sizeof( cxtype ) );
@@ -536,7 +537,7 @@
 {
 std::stringstream out;
 // CUDA version (NVCC)
-    // [Use __NVCC__ instead of __CUDACC__ here!]
+ // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -601,12 +602,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -627,7 +628,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -756,9 +757,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -782,7 +783,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -803,7 +804,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -817,9 +818,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -847,7 +851,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1051,7 +1055,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= 
helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index 08d6c29e7b..ebbc2800d3 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
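The gpuMemcpy and gpuMemcpyToSymbol wrappers used in the hunks above, and the MGONGPUCPP_GPUIMPL guard used throughout this patch, come from the new GpuAbstraction.h header, whose content is not part of this diff. As a rough sketch of the idea (an assumption reconstructed from the call sites in this patch, not the verbatim header; checkHip is a hypothetical HIP analogue of the pre-existing checkCuda helper):

  #if defined __CUDACC__ // CUDA build (nvcc)
  #define MGONGPUCPP_GPUIMPL
  #define gpuMemcpy( dst, src, bytes, kind ) checkCuda( cudaMemcpy( dst, src, bytes, kind ) )
  #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
  #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
  #define gpuMemcpyToSymbol( sym, src, bytes ) checkCuda( cudaMemcpyToSymbol( sym, src, bytes ) )
  #define gpuDeviceReset() checkCuda( cudaDeviceReset() )
  #elif defined __HIPCC__ // HIP build (hipcc)
  #define MGONGPUCPP_GPUIMPL
  #define gpuMemcpy( dst, src, bytes, kind ) checkHip( hipMemcpy( dst, src, bytes, kind ) )
  #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
  #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
  #define gpuMemcpyToSymbol( sym, src, bytes ) checkHip( hipMemcpyToSymbol( HIP_SYMBOL( sym ), src, bytes ) )
  #define gpuDeviceReset() checkHip( hipDeviceReset() )
  #endif

With a mapping of this kind, a C++-only build defines neither __CUDACC__ nor __HIPCC__, so MGONGPUCPP_GPUIMPL stays undefined and the #ifndef MGONGPUCPP_GPUIMPL branches select the host code path, exactly as #ifndef __CUDACC__ did before.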
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. 
Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -292,7 +293,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +301,7 @@ #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +309,7 @@ #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +317,7 @@ #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +334,7 @@ } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +343,7 @@ // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +352,7 @@ // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +360,7 @@ #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +368,7 @@ #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -403,7 +404,7 @@ #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -421,7 +422,7 @@ } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy,
devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? 
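The same three-way compiler cascade (__CUDACC__, __HIPCC__, neither) now appears several times in the printout hunks above for rndgentxt and wrkflwtxt. If it keeps spreading, one possible simplification (a sketch only, not part of this patch) would be a single constexpr helper so the backend is spelled out in exactly one place:

  // Sketch only: centralise the CUDA/HIP/C++ cascade used for rndgentxt and wrkflwtxt.
  constexpr const char* backendName()
  {
  #ifdef __CUDACC__
    return "CUDA";
  #elif defined __HIPCC__
    return "HIP";
  #else
    return "C++";
  #endif
  }

The cascade over the MGONGPU_CUCXTYPE_* complex-type macros a few hunks below could be factored the same way, which would keep the text report and the JSON dump from drifting apart.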
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
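The RamboSamplingKernels.cc hunks just below replace the CUDA-only triple-chevron launches, kernel<<<blocks, threads>>>( args ), with a portable gpuLaunchKernel( kernel, blocks, threads, args... ) call. The launcher itself lives in the GPU abstraction layer and is not shown in this patch; a minimal sketch of what it presumably does (an assumption, not the verbatim implementation):

  // Sketch only: map one portable spelling onto the native CUDA and HIP launch syntax.
  template<typename Kernel, typename... Args>
  void gpuLaunchKernel( Kernel kernel, const int gpublocks, const int gputhreads, Args... args )
  {
  #ifdef __CUDACC__
    kernel<<<gpublocks, gputhreads>>>( args... );
  #elif defined __HIPCC__
    hipLaunchKernelGGL( kernel, dim3( gpublocks ), dim3( gputhreads ), 0 /*sharedMemBytes*/, 0 /*stream*/, args... );
  #endif
  }

Under this sketch, the old getMomentaFinalDevice<<<m_gpublocks, m_gputhreads>>>( ... ) launch and the new gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, ... ) call are equivalent on CUDA, while the HIP branch reaches the same kernels without triple-chevron syntax.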
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 43cee0977e..77334d2c04 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -103,69 +103,135 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. 
- # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. 
Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of nvcc + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+ CUOPTFLAGS = -lineinfo + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + export GPUCC + export GPUFLAGS + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + +else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) + #=== Configure the HIP compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If HIP_HOME is not set, try to set it from the location of GPUCC + ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") + endif -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists + ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + GPUCC = $(HIP_HOME)/bin/hipcc + + # Should maybe find something equivalent to this in HIP + #USE_NVTX ?=-DUSE_NVTX + + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + + # -DHIP_FAST_MATH is the HIP equivalent of -use_fast_math + # (But only for single precision, see line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + + else ifneq ($(origin REQUIRE_HIP),undefined) + # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + else + # No hip. Switch hip compilation off and go to common random numbers in C++ + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif #------------------------------------------------------------------------------- @@ -179,9 +245,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif @@ -205,7 +271,7 @@ # PowerPC-specific CUDA compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -215,10 +281,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -269,7 +335,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edge case for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -344,13 +413,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -359,7 +428,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -368,7 +437,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -420,11 +489,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -437,7 +506,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),)
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -468,15 +537,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# NB: the explicit '-x cu' from the old %_cu.o rule above is now carried via CCBUILDRULEFLAGS (CUDA only) # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -485,11 +555,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added an edge case for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -505,10 +578,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -516,8 +589,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -544,7 +617,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -556,11 +629,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -577,16 +650,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -612,17 +685,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -634,7 +707,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -647,7 +720,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -659,12 +732,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -688,14 +761,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -798,9 +871,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -819,7 +892,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc index f93c05b0b3..2b956730d4 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
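GpuRuntime.h takes over the role of the deleted CudaRuntime.h. Its content is not part of this patch; assuming it keeps the same RAII shape as the old CudaRuntime class, and with gpuSetDevice as a hypothetical abstraction-layer alias of cudaSetDevice/hipSetDevice (gpuDeviceReset is already used in the runTest.cc hunks below), a minimal sketch looks like:

  // Sketch only: set up the GPU on construction, book a device reset on destruction.
  struct GpuRuntime final
  {
    GpuRuntime( const bool debug = false ) : m_debug( debug ) { setUp( m_debug ); }
    ~GpuRuntime() { tearDown( m_debug ); }
    static void setUp( const bool /*debug*/ = false ) { gpuSetDevice( 0 ); }   // called above in fbridgecreate_
    static void tearDown( const bool /*debug*/ = false ) { gpuDeviceReset(); } // called below in fbridgedelete_
    const bool m_debug;
  };

The same class is instantiated on the stack at the top of check_sa.cc's main (the "00 GpuInit" timer key above), so the standalone executables and the Fortran bridge share one setup/teardown path.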
@@ -69,8 +69,8 @@ extern "C" Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc index df7488178e..4c9bc9ee6b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -117,7 +117,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -252,7 +252,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc index e40f635e46..016bc0f472 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -76,7 +76,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -321,7 +321,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model.pkl b/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model.pkl index c67363a41f7befb95905219d5f8068b362abbe2f..b6989c1453094d7f45cf2ee4b2124efa29e9064b 100644 GIT binary patch delta 44 zcmX?hj%n{XrVZZ9zi?a2z^s`D*Gt>1a7cIL20CztS A_y7O^ delta 53 zcmdmcj_KGrrVZZ9)Uy~E81z#TOA_@H%Mx=Ei;FY$-2+0642+EReceqHeVz5wGZM>m JCuc6Z0ssYc66F8@ diff --git a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h index d6827356eb..7f45321a6d 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc index bb5bb59266..ba2601b809 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h index 0a9d742cda..bc5174f191 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -235,7 +235,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -259,7 +259,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk index 554d7a704c..f73ff6fa03 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk @@ -38,13 +38,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -246,20 +246,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index f2227ff0d8..e540c8587c 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -24,9 +24,13 @@ // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -62,23 +66,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -94,17 +103,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif 
#endif @@ -142,7 +155,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -153,7 +166,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -183,9 +196,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -197,8 +210,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h index b56348bc58..6ae0c42ecb 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -209,14 +209,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -249,7 +249,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -301,7 +301,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -337,11 +337,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -556,11 +556,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -604,7 +604,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -627,7 +627,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace 
mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in 
C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/ee_mumu.mad/src/rambo.h b/epochX/cudacpp/ee_mumu.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/rambo.h +++ b/epochX/cudacpp/ee_mumu.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 275576a43d..eef31d6909 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu.mg +import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0053577423095703125  +DEBUG: model prefixing takes 0.0036804676055908203  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -162,7 +162,7 @@ Load PLUGIN.CUDACPP_SA_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu +INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -172,12 +172,12 @@ INFO: Processing color information for process: e+ e- > mu+ mu- @1 DEBUG: type(me)= me=0 [output.py at line 189]  DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: proc_id =  0 [model_handling.py at line 1045]  -INFO: Creating files in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum +INFO: Creating files in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is not yet defined: this is standalone_cudacpp mode [model_handling.py at line 1301]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]  DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]  DEBUG: self.include_multi_channel =  False [model_handling.py at line 1144]  @@ -185,8 +185,8 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1162]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1163]  DEBUG: multi_channel_map =  None [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1709]  -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. +DEBUG: diag_to_config =  {} [model_handling.py at line 1711]  +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]  @@ -194,7 +194,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_epem_mupmum.txt [model_handling.py at line 1335]  +DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_epem_mupmum.txt [model_handling.py at line 1335]  Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes @@ -202,7 +202,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.216 s +ALOHA: aloha creates 4 routines in 0.215 s FFV1 FFV1 FFV2 @@ -211,20 +211,20 @@ ALOHA: aloha creates 4 routines in 0.216 s FFV4 FFV2_4 FFV2_4 -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. +INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. 
and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.960s -user 0m0.762s -sys 0m0.119s +real 0m0.899s +user 0m0.522s +sys 0m0.169s diff --git a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT +++ b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h index d7e629cacd..f37c972b24 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw 
std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { @@ -333,7 +333,7 @@ } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename Tin, typename Tout> __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h index ffe3b84d53..176338151a 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc index 30257195b6..d6d6c4f179 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
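The triple-chevron launches removed above (e.g. sigmaKin<<<...>>>) are CUDA-only syntax; the new gpuLaunchKernel/gpuLaunchKernelSharedMem call sites defer to GpuAbstraction.h, whose contents this patch does not show. A minimal sketch of how such wrappers can be defined, assuming variadic macros and HIP's hipLaunchKernelGGL (the macro bodies below are an illustration, not the actual header):

// Sketch (assumption, not the actual GpuAbstraction.h): map the portable
// launch and error-query helpers used above onto each vendor's native API.
#ifdef __CUDACC__ // CUDA build: keep the native <<<...>>> launch syntax behind the macro
#define gpuPeekAtLastError() cudaPeekAtLastError()
#define gpuDeviceSynchronize() cudaDeviceSynchronize()
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  kernel<<<( blocks ), ( threads ), ( sharedMem )>>>( __VA_ARGS__ )
#elif defined __HIPCC__ // HIP build: the same call sites map onto hipLaunchKernelGGL
#include "hip/hip_runtime.h"
#define gpuPeekAtLastError() hipPeekAtLastError()
#define gpuDeviceSynchronize() hipDeviceSynchronize()
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), ( sharedMem ), 0, __VA_ARGS__ )
#endif

With this indirection the kernel call sites in MatrixElementKernels.cc stay identical for both vendors; only the backend selected at compile time changes.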
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
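The header hunks above and below all replace the NVCC-specific __CUDACC__ guard with MGONGPUCPP_GPUIMPL, so that the mg5amcGpu namespace is selected for any GPU build rather than for CUDA alone. The flag itself is expected to come from mgOnGpuConfig.h, which is not part of these hunks; a plausible one-line definition (an assumption, for illustration only):

// Sketch (assumption): a single vendor-neutral flag raised for any GPU compiler.
#if defined( __CUDACC__ ) || defined( __HIPCC__ )
#define MGONGPUCPP_GPUIMPL 1 // GPU implementation (CUDA or HIP); undefined in C++-only builds
#endif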
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class 
HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
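Note that the buffer constructors and destructors above no longer wrap their allocation calls in checkCuda(...): the new gpuMallocHost/gpuMalloc/gpuFreeHost/gpuFree helpers are used bare, which suggests the error check is folded into the wrappers themselves. A minimal sketch of such wrappers, assuming the checkGpu macro seen elsewhere in this patch (the exact definitions in GpuRuntime.h may differ):

// Sketch (assumption): allocation wrappers that carry their own error check,
// so that buffer constructors and destructors stay one-liners.
#ifdef __CUDACC__
#define gpuMallocHost( ptr, bytes ) checkGpu( cudaMallocHost( ptr, bytes ) ) // pinned host memory
#define gpuMalloc( ptr, bytes )     checkGpu( cudaMalloc( ptr, bytes ) )     // device memory
#define gpuFreeHost( ptr )          checkGpu( cudaFreeHost( ptr ) )
#define gpuFree( ptr )              checkGpu( cudaFree( ptr ) )
#elif defined __HIPCC__
#define gpuMallocHost( ptr, bytes ) checkGpu( hipHostMalloc( ptr, bytes ) )  // NB hipMallocHost is deprecated
#define gpuMalloc( ptr, bytes )     checkGpu( hipMalloc( ptr, bytes ) )
#define gpuFreeHost( ptr )          checkGpu( hipHostFree( ptr ) )
#define gpuFree( ptr )              checkGpu( hipFree( ptr ) )
#endif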
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 8bbc9ba493..709a3d6cdf 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MZ, (fptype)Parameters_sm::mdl_WZ }; __device__ const fptype cIPC[6] = { (fptype)Parameters_sm::GC_3.real(), (fptype)Parameters_sm::GC_3.imag(), (fptype)Parameters_sm::GC_50.real(), (fptype)Parameters_sm::GC_50.imag(), (fptype)Parameters_sm::GC_59.real(), (fptype)Parameters_sm::GC_59.imag() }; #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype cIPC[6]; #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: 
ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -238,7 +239,7 @@ namespace mg5amcCpu // *** DIAGRAM 1 OF 2 *** // Wavefunction(s) for diagram number 1 -#if not( defined __CUDACC__ and defined MGONGPU_TEST_DIVERGENCE ) +#if not( defined MGONGPUCPP_GPUIMPL and defined MGONGPU_TEST_DIVERGENCE ) opzxxx( momenta, cHel[ihel][0], -1, w_fp[0], 0 ); // NB: opzxxx only uses pz #else if( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 == 0 ) @@ -291,7 +292,7 @@ namespace mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -348,7 +349,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -407,7 +408,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -454,8 +455,8 @@ namespace mg5amcCpu { 1, -1, 1, 1 }, { 1, -1, -1, -1 }, { 1, -1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -495,9 +496,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; const cxtype tIPC[3] = { cxmake( m_pars->GC_3 ), cxmake( m_pars->GC_50 ), cxmake( m_pars->GC_59 ) }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + gpuMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ); #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); memcpy( cIPC, tIPC, 3 * sizeof( cxtype ) ); @@ -534,7 +535,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] 
+ // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -599,12 +600,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -625,7 +626,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -754,9 +755,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -780,7 +781,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -801,7 +802,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -815,9 +816,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -845,7 +849,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1049,7 +1053,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= 
helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h index 08d6c29e7b..ebbc2800d3 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
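The same pattern covers the data transfers: the cudaMemcpy and cudaMemcpyToSymbol call sites in MemoryBuffers.h and CPPProcess.cc above become gpuMemcpy and gpuMemcpyToSymbol, with the direction enums renamed alongside. A sketch of the corresponding mappings, again assuming the error check lives inside the wrapper (HIP_SYMBOL is the HIP idiom for addressing __constant__ symbols):

// Sketch (assumption): vendor-neutral copy helpers and direction enums.
#ifdef __CUDACC__
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpy( dst, src, bytes, dir ) checkGpu( cudaMemcpy( dst, src, bytes, dir ) )
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( cudaMemcpyToSymbol( sym, src, bytes ) )
#elif defined __HIPCC__
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpy( dst, src, bytes, dir ) checkGpu( hipMemcpy( dst, src, bytes, dir ) )
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( sym ), src, bytes ) )
#endif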
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. 
Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime GpuRuntime( debug );
 #endif
 
   // --- 0a. Initialise physics process
@@ -292,7 +293,7 @@ main( int argc, char** argv )
   timermap.start( alloKey );
 
   // Memory buffers for random numbers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferRndNumMomenta hstRndmom( nevt );
 #else
   PinnedHostBufferRndNumMomenta hstRndmom( nevt );
@@ -300,7 +301,7 @@ main( int argc, char** argv )
 #endif
 
   // Memory buffers for sampling weights
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferWeights hstWeights( nevt );
 #else
   PinnedHostBufferWeights hstWeights( nevt );
@@ -308,7 +309,7 @@ main( int argc, char** argv )
 #endif
 
   // Memory buffers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferMomenta hstMomenta( nevt );
 #else
   PinnedHostBufferMomenta hstMomenta( nevt );
@@ -316,7 +317,7 @@ main( int argc, char** argv )
 #endif
 
   // Memory buffers for Gs
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferGs hstGs( nevt );
 #else
   PinnedHostBufferGs hstGs( nevt );
@@ -333,7 +334,7 @@ main( int argc, char** argv )
   }
 
   // Memory buffers for matrix elements
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferMatrixElements hstMatrixElements( nevt );
 #else
   PinnedHostBufferMatrixElements hstMatrixElements( nevt );
@@ -342,7 +343,7 @@ main( int argc, char** argv )
 
   // Memory buffers for random numbers for helicity selection
   // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferRndNumHelicity hstRndHel( nevt );
 #else
   PinnedHostBufferRndNumHelicity hstRndHel( nevt );
@@ -351,7 +352,7 @@ main( int argc, char** argv )
 
   // Memory buffers for random numbers for color selection
   // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferRndNumColor hstRndCol( nevt );
 #else
   PinnedHostBufferRndNumColor hstRndCol( nevt );
@@ -359,7 +360,7 @@ main( int argc, char** argv )
 #endif
 
   // Memory buffers for helicity selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferSelectedHelicity hstSelHel( nevt );
 #else
   PinnedHostBufferSelectedHelicity hstSelHel( nevt );
@@ -367,7 +368,7 @@ main( int argc, char** argv )
 #endif
 
   // Memory buffers for color selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferSelectedColor hstSelCol( nevt );
 #else
   PinnedHostBufferSelectedColor hstSelCol( nevt );
@@ -403,7 +404,7 @@ main( int argc, char** argv )
 #else
   else
   {
-    throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
+    throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement)
   }
 #endif
 #else
@@ -421,7 +422,7 @@ main( int argc, char** argv )
   }
   else
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     prsk.reset( new RamboSamplingKernelDevice( energy,
devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? 
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
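For reference, the GpuRuntime object instantiated in check_sa.cc above plays the same RAII role as the CudaRuntime struct deleted earlier in this patch: construction books GPU initialisation, destruction books the device reset needed for leak checking. A vendor-neutral sketch, assuming gpuSetDevice/gpuDeviceReset wrappers analogous to those above (names illustrative; the actual GpuRuntime.h may differ):

// Sketch (assumption): RAII helper mirroring the removed CudaRuntime struct.
#include <iostream>
struct GpuRuntime final
{
  GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); }
  ~GpuRuntime() { tearDown( m_debug ); }
  GpuRuntime( const GpuRuntime& ) = delete;
  GpuRuntime& operator=( const GpuRuntime& ) = delete;
  // Calling gpuSetDevice on startup properly book-keeps the time spent in GPU initialisation
  static void setUp( const bool debug = true )
  {
    if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl;
    checkGpu( gpuSetDevice( 0 ) );
  }
  // Calling gpuDeviceReset on shutdown is only needed when checking for memory leaks
  static void tearDown( const bool debug = true )
  {
    if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl;
    checkGpu( gpuDeviceReset() );
  }
  bool m_debug;
};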
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 43cee0977e..77334d2c04 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -103,69 +103,135 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. 
- # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. 
Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of nvcc + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+ CUOPTFLAGS = -lineinfo + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + export GPUCC + export GPUFLAGS + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + +else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) + #=== Configure the HIP compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If HIP_HOME is not set, try to set it from the location of GPUCC + ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") + endif -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists + ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + GPUCC = $(HIP_HOME)/bin/hipcc + + # Should maybe find something equivalent to this in HIP + #USE_NVTX ?=-DUSE_NVTX + + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + + # -DHIP_FAST_MATH equivalent to -use_fast_math in HIP + # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + + else ifneq ($(origin REQUIRE_HIP),undefined) + # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + else + # No hip. Switch hip compilation off and go to common random numbers in C++ + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif #------------------------------------------------------------------------------- @@ -179,9 +245,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif @@ -205,7 +271,7 @@ # PowerPC-specific CUDA compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -215,10 +281,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -269,7 +335,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edge case for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -344,13 +413,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -359,7 +428,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -368,7 +437,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -420,11 +489,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -437,7 +506,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),)
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -468,15 +537,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# NB: the -x cu option for the %_cu.o rule above is now included in CCBUILDRULEFLAGS # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -485,11 +555,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edge case for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -505,10 +578,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -516,8 +589,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -544,7 +617,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -556,11 +629,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -577,16 +650,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
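The -fno-fast-math exception for the CrossSectionKernels objects a few hunks above (see #117 and #516) exists because fast math licenses the compiler to assume no NaNs or infinities and to reassociate floating-point operations. A generic illustration of the kind of guard this silently disables (not code from this repository):

#include <cmath>

// Under -ffast-math the compiler may assume x == x always holds and fold this
// guard to false, so NaNs produced upstream would go undetected; std::isnan
// can be optimized away in exactly the same manner.
inline bool fpIsNaN( const double x )
{
  return x != x;
}

Building just these kernels without fast math keeps such checks reliable while the rest of the code retains the faster flags.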
#------------------------------------------------------------------------------- @@ -612,17 +685,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -634,7 +707,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -647,7 +720,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -659,12 +732,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -688,14 +761,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -798,9 +871,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -819,7 +892,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc index f93c05b0b3..2b956730d4 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; #endif @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
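fbridgecreate_ above now goes through GpuRuntime rather than the CUDA-specific CudaRuntime. A minimal sketch of what such a backend-neutral wrapper can look like, assuming the setup step only needs to force early context creation (hypothetical; the real GpuRuntime.h may differ):

#ifdef __CUDACC__
#include <cuda_runtime.h>
struct GpuRuntime
{
  static void setUp() { (void)cudaFree( 0 ); }        // no-op call that forces lazy CUDA context creation up front
  static void tearDown() { (void)cudaDeviceReset(); } // release the device, e.g. for leak checkers
};
#elif defined __HIPCC__
#include "hip/hip_runtime.h"
struct GpuRuntime
{
  static void setUp() { (void)hipFree( nullptr ); }
  static void tearDown() { (void)hipDeviceReset(); }
};
#endif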
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc index df7488178e..4c9bc9ee6b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
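The fbridge entry points earlier in this patch all share one pattern: Fortran holds an opaque CppObjectInFortran pointer, and each C-linkage routine recovers the concrete Bridge through a checked dynamic_cast before using or deleting it. A stripped-down sketch of that pattern (types simplified for illustration; the real Bridge is a class template over the Fortran floating-point type):

#include <stdexcept>

struct CppObjectInFortran
{
  virtual ~CppObjectInFortran() {}
};
struct Bridge : CppObjectInFortran { /* buffers, kernels, ... */ };

extern "C" void fbridgedelete_( CppObjectInFortran** ppbridge )
{
  // Recover the concrete type behind the opaque Fortran handle
  Bridge* pbridge = dynamic_cast<Bridge*>( *ppbridge );
  if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" );
  delete pbridge;
}

The virtual destructor in the base class is what makes both the dynamic_cast and the delete-through-base safe.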
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -117,7 +117,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -252,7 +252,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc index e40f635e46..016bc0f472 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -76,7 +76,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -321,7 +321,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h index d6827356eb..7f45321a6d 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc index bb5bb59266..ba2601b809 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h index 0a9d742cda..bc5174f191 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -235,7 +235,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -259,7 +259,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk index 554d7a704c..f73ff6fa03 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk @@ -38,13 +38,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -246,20 +246,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h index c0f067f1d8..205accb85b 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,13 +10,27 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, it is supported, but this macro can be set from outside with e.g.
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +66,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +103,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif 
#endif @@ -132,7 +155,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -143,7 +166,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -173,9 +196,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -187,8 +210,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h index b56348bc58..6ae0c42ecb 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
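After this patch the complex scalar type is resolved per back end: thrust::complex remains the CUDA default, HIP is restricted to the home-grown cxsmpl class, and C++ keeps cxsmpl (or optionally std::complex). Condensing the guards from mgOnGpuConfig.h and the mgOnGpuCxtypes.h hunks below into one sketch (the using declaration and forward declaration are stand-ins added here for illustration; fptype is really set by MGONGPU_FPTYPE_DOUBLE/FLOAT and cxsmpl is defined further down in this header):

using fptype = double; // stand-in for the MGONGPU_FPTYPE_DOUBLE/FLOAT choice
namespace mgOnGpu { template<typename FP> class cxsmpl; } // defined later in this header

#ifdef __CUDACC__ // CUDA: MGONGPU_CUCXTYPE_THRUST is the default
#include <thrust/complex.h>
typedef thrust::complex<fptype> cxtype;
#elif defined __HIPCC__ // HIP: MGONGPU_CUCXTYPE_CXSMPL is the only option
typedef mgOnGpu::cxsmpl<fptype> cxtype;
#else // C++: MGONGPU_CPPCXTYPE_CXSMPL is the default
typedef mgOnGpu::cxsmpl<fptype> cxtype;
#endif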
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -209,14 +209,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -249,7 +249,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -301,7 +301,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -337,11 +337,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -556,11 +556,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -604,7 +604,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -627,7 +627,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace 
mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ 
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/ee_mumu.sa/src/rambo.h b/epochX/cudacpp/ee_mumu.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/rambo.h +++ b/epochX/cudacpp/ee_mumu.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index dd4e2c0fba..66404e5be8 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt.mg +import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0052073001861572266  +DEBUG: model prefixing takes 0.003694295883178711  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.007 s +1 processes with 3 diagrams generated in 0.006 s Total: 1 processes with 3 diagrams output madevent CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -166,10 +166,10 @@ Load PLUGIN.CUDACPP_SA_OUTPUT INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/Cards  -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/SubProcesses  +WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt  +INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt +WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/Cards  +WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttx DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -192,11 +192,11 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 3] [model_handling.py at line 1163]  DEBUG: multi_channel =  {1: [0], 2: [1], 3: [2]} [model_handling.py at line 1169]  DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [2]} [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1709]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1822]  +DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1711]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  @@ -206,7 +206,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttx.txt [model_handling.py at line 1335]  +DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttx.txt [model_handling.py at line 1335]  DEBUG: proc_id =  1 [export_cpp.py at line 710]  DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  @@ -214,22 +214,22 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.102 s +Wrote files for 10 helas calls in 0.231 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.114 s +ALOHA: aloha creates 2 routines in 0.122 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.111 s +ALOHA: aloha creates 4 routines in 0.096 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/src/. +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) @@ -238,22 +238,22 @@ super_write_set_parameters_onlyfixMajorana (hardcoded=True) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , keys size = 2 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , keys size = 2 [model_handling.py at line 729]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/src/./Parameters_sm.h +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/src/. +INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/src/. and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +save configuration file to /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -Output to directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt done. +Output to directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/README +/afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m2.501s -user 0m1.613s -sys 0m0.365s +real 0m4.821s +user 0m1.257s +sys 0m1.715s diff --git a/epochX/cudacpp/gg_tt.mad/COPYRIGHT b/epochX/cudacpp/gg_tt.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_tt.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt index 00d7c6f8d6..9e9ed9d752 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt @@ -234,7 +234,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo +#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo +#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc index 5597c614b0..eaf8cc8601 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV1_1.o FFV1_0.o FFV1_2.o VVV1P0_1.o +ALOHARoutine = VVV1P0_1.o FFV1_0.o FFV1_1.o FFV1_2.o diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index d7e629cacd..f37c972b24 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 
mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h index ffe3b84d53..176338151a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index 30257195b6..d6d6c4f179 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
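The gpuLaunchKernelSharedMem wrapper used for sigmaKin above extends the plain launch wrapper with a dynamic shared-memory byte count, i.e. the third parameter of the <<<...>>> triple it replaces. A one-line sketch for a CUDA backend (an assumption; the actual macro lives in the GpuAbstraction header, not shown in this patch):

  // Sketch only: kernel launch with an explicit dynamic shared-memory size.
  #define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
    kernel<<<( blocks ), ( threads ), ( sharedMem )>>>( __VA_ARGS__ )

With this definition, gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, ... ) expands back to exactly the sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( ... ) launch it replaces.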
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
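The KernelAccessHelper branch now guarded by MGONGPUCPP_GPUIMPL above keeps the usual one-event-per-thread mapping, ievt = blockDim.x * blockIdx.x + threadIdx.x. As a concrete check of the arithmetic: with blockDim.x = 256, thread 10 of block 3 handles event 256 * 3 + 10 = 778, so a grid of gpublocks * gputhreads threads covers exactly nevt = gpublocks * gputhreads events, which is the invariant the Bridge constructor enforces. A stand-alone kernel illustrating the pattern (exampleKernel and its buffers are hypothetical names, not part of the patch):

  // Sketch only: one GPU thread processes one event.
  __global__ void exampleKernel( const double* buffer, double* out )
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
    out[ievt] = 2. * buffer[ievt]; // placeholder per-event work
  }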
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
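All of the header hunks above perform the same mechanical substitution, __CUDACC__ to MGONGPUCPP_GPUIMPL, which separates "this is a GPU build" from "this is the NVIDIA compiler". Where MGONGPUCPP_GPUIMPL itself gets defined is not shown in this patch; a plausible sketch (an assumption) is a small guard in mgOnGpuConfig.h that raises one backend-neutral flag for either GPU compiler:

  // Sketch only (assumed, not shown in this patch): one backend-neutral flag.
  #if defined __CUDACC__
  #define MGONGPUCPP_GPUIMPL 1 // nvcc build: CUDA backend
  #elif defined __HIPCC__
  #define MGONGPUCPP_GPUIMPL 1 // hipcc build: HIP/AMD backend
  #endif
  // CPU-only builds leave MGONGPUCPP_GPUIMPL undefined, so every
  // '#ifdef MGONGPUCPP_GPUIMPL' guard above selects the mg5amcCpu branch.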
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer 
: public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
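The pinned-host and device buffer classes above now allocate through gpuMallocHost/gpuFreeHost and gpuMalloc/gpuFree rather than calling checkCuda( cudaMallocHost( ... ) ) and friends directly. A minimal sketch of CUDA-backend definitions consistent with these call sites, reusing the checkGpu helper assumed earlier (the actual GpuAbstraction definitions may differ):

  // Sketch only: error-checked allocation wrappers for a CUDA backend.
  #define gpuMallocHost( ptr, bytes ) checkGpu( cudaMallocHost( ptr, bytes ) ) // pinned host memory
  #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
  #define gpuMalloc( ptr, bytes ) checkGpu( cudaMalloc( ptr, bytes ) ) // device memory
  #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )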
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index a4cc98e6b1..62fa7f0088 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; 
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc
index a4cc98e6b1..62fa7f0088 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc
@@ -4,7 +4,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
 // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09
@@ -16,7 +16,6 @@
 
 #include "mgOnGpuConfig.h"
 
-#include "CudaRuntime.h"
 #include "HelAmps_sm.h"
 #include "MemoryAccessAmplitudes.h"
 #include "MemoryAccessCouplings.h"
@@ -46,7 +45,7 @@
 // Class member functions for calculating the matrix elements for
 // Process: g g > t t~ WEIGHTED<=2 @1
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -80,7 +79,7 @@ namespace mg5amcCpu
   __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT };
   __device__ const fptype* cIPC = nullptr; // unused as nicoup=0
 #else
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   __device__ __constant__ fptype cIPD[2];
   __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0
 #else
@@ -90,7 +89,7 @@ namespace mg5amcCpu
 #endif
 
   // Helicity combinations (and filtering of "good" helicity combinations)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   __device__ __constant__ short cHel[ncomb][npar];
   __device__ __constant__ int cNGoodHel;
   __device__ __constant__ int cGoodHel[ncomb];
@@ -118,13 +117,13 @@ namespace mg5amcCpu
     fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
 #endif
     fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
     , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
 #endif
     ) //ALWAYS_INLINE // attributes are not permitted in a function definition
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     using namespace mg5amcGpu;
     using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
     using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events
@@ -151,7 +150,7 @@ namespace mg5amcCpu
 #endif /* clang-format on */
     mgDebug( 0, __FUNCTION__ );
     //printf( "calculate_wavefunctions: ihel=%2d\n", ihel );
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
     //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 );
 #endif
 
@@ -187,7 +186,7 @@ namespace mg5amcCpu
 #endif
     for( int iParity = 0; iParity < nParity; ++iParity )
     { // START LOOP ON IPARITY
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
       const int ievt0 = ievt00 + iParity * neppV;
 #endif
       constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings
@@ -200,8 +199,10 @@ namespace mg5amcCpu
         allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event
       for( size_t iicoup = 0; iicoup < nicoup; iicoup++ )
         allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events
+#ifdef MGONGPUCPP_GPUIMPL
 #ifdef __CUDACC__
 #pragma nv_diagnostic pop
+#endif
       // CUDA kernels take input/output buffers with momenta/MEs for all events
       const fptype* momenta = allmomenta;
       const fptype* COUPs[nxcoup];
@@ -302,7 +303,7 @@ namespace mg5amcCpu
         { 16, -2 },
         { -2, 16 } }; // 2-D array[2][2]
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
       // Pre-compute a constexpr triangular color matrix properly normalized #475
       struct TriangularNormalizedColorMatrix
       {
@@ -359,7 +360,7 @@ namespace mg5amcCpu
 #endif
       for( int icol = 0; icol < ncolor; icol++ )
       {
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
         // === C++ START ===
         // Diagonal terms
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
@@ -418,7 +419,7 @@ namespace mg5amcCpu
       MEs_sv_previous += deltaMEs_previous;
 #endif
       /*
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv );
 #else
 #ifdef MGONGPU_CPPSIMD
@@ -465,8 +466,8 @@ namespace mg5amcCpu
       { 1, 1, -1, -1 },
       { 1, 1, 1, 1 },
       { 1, 1, 1, -1 } };
-#ifdef __CUDACC__
-    checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) );
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) );
 #else
     memcpy( cHel, tHel, ncomb * npar * sizeof( short ) );
 #endif
@@ -506,9 +507,9 @@ namespace mg5amcCpu
     // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory
     const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT };
     //const cxtype tIPC[0] = { ... }; // nicoup=0
-#ifdef __CUDACC__
-    checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) );
-    //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) );
+    //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0
 #else
     memcpy( cIPD, tIPD, 2 * sizeof( fptype ) );
     //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0
@@ -544,7 +545,7 @@ namespace mg5amcCpu
   {
     std::stringstream out;
     // CUDA version (NVCC)
-    // [Use __NVCC__ instead of __CUDACC__ here!]
+    // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!]
     // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file]
     // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712]
 #ifdef __NVCC__
@@ -609,12 +610,12 @@ namespace mg5amcCpu
   __global__ void /* clang-format off */
   computeDependentCouplings( const fptype* allgs, // input: Gs[nevt]
                              fptype* allcouplings // output: couplings[nevt*ndcoup*2]
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
                              , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
                              ) /* clang-format on */
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     using namespace mg5amcGpu;
     using G_ACCESS = DeviceAccessGs;
     using C_ACCESS = DeviceAccessCouplings;
@@ -635,7 +636,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
   __global__ void
   sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
@@ -764,9 +765,9 @@ namespace mg5amcCpu
         nGoodHel++;
       }
     }
-#ifdef __CUDACC__
-    checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) );
-    checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) );
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) );
+    gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) );
 #else
     cNGoodHel = nGoodHel;
     for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel];
@@ -790,7 +791,7 @@ namespace mg5amcCpu
 #endif
             int* allselhel, // output: helicity selection[nevt]
             int* allselcol // output: helicity selection[nevt]
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
             , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
             ) /* clang-format on */
@@ -811,7 +812,7 @@ namespace mg5amcCpu
     // Denominators: spins, colors and identical particles
     constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343)
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     // Remember: in CUDA this is a kernel for one event, in c++ this processes n events
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
 #else
@@ -825,9 +826,12 @@ namespace mg5amcCpu
 #endif
 
     // Start sigmaKin_lines
+
+#include "GpuAbstraction.h"
+
     // === PART 0 - INITIALISATION (before calculate_wavefunctions) ===
     // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     allMEs[ievt] = 0;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     allNumerators[ievt] = 0;
@@ -855,7 +859,7 @@ namespace mg5amcCpu
 
     // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS ===
     // (in both CUDA and C++, using precomputed good helicities)
-#ifdef __CUDACC__ // CUDA OR C++
+#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
     // *** START OF PART 1a - CUDA (one event per CPU thread) ***
 
     // Running sum of partial amplitudes squared for event by event color selection (#402)
@@ -1059,7 +1063,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
    // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     allMEs[ievt] /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt];
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h
index 51f966d10f..5a6e96d9e8 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h
@@ -4,7 +4,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
 // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09
@@ -25,7 +25,7 @@
 
 //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -107,7 +107,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   __global__ void
   computeDependentCouplings( const fptype* allgs, // input: Gs[nevt]
                              fptype* allcouplings ); // output: couplings[nevt*ndcoup*2]
@@ -120,7 +120,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
   __global__ void
   sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
@@ -150,7 +150,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
   __global__ void
   sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4]
             const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h
deleted file mode 120000
index ce9e1a487a..0000000000
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h
+++ /dev/null
@@ -1 +0,0 @@
-../CudaRuntime.h
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc
index f1e75b9252..1bad694d1c 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc
@@ -4,7 +4,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 
 #include "mgOnGpuConfig.h"
 
@@ -12,6 +12,7 @@
 #include "BridgeKernels.h"
 #include "CPPProcess.h"
 #include "CrossSectionKernels.h"
+#include "GpuRuntime.h"
 #include "MatrixElementKernels.h"
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
@@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 )
   std::cout << std::endl;
   std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl;
   std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl;
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
   std::cout << std::endl;
   std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl;
@@ -77,7 +78,7 @@ int
 main( int argc, char** argv )
 {
   // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   using namespace mg5amcGpu;
 #else
   using namespace mg5amcCpu;
@@ -103,11 +104,11 @@ main( int argc, char** argv )
     CurandDevice = 2
   };
 #ifdef __CUDACC__
-  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU
+  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU
 #elif not defined MGONGPU_HAS_NO_CURAND
   RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
 #else
-  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU
 #endif
   // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
   enum class RamboSamplingMode
@@ -115,7 +116,7 @@ main( int argc, char** argv )
     RamboHost = 1,
     RamboDevice = 2
   };
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
 #else
   RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
@@ -148,7 +149,7 @@ main( int argc, char** argv )
 #ifdef __CUDACC__
       rndgen = RandomNumberMode::CurandDevice;
 #else
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" );
 #endif
     }
     else if( arg == "--curhst" )
@@ -165,7 +166,7 @@ main( int argc, char** argv )
     }
     else if( arg == "--rmbdev" )
     {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       rmbsmp = RamboSamplingMode::RamboDevice;
 #else
       throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -239,13 +240,13 @@ main( int argc, char** argv )
       return usage( argv[0] );
   }
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
   ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
   // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
   // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -263,14 +264,14 @@ main( int argc, char** argv )
 
   // === STEP 0 - INITIALISE
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 
-  // --- 00. Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime GpuRuntime( debug );
 #endif
 
   // --- 0a. Initialise physics process
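GpuRuntime here replaces the earlier CudaRuntime helper. A plausible minimal shape for it is sketched below, assuming the gpu* aliases sketched earlier; this is a simplified guess at the interface (constructor/destructor plus static setUp/tearDown, as used by fbridge.cc later in this patch), not the actual GpuRuntime.h.

  // Simplified sketch (assumed interface, not the actual GpuRuntime.h)
  struct GpuRuntime final
  {
    GpuRuntime( const bool debug = true )
      : m_debug( debug ) { setUp( m_debug ); }
    ~GpuRuntime() { tearDown( m_debug ); }
    GpuRuntime( const GpuRuntime& ) = delete;

    // Set up the GPU application: on CUDA, eagerly select device 0 so that
    // context creation is not charged to the first timed kernel or memcpy
    static void setUp( const bool debug = true )
    {
  #ifdef __CUDACC__
      checkGpu( cudaSetDevice( 0 ) );
      if( debug ) printf( "__GpuRuntime: calling cudaSetDevice(0)\n" );
  #endif
    }

    // Tear down the GPU application: a device reset is needed e.g. by
    // cuda-memcheck --leak-check full (see the runTest.cc hunk below)
    static void tearDown( const bool debug = true )
    {
      if( debug ) printf( "__GpuRuntime: resetting the device\n" );
      gpuDeviceReset();
    }

    bool m_debug;
  };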
@@ -292,7 +293,7 @@ main( int argc, char** argv )
   timermap.start( alloKey );
 
   // Memory buffers for random numbers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferRndNumMomenta hstRndmom( nevt );
 #else
   PinnedHostBufferRndNumMomenta hstRndmom( nevt );
@@ -300,7 +301,7 @@ main( int argc, char** argv )
 #endif
 
   // Memory buffers for sampling weights
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferWeights hstWeights( nevt );
 #else
   PinnedHostBufferWeights hstWeights( nevt );
@@ -308,7 +309,7 @@ main( int argc, char** argv )
 #endif
 
   // Memory buffers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferMomenta hstMomenta( nevt );
 #else
   PinnedHostBufferMomenta hstMomenta( nevt );
@@ -316,7 +317,7 @@ main( int argc, char** argv )
 #endif
 
   // Memory buffers for Gs
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferGs hstGs( nevt );
 #else
   PinnedHostBufferGs hstGs( nevt );
@@ -333,7 +334,7 @@ main( int argc, char** argv )
   }
 
   // Memory buffers for matrix elements
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferMatrixElements hstMatrixElements( nevt );
 #else
   PinnedHostBufferMatrixElements hstMatrixElements( nevt );
@@ -342,7 +343,7 @@ main( int argc, char** argv )
 
   // Memory buffers for random numbers for helicity selection
   // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferRndNumHelicity hstRndHel( nevt );
 #else
   PinnedHostBufferRndNumHelicity hstRndHel( nevt );
@@ -351,7 +352,7 @@ main( int argc, char** argv )
 
   // Memory buffers for random numbers for color selection
   // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferRndNumColor hstRndCol( nevt );
 #else
   PinnedHostBufferRndNumColor hstRndCol( nevt );
@@ -359,7 +360,7 @@ main( int argc, char** argv )
 #endif
 
   // Memory buffers for helicity selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferSelectedHelicity hstSelHel( nevt );
 #else
   PinnedHostBufferSelectedHelicity hstSelHel( nevt );
@@ -367,7 +368,7 @@ main( int argc, char** argv )
 #endif
 
   // Memory buffers for color selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferSelectedColor hstSelCol( nevt );
 #else
   PinnedHostBufferSelectedColor hstSelCol( nevt );
@@ -403,7 +404,7 @@ main( int argc, char** argv )
 #else
   else
   {
-    throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
+    throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement)
   }
 #endif
 #else
@@ -421,7 +422,7 @@ main( int argc, char** argv )
   }
   else
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) );
 #else
     throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
@@ -432,7 +433,7 @@ main( int argc, char** argv )
   std::unique_ptr pmek;
   if( !bridge )
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) );
 #else
     pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -440,7 +441,7 @@ main( int argc, char** argv )
   }
   else
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) );
 #else
     pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -482,7 +483,7 @@ main( int argc, char** argv )
   prnk->generateRnarray();
   //std::cout << "Got random numbers" << std::endl;
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
   {
     // --- 1c. Copy rndmom from host to device
@@ -514,7 +515,7 @@ main( int argc, char** argv )
   prsk->getMomentaFinal();
   //std::cout << "Got final momenta" << std::endl;
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   if( rmbsmp == RamboSamplingMode::RamboDevice )
   {
     // --- 2c. CopyDToH Weights
@@ -559,7 +560,7 @@ main( int argc, char** argv )
     dynamic_cast( pmek.get() )->transposeInputMomentaC2F();
   }
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // --- 2d. CopyHToD Momenta
   const std::string gKey = "0.. CpHTDg";
   rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER!
@@ -588,7 +589,7 @@ main( int argc, char** argv )
   wv3atime += timermap.stop(); // calc only
   wavetime += wv3atime; // calc plus copy
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   if( !bridge )
   {
     // --- 3b. CopyDToH MEs
@@ -731,15 +732,19 @@ main( int argc, char** argv )
     rndgentxt = "CURAND DEVICE";
 #ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
+#elif defined __HIPCC__
+  rndgentxt += " (HIP code)";
 #else
   rndgentxt += " (C++ code)";
 #endif
 
   // Workflow description summary
   std::string wrkflwtxt;
-  // -- CUDA or C++?
+  // -- CUDA or HIP or C++?
 #ifdef __CUDACC__
   wrkflwtxt += "CUD:";
+#elif defined __HIPCC__
+  wrkflwtxt += "HIP:";
 #else
   wrkflwtxt += "CPP:";
 #endif
@@ -764,6 +769,12 @@ main( int argc, char** argv )
 #else
   wrkflwtxt += "???:"; // no path to this statement
 #endif
+#elif defined __HIPCC__
+#if defined MGONGPU_CUCXTYPE_CXSMPL
+  wrkflwtxt += "CXS:";
+#else
+  wrkflwtxt += "???:"; // no path to this statement
+#endif
 #else
 #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX
   wrkflwtxt += "STX:";
@@ -789,7 +800,7 @@ main( int argc, char** argv )
     wrkflwtxt += "RMBDEV+";
   else
     wrkflwtxt += "??????+"; // no path to this statement
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE?
   if( !bridge )
     wrkflwtxt += "MESDEV";
@@ -845,7 +856,7 @@ main( int argc, char** argv )
 
   if( perf )
   {
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
     // Get the output of "nproc --all" (https://stackoverflow.com/a/478960)
     std::string nprocall;
@@ -866,6 +877,8 @@ main( int argc, char** argv )
     std::cout << std::string( SEP79, '*' ) << std::endl
 #ifdef __CUDACC__
               << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA"
+#elif defined __HIPCC__
+              << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP"
 #else
               << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP"
 #endif
@@ -892,21 +905,21 @@ main( int argc, char** argv )
 #elif defined MGONGPU_FPTYPE_FLOAT
               << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl
 #endif
-#ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
               << "Complex type = CUCOMPLEX" << std::endl
 #elif defined MGONGPU_CUCXTYPE_THRUST
               << "Complex type = THRUST::COMPLEX" << std::endl
-#endif
-#else
+#elif defined MGONGPU_CUCXTYPE_CXSMPL
               << "Complex type = STD::COMPLEX" << std::endl
+#else
+              << "Complex type = ???" << std::endl // no path to this statement...
 #endif
               << "RanNumb memory layout = AOSOA[" << neppR << "]"
               << ( neppR == 1 ? " == AOS" : "" )
               << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl
               << "Momenta memory layout = AOSOA[" << neppM << "]"
               << ( neppM == 1 ? " == AOS" : "" ) << std::endl
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
               //<< "Wavefunction GPU memory = LOCAL" << std::endl
 #else
 #if !defined MGONGPU_CPPSIMD
@@ -937,7 +950,7 @@ main( int argc, char** argv )
 #endif
 #endif
               << "Random number generation = " << rndgentxt << std::endl
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
               << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline
 #endif
@@ -1033,14 +1046,14 @@ main( int argc, char** argv )
               << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl
 #endif
               << "\"Complex type\": "
-#ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
               << "\"CUCOMPLEX\"," << std::endl
 #elif defined MGONGPU_CUCXTYPE_THRUST
              << "\"THRUST::COMPLEX\"," << std::endl
-#endif
-#else
+#elif defined MGONGPU_CUCXTYPE_CXSMPL
               << "\"STD::COMPLEX\"," << std::endl
+#else
+              << "\"???\"," << std::endl // no path to this statement...
 #endif
               << "\"RanNumb memory layout\": "
               << "\"AOSOA[" << neppR << "]\""
@@ -1048,7 +1061,7 @@ main( int argc, char** argv )
               << "\"Momenta memory layout\": "
               << "\"AOSOA[" << neppM << "]\""
               << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
               //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl
 #endif
               << "\"Curand generation\": "
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc
index da68aa9255..79abbcc4f8 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #include "RamboSamplingKernels.h"
 
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessRandomNumbers.h"
 #include "MemoryAccessWeights.h"
@@ -14,7 +14,7 @@
 
 #include
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -92,7 +92,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy
                                                         const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1]
                                                         BufferMomenta& momenta, // output: momenta
@@ -135,7 +135,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   __global__ void
   getMomentaInitialDevice( const fptype energy,
                            fptype* momenta )
@@ -147,17 +147,17 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   void
   RamboSamplingKernelDevice::getMomentaInitial()
   {
-    getMomentaInitialDevice<<<m_gpublocks, m_gputhreads>>>( m_energy, m_momenta.data() );
+    gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() );
   }
 #endif
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   __global__ void
   getMomentaFinalDevice( const fptype energy,
                          const fptype* rndmom,
@@ -171,11 +171,11 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   void
   RamboSamplingKernelDevice::getMomentaFinal()
   {
-    getMomentaFinalDevice<<<m_gpublocks, m_gputhreads>>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() );
+    gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() );
   }
 #endif
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h
index 184089efd7..7c214cd74b 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef RAMBOSAMPLINGKERNELS_H
 #define RAMBOSAMPLINGKERNELS_H 1
@@ -10,7 +10,7 @@
 
 #include "MemoryBuffers.h"
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -93,7 +93,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating RAMBO phase space sampling on a GPU device
   class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents
   {
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h
index 188a72c2c9..21d63beeac 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h
@@ -1,14 +1,14 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef RANDOMNUMBERKERNELS_H
 #define RANDOMNUMBERKERNELS_H 1
 
 #include "mgOnGpuConfig.h"
 
-// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined
+// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined
 #ifndef MGONGPU_HAS_NO_CURAND
 //#include "curand.h"
 struct curandGenerator_st; // forward definition from curand.h
@@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h
 
 #include "MemoryBuffers.h"
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
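The comment retouched above refers to the C++ emulation of CUDA keywords in mgOnGpuConfig.h. The idea, sketched here as an assumption about that header rather than its verbatim contents, is that in CPU-only builds the CUDA function-space qualifiers expand to nothing, so the same kernel sources compile as plain host functions.

  // Sketch (assumption about mgOnGpuConfig.h, for illustration):
  // in C++-only builds the CUDA qualifiers must exist but do nothing
  #ifndef MGONGPUCPP_GPUIMPL
  #ifndef __global__
  #define __global__
  #endif
  #ifndef __device__
  #define __device__
  #endif
  #ifndef __constant__
  #define __constant__
  #endif
  #endif

This is why the include order matters: if RandomNumberKernels.h pulled in its other headers before mgOnGpuConfig.h, a CPU build would meet __global__ before any definition of it exists.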
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk
index 43cee0977e..77334d2c04 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk
@@ -1,7 +1,7 @@
 # Copyright (C) 2020-2023 CERN and UCLouvain.
 # Licensed under the GNU Lesser General Public License (version 3 or later).
 # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 
 #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
 #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories
@@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p)
 #=== Configure common compiler flags for C++ and CUDA
 
 INCFLAGS = -I.
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
 
 # Dependency on src directory
 MG5AMC_COMMONLIB = mg5amc_common
@@ -103,69 +103,135 @@ endif
 
 #-------------------------------------------------------------------------------
 
-#=== Configure the CUDA compiler
-
-# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505)
-# This is because it is impossible to pass this to "CUFLAGS += -ccbin <host-compiler>" below
-ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <host-compiler>" from outside
-  $(warning CUDA builds are not supported for multi-word CXX "$(CXX)")
-  override CUDA_HOME=disabled
-endif
-
-# If CUDA_HOME is not set, try to set it from the location of nvcc
-ifndef CUDA_HOME
-  CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
-  $(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
-endif
-
-# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists
-ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
-  NVCC = $(CUDA_HOME)/bin/nvcc
-  USE_NVTX ?=-DUSE_NVTX
-  # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
-  # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
-  # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster).
-  # Embed device code for 70, and PTX for 70+.
-  # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533).
-  # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity).
-  MADGRAPH_CUDA_ARCHITECTURE ?= 70
-  ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533
-  ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533
-  comma:=,
-  CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch))
-  CUINC = -I$(CUDA_HOME)/include/
-  CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
-  CUOPTFLAGS = -lineinfo
-  CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
-  ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
-  ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
-  CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h
-  # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
-  ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
-  ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
-  ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
-  ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
-else ifneq ($(origin REQUIRE_CUDA),undefined)
-  # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443)
-  $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH))
-else
-  # No cuda. Switch cuda compilation off and go to common random numbers in C++
-  $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
-  override NVCC=
-  override USE_NVTX=
-  override CUINC=
-  override CURANDLIBFLAGS=
-endif
-export NVCC
-export CUFLAGS
+CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler")
+HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler")
+
+ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc)
+  #=== Configure the CUDA compiler
+
+  # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505)
+  # This is because it is impossible to pass this to "GPUFLAGS += -ccbin <host-compiler>" below
+  ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <host-compiler>" from outside
+    $(warning CUDA builds are not supported for multi-word CXX "$(CXX)")
+    override CUDA_HOME=disabled
+  endif
+
+  # If CUDA_HOME is not set, try to set it from the location of nvcc
+  ifndef CUDA_HOME
+    CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
+    $(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
+  endif
+
+  # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists
+  ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
+    GPUCC = $(CUDA_HOME)/bin/nvcc
+    USE_NVTX ?=-DUSE_NVTX
+    # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
+    # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+    # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster).
+    # Embed device code for 70, and PTX for 70+.
+    # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533).
+    # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity).
+    MADGRAPH_CUDA_ARCHITECTURE ?= 70
+    ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533
+    ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533
+    comma:=,
+    CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch))
+    CUINC = -I$(CUDA_HOME)/include/
+    CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
+    CUOPTFLAGS = -lineinfo
+    GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
+    ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+    ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
+    GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h
+    # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+    CUBUILDRULEFLAGS = -Xcompiler -fPIC -c
+    CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu
+    CUDATESTFLAGS = -lcuda
+  else ifneq ($(origin REQUIRE_CUDA),undefined)
+    # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443)
+    $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH))
+  else
+    # No cuda. Switch cuda compilation off and go to common random numbers in C++
+    $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
+    override GPUCC=
+    override USE_NVTX=
+    override CUINC=
+    override CURANDLIBFLAGS=
+  endif
+  export GPUCC
+  export GPUFLAGS
+
+  # Set the host C++ compiler for GPUCC via "-ccbin <host-compiler>"
+  # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
+  GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+
+  # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+  ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+    GPUFLAGS += -allow-unsupported-compiler
+  endif
+
+else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc)
+  #=== Configure the HIP compiler
+
+  # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable HIP builds (issue #505)
+  ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <host-compiler>" from outside
+    $(warning HIP builds are not supported for multi-word CXX "$(CXX)")
+    override HIP_HOME=disabled
+  endif
+
+  # If HIP_HOME is not set, try to set it from the location of GPUCC
+  ifndef HIP_HOME
+    HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH))
+    $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+  endif
-# Set the host C++ compiler for nvcc via "-ccbin <host-compiler>"
-# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+
+  # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists
+  ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
+    GPUCC = $(HIP_HOME)/bin/hipcc
+
+    # Should maybe find something equivalent to this in HIP
+    #USE_NVTX ?=-DUSE_NVTX
+
+    HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+    HIPINC = -I$(HIP_HOME)/include/
+
+    # -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP
+    # (But only for single precision, line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+    GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+    ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+    GPUFLAGS += -std=c++17
+    # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+    ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+
+    CUBUILDRULEFLAGS = -fPIC -c
+    CCBUILDRULEFLAGS = -fPIC -c
+
+  else ifneq ($(origin REQUIRE_HIP),undefined)
+    # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443)
+    $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
+  else
+    # No hip. Switch hip compilation off and go to common random numbers in C++
+    $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
+    override GPUCC=
+    override USE_NVTX=
+    override CUINC=
+    override CURANDLIBFLAGS=
+  endif
+
+  # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+  ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+    GPUFLAGS += -allow-unsupported-compiler
+  endif
-# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-CUFLAGS += -allow-unsupported-compiler
 endif
 
 #-------------------------------------------------------------------------------
@@ -179,9 +245,9 @@ endif
 #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
 #  override AR:=ccache $(AR)
 #endif
-ifneq ($(NVCC),)
-  ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
-    override NVCC:=ccache $(NVCC)
+ifneq ($(GPUCC),)
  ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
    override GPUCC:=ccache $(GPUCC)
  endif
endif

@@ -205,7 +271,7 @@ endif
 # PowerPC-specific CUDA compiler flags (to be reviewed!)
 ifeq ($(UNAME_P),ppc64le)
-  CUFLAGS+= -Xcompiler -mno-float128
+  GPUFLAGS+= -Xcompiler -mno-float128
 endif
 
 #-------------------------------------------------------------------------------
@@ -215,10 +281,10 @@ endif
 # Set the default OMPFLAGS choice
 ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
 override OMPFLAGS = -fopenmp
-###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578)
+###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578)
 else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),)
 override OMPFLAGS = -fopenmp
-###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578)
+###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578)
 else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),)
 override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578)
 else
@@ -269,7 +335,10 @@ endif
 
 # Set the default RNDGEN (random number generator) choice
 ifeq ($(RNDGEN),)
-  ifeq ($(NVCC),)
+  ifeq ($(GPUCC),)
+    override RNDGEN = hasNoCurand
+  # Edge case for HIP compilation
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
     override RNDGEN = hasNoCurand
   else ifeq ($(RNDGEN),)
     override RNDGEN = hasCurand
@@ -344,13 +413,13 @@ CXXFLAGS+= $(AVXFLAGS)
 $(info FPTYPE=$(FPTYPE))
 ifeq ($(FPTYPE),d)
   CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
-  CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
+  GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
 else ifeq ($(FPTYPE),f)
   CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
-  CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
+  GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
 else ifeq ($(FPTYPE),m)
   CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
-  CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
+  GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
 else
   $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported)
 endif
@@ -359,7 +428,7 @@ endif
 $(info HELINL=$(HELINL))
 ifeq ($(HELINL),1)
   CXXFLAGS += -DMGONGPU_INLINE_HELAMPS
-  CUFLAGS += -DMGONGPU_INLINE_HELAMPS
+  GPUFLAGS += -DMGONGPU_INLINE_HELAMPS
 else ifneq ($(HELINL),0)
   $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported)
 endif
@@ -368,7 +437,7 @@ endif
 $(info HRDCOD=$(HRDCOD))
 ifeq ($(HRDCOD),1)
   CXXFLAGS += -DMGONGPU_HARDCODE_PARAM
-  CUFLAGS += -DMGONGPU_HARDCODE_PARAM
+  GPUFLAGS += -DMGONGPU_HARDCODE_PARAM
 else ifneq ($(HRDCOD),0)
   $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported)
 endif
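The FPTYPE=d/f/m choice above only sets preprocessor flags; the actual type selection happens in C++ in mgOnGpuConfig.h. A sketch of the presumable mapping (an assumption for illustration; the real header may differ in detail):

  // Sketch (assumed mapping in mgOnGpuConfig.h):
  // FPTYPE=d -> double/double, FPTYPE=f -> float/float, FPTYPE=m -> double/float
  #if defined MGONGPU_FPTYPE_DOUBLE
  typedef double fptype; // main floating-point type
  #elif defined MGONGPU_FPTYPE_FLOAT
  typedef float fptype;
  #endif
  #if defined MGONGPU_FPTYPE2_DOUBLE
  typedef double fptype2; // auxiliary type for mixed precision
  #elif defined MGONGPU_FPTYPE2_FLOAT
  typedef float fptype2;
  #endif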
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -468,15 +537,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -485,11 +555,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -505,10 +578,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -516,8 +589,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -544,7 +617,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -556,11 +629,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -577,16 +650,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -612,17 +685,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -634,7 +707,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -647,7 +720,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -659,12 +732,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -688,14 +761,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -798,9 +871,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -819,7 +892,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc index f93c05b0b3..2b956730d4 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? 
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc index df7488178e..4c9bc9ee6b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
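The CUDATest fixture in the runTest.cc hunks just below keeps a DeviceReset data member whose destructor calls gpuDeviceReset(), so that cuda-memcheck --leak-check full sees a deterministic reset after each test. The idiom relies on C++ destroying members in reverse declaration order: the resetter is declared first, hence destroyed last, after all device buffers are gone. A standalone sketch, with hypothetical names and a printout standing in for the real device call:

#include <iostream>

struct DeviceResetSketch
{
  ~DeviceResetSketch() { std::cout << "device reset" << std::endl; } // gpuDeviceReset() in the real fixture
};

struct TestFixtureSketch
{
  DeviceResetSketch deviceResetter; // declared first, hence destroyed last
  // ... device buffers declared after this line are destroyed before the reset ...
};

int main()
{
  TestFixtureSketch t; // prints "device reset" when t goes out of scope
  return 0;
}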
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -117,7 +117,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -252,7 +252,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc index e40f635e46..016bc0f472 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -76,7 +76,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -321,7 +321,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model.pkl b/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model.pkl index c67363a41f7befb95905219d5f8068b362abbe2f..b6989c1453094d7f45cf2ee4b2124efa29e9064b 100644 GIT binary patch delta 44 zcmX?hj%n{XrVZZ9zi?a2z^s`D*Gt>1a7cIL20CztS A_y7O^ delta 53 zcmdmcj_KGrrVZZ9)Uy~E81z#TOA_@H%Mx=Ei;FY$-2+0642+EReceqHeVz5wGZM>m JCuc6Z0ssYc66F8@ diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h index f6d1694588..a5fa3aa3f3 100644 --- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc index b9668e2a87..be86753c20 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index 69241c2b3b..735b46cd38 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk index 554d7a704c..f73ff6fa03 100644 --- a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk @@ -38,13 +38,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -246,20 +246,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index cacab1031a..e540c8587c 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,13 +10,27 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 
6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +66,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +103,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must 
CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -132,7 +155,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -143,7 +166,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -173,9 +196,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -187,8 +210,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h index b56348bc58..6ae0c42ecb 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
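The block added to mgOnGpuConfig.h above is the pivot of the whole rename: nvcc predefines __CUDACC__, hipcc predefines __HIPCC__, and a plain C++ compiler defines neither, so the single macro MGONGPUCPP_GPUIMPL folds three toolchains into the one GPU-versus-CPU split that all the rewritten #ifdefs test. A minimal standalone sketch (not part of the plugin) of the same detection and the namespace selection it drives:

#include <iostream>

// Same detection logic as the new mgOnGpuConfig.h block
#ifdef __CUDACC__
#define MGONGPUCPP_GPUIMPL cuda
#elif defined __HIPCC__
#define MGONGPUCPP_GPUIMPL hip
#else
#undef MGONGPUCPP_GPUIMPL
#endif

// One source file, compiled three ways, lands in the right namespace
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  inline const char* backend()
  {
#ifdef __CUDACC__
    return "CUDA";
#elif defined __HIPCC__
    return "HIP";
#else
    return "C++";
#endif
  }
}

int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  std::cout << mg5amcGpu::backend() << std::endl;
#else
  std::cout << mg5amcCpu::backend() << std::endl;
#endif
  return 0;
}

The payoff is that the same .cc files build under nvcc, hipcc or g++ without edits, which is why most hunks in this commit are one-token substitutions.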
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -209,14 +209,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -249,7 +249,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -301,7 +301,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -337,11 +337,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -556,11 +556,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -604,7 +604,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -627,7 +627,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace 
mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef 
__CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt.mad/src/rambo.h b/epochX/cudacpp/gg_tt.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt.mad/src/rambo.h +++ b/epochX/cudacpp/gg_tt.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index d85f35fcaa..32dec0cb3a 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt.mg +import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005174160003662109  +DEBUG: model prefixing takes 0.0037648677825927734  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -163,7 +163,7 @@ Load PLUGIN.CUDACPP_SA_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt +INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -173,12 +173,12 @@ INFO: Processing color information for process: g g > t t~ @1 DEBUG: type(me)= me=0 [output.py at line 189]  DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: proc_id =  0 [model_handling.py at line 1045]  -INFO: Creating files in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx +INFO: Creating files in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is not yet defined: this is standalone_cudacpp mode [model_handling.py at line 1301]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]  DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]  DEBUG: self.include_multi_channel =  False [model_handling.py at line 1144]  @@ -186,12 +186,12 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1162]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1163]  DEBUG: multi_channel_map =  None [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1709]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1822]  -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. 
+DEBUG: diag_to_config =  {} [model_handling.py at line 1711]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]  +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]  @@ -199,19 +199,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttx.txt [model_handling.py at line 1335]  +DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttx.txt [model_handling.py at line 1335]  Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.114 s +ALOHA: aloha creates 2 routines in 0.108 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) @@ -220,13 +220,13 @@ super_write_set_parameters_onlyfixMajorana (hardcoded=True) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , keys size = 2 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , keys size = 2 [model_handling.py at line 729]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. +INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.866s -user 0m0.677s -sys 0m0.118s +real 0m1.176s +user 0m0.435s +sys 0m0.168s diff --git a/epochX/cudacpp/gg_tt.sa/COPYRIGHT b/epochX/cudacpp/gg_tt.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_tt.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h index d7e629cacd..f37c972b24 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
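One detail worth pulling out of the Bridge constructor hunks below: on GPU builds the constructor derives the launch grid from the event count, throwing unless nevt is a multiple of s_gputhreadsmin and then shrinking the 256-thread default until nevt == m_gpublocks * m_gputhreads holds exactly. A standalone sketch of that sizing rule; the halving strategy is an assumption for illustration, since the loop body lies outside this excerpt:

#include <cassert>
#include <iostream>

// Hypothetical standalone version of the Bridge grid sizing
void setGpuGrid( const unsigned int nevt, int& gpublocks, int& gputhreads )
{
  gputhreads = 256;               // default number of GPU threads, as in the Bridge constructor
  gpublocks = nevt / gputhreads;
  while( nevt != (unsigned int)( gpublocks * gputhreads ) )
  {
    gputhreads /= 2;              // assumed adjustment strategy (illustrative)
    assert( gputhreads > 0 );
    gpublocks = nevt / gputhreads;
  }
}

int main()
{
  int blocks = 0, threads = 0;
  setGpuGrid( 384, blocks, threads ); // 384 events: 256 does not divide 384, 128 does
  std::cout << blocks << " blocks x " << threads << " threads" << std::endl; // 3 blocks x 128 threads
  return 0;
}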
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
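The transposition hunk in Bridge.h above also shows the launch-syntax side of the port: the CUDA-only triple-chevron call dev_transposeMomentaF2C<<<blocks, threads>>>( ... ) becomes the portable gpuLaunchKernel( dev_transposeMomentaF2C, blocks, threads, ... ). GpuAbstraction.h itself is not part of this excerpt, so the variadic-macro definition below is a plausible reconstruction under that assumption, not the plugin's actual header:

// CUDA: map the portable call back to the triple-chevron launch
#ifdef __CUDACC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  kernel<<<( blocks ), ( threads ), ( sharedMem )>>>( __VA_ARGS__ )
// HIP: the same call sites expand to hipLaunchKernelGGL
#elif defined __HIPCC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), ( sharedMem ), 0, __VA_ARGS__ )
#endif

With this shape the conversion of each call site is purely mechanical, and the CUDA and HIP builds share identical C++ code.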
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. 
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h index ffe3b84d53..176338151a 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc index 30257195b6..d6d6c4f179 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
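// The MGONGPUCPP_GPUIMPL guard that replaces __CUDACC__ throughout the hunks above
// is presumably defined along the following lines (the defining header is not part
// of this excerpt, so take this as a minimal sketch rather than the actual
// implementation): it selects the GPU code path for any supported GPU compiler,
// while compiler-specific __CUDACC__/__HIPCC__ branches remain available for the
// few places where CUDA and HIP genuinely differ.
#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1 // GPU build (CUDA or HIP): mg5amcGpu namespace
#else
#undef MGONGPUCPP_GPUIMPL // CPU build: mg5amcCpu namespace (possibly with SIMD)
#endif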
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 
@@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
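// The gpuLaunchKernel / gpuLaunchKernelSharedMem calls in the MatrixElementKernels.cc
// hunks above replace CUDA's kernel<<<blocks,threads[,sharedMem]>>>( args ) launch
// syntax. A minimal sketch of how such wrappers could look (the actual
// GpuAbstraction.h is not shown in this excerpt; HIP's hipLaunchKernelGGL takes
// explicit shared-memory and stream arguments, which is what makes a common
// variadic macro necessary):
#ifdef __CUDACC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  kernel<<<( blocks ), ( threads ), ( sharedMem )>>>( __VA_ARGS__ )
#define gpuPeekAtLastError cudaPeekAtLastError
#define gpuDeviceSynchronize cudaDeviceSynchronize
#elif defined __HIPCC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), sharedMem, 0, __VA_ARGS__ )
#define gpuPeekAtLastError hipPeekAtLastError
#define gpuDeviceSynchronize hipDeviceSynchronize
#endif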
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
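// On the neppM choice documented in MemoryAccessMomenta.h above: with 8-byte
// doubles (or 4-byte floats) a 32-byte cacheline holds 4 (or 8) fptype values, so
// setting the events-per-page to a power of 2 times that count keeps each AOSOA
// page of a given momentum component cacheline-aligned, and consecutive GPU
// threads read consecutive, coalesced addresses. Illustration only (the name
// fptypesPerCacheline is hypothetical):
constexpr int fptypesPerCacheline = 32 / sizeof( fptype ); // 4 for double, 8 for float
constexpr int neppM = 1 * fptypesPerCacheline;             // any power-of-2 multiple works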
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : 
public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
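// The gpuMallocHost/gpuFreeHost/gpuMalloc/gpuFree calls in the buffer classes
// above (and the gpuMemcpy/gpuMemcpyToSymbol calls later in this patch) fold the
// old explicit checkCuda( cuda... ) pattern into single portable wrappers. A
// minimal sketch under the same assumptions (the actual GpuAbstraction.h is not
// shown here; checkGpu is assumed to assert on the returned status code, like the
// deleted assertCuda above):
#ifdef __CUDACC__
#define gpuMallocHost( ptr, bytes )          checkGpu( cudaMallocHost( ptr, bytes ) )
#define gpuFreeHost( ptr )                   checkGpu( cudaFreeHost( ptr ) )
#define gpuMalloc( ptr, bytes )              checkGpu( cudaMalloc( ptr, bytes ) )
#define gpuFree( ptr )                       checkGpu( cudaFree( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind )   checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( cudaMemcpyToSymbol( sym, src, bytes ) )
#define gpuMemcpyHostToDevice                cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost                cudaMemcpyDeviceToHost
#elif defined __HIPCC__
#define gpuMallocHost( ptr, bytes )          checkGpu( hipHostMalloc( ptr, bytes ) ) // NB hipMallocHost is deprecated
#define gpuFreeHost( ptr )                   checkGpu( hipHostFree( ptr ) )
#define gpuMalloc( ptr, bytes )              checkGpu( hipMalloc( ptr, bytes ) )
#define gpuFree( ptr )                       checkGpu( hipFree( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind )   checkGpu( hipMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( sym ), src, bytes ) )
#define gpuMemcpyHostToDevice                hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost                hipMemcpyDeviceToHost
#endif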
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index 327b69d008..b0d93e9401 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity 
= 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -299,7 +300,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -356,7 +357,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -415,7 +416,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -462,8 +463,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -503,9 +504,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -541,7 +542,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -606,12 +607,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -632,7 +633,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -761,9 +762,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -787,7 +788,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -808,7 +809,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -822,9 +823,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -852,7 +856,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1056,7 +1060,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index 51f966d10f..5a6e96d9e8 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. 
Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -292,7 +293,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +301,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +309,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +317,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +334,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +343,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +352,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +360,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +368,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -403,7 +404,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -421,7 +422,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, 
devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? 
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
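// The GpuRuntime instantiated in check_sa.cc above replaces the CudaRuntime.h
// deleted at the start of this patch. A sketch of the presumed GpuRuntime.h under
// the same design (RAII around device setup/teardown; gpuSetDevice, gpuDeviceReset
// and checkGpu are assumed aliases for the corresponding cuda*/hip* calls and the
// assert-on-error pattern shown in the deleted header):
#include <iostream>
struct GpuRuntime final
{
  GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); }
  ~GpuRuntime() { tearDown( m_debug ); }
  GpuRuntime( const GpuRuntime& ) = delete;
  GpuRuntime& operator=( const GpuRuntime& ) = delete;
  bool m_debug;
  // Call gpuSetDevice(0) up front so that GPU initialization time is book-kept
  // here rather than silently inflating the first GPU API call
  static void setUp( const bool debug = true )
  {
    if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl;
    checkGpu( gpuSetDevice( 0 ) );
  }
  // gpuDeviceReset on shutdown is only strictly needed for leak-checking tools
  static void tearDown( const bool debug = true )
  {
    if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl;
    checkGpu( gpuDeviceReset() );
  }
};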
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 43cee0977e..77334d2c04 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -103,69 +103,135 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. 
- # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. 
Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of nvcc + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+ CUOPTFLAGS = -lineinfo + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + export GPUCC + export GPUFLAGS + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + +else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) + #=== Configure the HIP compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If HIP_HOME is not set, try to set it from the location of GPUCC + ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") + endif -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists + ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + GPUCC = $(HIP_HOME)/bin/hipcc + + # Should maybe find something equivalent to this in HIP + #USE_NVTX ?=-DUSE_NVTX + + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + + # -DHIP_FAST_MATH equivalent to -use_fast_math in HIP + # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + + else ifneq ($(origin REQUIRE_HIP),undefined) + # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + else + # No hip. Switch hip compilation off and go to common random numbers in C++ + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif #------------------------------------------------------------------------------- @@ -179,9 +245,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif @@ -205,7 +271,7 @@ # PowerPC-specific CUDA compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -215,10 +281,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -269,7 +335,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -344,13 +413,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -359,7 +428,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -368,7 +437,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -420,11 +489,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -437,7 +506,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) 
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -468,15 +537,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -485,11 +555,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -505,10 +578,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -516,8 +589,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -544,7 +617,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -556,11 +629,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -577,16 +650,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
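# Usage sketch (editorial illustration only, not part of the patch): with the CUDA and
# HIP branches above folded into the common GPUCC/GPUFLAGS variables, a build can be
# steered entirely from the environment. The invocations below are examples and assume
# the usual make entry point that includes this cudacpp.mk:
#   MADGRAPH_CUDA_ARCHITECTURE=80 REQUIRE_CUDA=1 make -j   # CUDA build for an A100; fail hard if nvcc is missing
#   REQUIRE_HIP=1 FPTYPE=f make -j                         # HIP build with single-precision matrix elements
#   make -j && make check                                  # CPU-only fallback when neither nvcc nor hipcc is in PATH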
#------------------------------------------------------------------------------- @@ -612,17 +685,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -634,7 +707,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -647,7 +720,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -659,12 +732,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -688,14 +761,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -798,9 +871,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -819,7 +892,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc index f93c05b0b3..2b956730d4 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? 
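The GpuRuntime.h header pulled in above, and the gpuLaunchKernel helper used at the RamboSamplingKernels launch sites earlier in this patch, are not themselves shown in these diffs. A minimal sketch of what such a CUDA/HIP compatibility layer can look like; only the names gpuLaunchKernel, gpuDeviceReset and GpuRuntime::setUp/tearDown appear in the patch, everything else here is an assumption, not the actual header:

    // Sketch only: one launch/runtime API name mapped onto both GPU runtimes
    #ifdef __CUDACC__
    #define gpuDeviceReset cudaDeviceReset
    #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
    #elif defined __HIPCC__
    #include "hip/hip_runtime.h"
    #define gpuDeviceReset hipDeviceReset
    #define gpuLaunchKernel( kernel, blocks, threads, ... ) hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
    #endif
    struct GpuRuntime // sketch: the real header may also select a device and check errors
    {
      static void setUp() {}                       // called from fbridgecreate_ above
      static void tearDown() { gpuDeviceReset(); } // called from fbridgedelete_ below; helps leak checkers
    };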
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc index df7488178e..4c9bc9ee6b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -117,7 +117,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -252,7 +252,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc index e40f635e46..016bc0f472 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -76,7 +76,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -321,7 +321,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h index f6d1694588..a5fa3aa3f3 100644 --- a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc index b9668e2a87..be86753c20 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h index 69241c2b3b..735b46cd38 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk index 554d7a704c..f73ff6fa03 100644 --- a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk @@ -38,13 +38,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -246,20 +246,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h index c0f067f1d8..205accb85b 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,13 +10,27 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +66,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +103,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif 
#endif @@ -132,7 +155,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -143,7 +166,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -173,9 +196,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -187,8 +210,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h index b56348bc58..6ae0c42ecb 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
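// Net effect of the backend selection above on the cxtype typedefs modified below
// (editorial summary sketch; the template arguments over fptype are assumed):
//   CUDA (nvcc):  cxtype = thrust::complex<fptype>  (MGONGPU_CUCXTYPE_THRUST, the default)
//   HIP (hipcc):  cxtype = mgOnGpu::cxsmpl<fptype>  (MGONGPU_CUCXTYPE_CXSMPL, the only option)
//   C++ (no GPU): cxtype = mgOnGpu::cxsmpl<fptype> or std::complex<fptype> (MGONGPU_CPPCXTYPE_CXSMPL is the new default)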
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -209,14 +209,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -249,7 +249,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -301,7 +301,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -337,11 +337,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -556,11 +556,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -604,7 +604,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -627,7 +627,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace 
mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ 
+#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt.sa/src/rambo.h b/epochX/cudacpp/gg_tt.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt.sa/src/rambo.h +++ b/epochX/cudacpp/gg_tt.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index f3823147cb..cb05831ae6 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg.mg +import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005174398422241211  +DEBUG: model prefixing takes 0.0036163330078125  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.020 s +1 processes with 16 diagrams generated in 0.016 s Total: 1 processes with 16 diagrams output madevent CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -166,10 +166,10 @@ Load PLUGIN.CUDACPP_SA_OUTPUT INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/Cards  -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/SubProcesses  +WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg  +INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg +WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/Cards  +WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -192,13 +192,13 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [model_handling.py at line 1163]  DEBUG: multi_channel =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4], 6: [5], 7: [6], 8: [7], 9: [8], 10: [9], 11: [10], 12: [11], 13: [12], 14: [13], 15: [14]} [model_handling.py at line 1169]  DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4], 6: [5], 7: [6], 8: [7], 9: [8], 10: [9], 11: [10], 12: [11], 13: [12], 14: [13], 15: [14]} [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1709]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1822]  +DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1711]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1824]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  @@ -208,22 +208,22 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxg.txt [model_handling.py at line 1335]  +DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxg.txt [model_handling.py at line 1335]  DEBUG: proc_id =  1 [export_cpp.py at line 710]  DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.035 s -Wrote files for 36 helas calls in 0.151 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.029 s +Wrote files for 36 helas calls in 0.285 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.265 s +ALOHA: aloha creates 5 routines in 0.247 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -231,7 +231,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.284 s +ALOHA: aloha creates 10 routines in 0.229 s VVV1 VVV1 FFV1 @@ -241,8 +241,8 @@ ALOHA: aloha creates 10 routines in 0.284 s VVVV1 VVVV3 VVVV4 -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/src/. +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) @@ -251,22 +251,22 @@ super_write_set_parameters_onlyfixMajorana (hardcoded=True) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/src/. +INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/src/. and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +save configuration file to /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -Output to directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg done. +Output to directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/README +/afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. quit -real 0m2.462s -user 0m2.051s -sys 0m0.329s +real 0m4.906s +user 0m1.545s +sys 0m1.926s diff --git a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt index 00d7c6f8d6..9e9ed9d752 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt @@ -234,7 +234,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo +#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo +#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc index 50c12b0804..9d01a65cec 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV1_1.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o +ALOHARoutine = VVV1_0.o VVV1P0_1.o FFV1_0.o FFV1_1.o FFV1_2.o FFV1P0_3.o VVVV1P0_1.o VVVV3P0_1.o VVVV4P0_1.o diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h index d7e629cacd..f37c972b24 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw 
std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
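A note on the pattern driving all of these hunks: __CUDACC__ is defined only when nvcc is the compiler, so guarding GPU code with it ties the source to the CUDA toolchain. Replacing it with the project-owned MGONGPUCPP_GPUIMPL lets the same guards select a GPU implementation for any backend (CUDA or HIP). The macro itself is expected to be defined in the new GpuAbstraction.h/GpuRuntime.h headers, which are not shown in this excerpt; the sketch below is an illustrative assumption of its shape, not the headers' actual contents.

// Minimal sketch, assuming MGONGPUCPP_GPUIMPL is derived from the compiler's own defines.
#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1 // some GPU backend is active
#endif

// The namespace split used throughout the diff then works unchanged for both backends:
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // types compiled for the GPU build
#else
namespace mg5amcCpu // types compiled for the CPU build
#endif
{
  // ... definitions that differ between CPU and GPU builds (see #318 and #725) ...
}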
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginning of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A.
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h index ffe3b84d53..176338151a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc index 30257195b6..d6d6c4f179 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
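The MatrixElementKernels.cc hunks above show the runtime half of the abstraction: triple-chevron launches such as sigmaKin<<<blocks, threads, sharedMemSize>>>( ... ) become gpuLaunchKernel / gpuLaunchKernelSharedMem calls, and checkCuda( cuda... ) becomes checkGpu( gpu... ). The diff does not include GpuAbstraction.h itself, so the CUDA-backend mapping below is a hedged sketch of what such wrappers could look like, not the file's actual contents; note in particular that gpuMemcpy is called above without an outer check, suggesting error checking is folded into the wrapper.

#include <cassert>
#include <cuda_runtime.h>

// Simplified error helper; the real code presumably also prints file/line context.
inline void checkGpu( cudaError_t code ) { assert( code == cudaSuccess ); }

// Memory and error operations map one-to-one onto the CUDA runtime API...
#define gpuPeekAtLastError cudaPeekAtLastError
#define gpuDeviceSynchronize cudaDeviceSynchronize
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( cudaMemcpyToSymbol( sym, src, bytes ) )

// ... while kernel launches hide the <<<...>>> syntax, which only nvcc/hipcc parse:
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, shmem, ... ) \
  kernel<<<( blocks ), ( threads ), ( shmem )>>>( __VA_ARGS__ )

A HIP backend would map the same gpu* names onto hipMemcpy, hipLaunchKernelGGL and friends, which is what makes the call sites in the hunks above backend-neutral.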
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
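As the MemoryAccessMomenta.h comment above explains, neppM (number of events per page) is chosen so that momenta reads coalesce on the GPU: in the AOSOA layout momenta[npagM][npar][np4][neppM] with nevt = npagM * neppM (described earlier in the Bridge.h transpose comment), consecutive events are adjacent fptypes within a page. A small helper, not part of the patch, just to make that indexing explicit:

// Illustrative only: flat offset of momentum component (ievt, ipar, ip4)
// in an AOSOA buffer momenta[npagM][npar][np4][neppM].
inline int aosoaIndex( int ievt, int ipar, int ip4, int npar, int np4, int neppM )
{
  const int ipagM = ievt / neppM; // memory page holding this event
  const int ieppM = ievt % neppM; // position of the event within the page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}

For neppM = 4, events 0-3 of a given (ipar, ip4) occupy four consecutive fptypes, so four consecutive GPU threads issue one coalesced load; neppM = 1 (the scalar C++ case) degenerates to a plain AOS layout.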
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class 
HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
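The MemoryBuffers.h hunks above carry the same substitution into the buffer hierarchy: pinned host buffers now allocate with gpuMallocHost/gpuFreeHost and device buffers with gpuMalloc/gpuFree, and the PR #45 comment explains why pinned memory matters (a memcpy from pageable host memory goes through an intermediate staging copy to pinned memory). Below is a hedged usage sketch of the resulting buffer pairing in a GPU build; HostBufferMomenta appears in the diff, while PinnedHostBufferMomenta and DeviceBufferMomenta are inferred here from the surrounding typedef pattern.

// Sketch only, not code from the patch.
const unsigned int nevt = 16384; // example event count
#ifdef MGONGPUCPP_GPUIMPL
PinnedHostBufferMomenta hstMomenta( nevt ); // gpuMallocHost: page-locked host memory
DeviceBufferMomenta devMomenta( nevt );     // gpuMalloc: device-resident memory
// copyDeviceFromHost checks that the sizes match, then issues a single gpuMemcpy;
// no staging copy is needed because the source is already pinned.
copyDeviceFromHost( devMomenta, hstMomenta );
#else
HostBufferMomenta hstMomenta( nevt ); // plain C++ host buffer in CPU-only builds
#endif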
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 5856e464ed..389a5d98b3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < 
nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -505,7 +506,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -562,7 +563,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -621,7 +622,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -684,8 +685,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -726,9 +727,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -765,7 +766,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
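// [Editor's note: a minimal sketch of how the new MGONGPUCPP_GPUIMPL guard is
// presumably defined in a central header such as GpuAbstraction.h (the exact
// definition is not shown in this patch, so treat the following as an assumption):
//   #if defined __CUDACC__ || defined __HIPCC__
//   #define MGONGPUCPP_GPUIMPL 1 // GPU build (CUDA or HIP)
//   #endif
// i.e. a single macro now selects the GPU code paths for both CUDA and HIP builds,
// which is why the generic guard replaces __CUDACC__ throughout, while the
// compiler-detection comment here deliberately keeps using __NVCC__.]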
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -830,12 +831,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -856,7 +857,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -985,9 +986,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1011,7 +1012,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1032,7 +1033,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1046,9 +1047,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1076,7 +1080,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1280,7 +1284,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 0edca1b52a..ff2cb4ab9a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. 
Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -292,7 +293,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +301,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +309,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +317,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +334,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +343,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +352,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +360,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +368,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -403,7 +404,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -421,7 +422,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, 
Initialise cuda
- // Instantiate a CudaRuntime at the beginnining of the application's main to
- // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
- const std::string cdinKey = "00 CudaInit";
+ // --- 00. Initialise GPU
+ // Instantiate a GpuRuntime at the beginning of the application's main.
+ // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+ const std::string cdinKey = "00 GpuInit";
timermap.start( cdinKey );
- CudaRuntime cudaRuntime( debug );
+ GpuRuntime GpuRuntime( debug );
#endif

// --- 0a. Initialise physics process
@@ -292,7 +293,7 @@ main( int argc, char** argv )
timermap.start( alloKey );

// Memory buffers for random numbers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
HostBufferRndNumMomenta hstRndmom( nevt );
#else
PinnedHostBufferRndNumMomenta hstRndmom( nevt );
@@ -300,7 +301,7 @@ main( int argc, char** argv )
#endif

// Memory buffers for sampling weights
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
HostBufferWeights hstWeights( nevt );
#else
PinnedHostBufferWeights hstWeights( nevt );
@@ -308,7 +309,7 @@ main( int argc, char** argv )
#endif

// Memory buffers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
HostBufferMomenta hstMomenta( nevt );
#else
PinnedHostBufferMomenta hstMomenta( nevt );
@@ -316,7 +317,7 @@ main( int argc, char** argv )
#endif

// Memory buffers for Gs
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
HostBufferGs hstGs( nevt );
#else
PinnedHostBufferGs hstGs( nevt );
@@ -333,7 +334,7 @@ main( int argc, char** argv )
}

// Memory buffers for matrix elements
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
HostBufferMatrixElements hstMatrixElements( nevt );
#else
PinnedHostBufferMatrixElements hstMatrixElements( nevt );
@@ -342,7 +343,7 @@ main( int argc, char** argv )

// Memory buffers for random numbers for helicity selection
// *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
HostBufferRndNumHelicity hstRndHel( nevt );
#else
PinnedHostBufferRndNumHelicity hstRndHel( nevt );
@@ -351,7 +352,7 @@ main( int argc, char** argv )

// Memory buffers for random numbers for color selection
// *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
HostBufferRndNumColor hstRndCol( nevt );
#else
PinnedHostBufferRndNumColor hstRndCol( nevt );
@@ -359,7 +360,7 @@ main( int argc, char** argv )
#endif

// Memory buffers for helicity selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
HostBufferSelectedHelicity hstSelHel( nevt );
#else
PinnedHostBufferSelectedHelicity hstSelHel( nevt );
@@ -367,7 +368,7 @@ main( int argc, char** argv )
#endif

// Memory buffers for color selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
HostBufferSelectedColor hstSelCol( nevt );
#else
PinnedHostBufferSelectedColor hstSelCol( nevt );
@@ -403,7 +404,7 @@ main( int argc, char** argv )
#else
else
{
- throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
+ throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement)
}
#endif
#else
@@ -421,7 +422,7 @@ main( int argc, char** argv )
}
else
{
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
prsk.reset( new RamboSamplingKernelDevice( energy,
devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? 
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
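// [Editor's note: the hunks below replace CudaRuntime.h with GpuRuntime.h and raw
// kernel<<<blocks,threads>>> launches with gpuLaunchKernel. A sketch of the launch
// abstraction assumed here (gpuLaunchKernel is a real name in this patch series;
// the definitions shown are illustrative only):
//   #ifdef __CUDACC__
//   #define gpuLaunchKernel( kernel, blocks, threads, ... ) \
//     kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
//   #elif defined __HIPCC__
//   #define gpuLaunchKernel( kernel, blocks, threads, ... ) \
//     hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
//   #endif
// keeping the argument order (kernel, grid, block, kernel arguments...) identical
// on both backends.]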
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index 43cee0977e..77334d2c04 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -103,69 +103,135 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. 
- # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. 
Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of nvcc + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
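# [Editor's note: a worked example of the CUARCHFLAGS expansion defined just above.
# With MADGRAPH_CUDA_ARCHITECTURE=70,80 the $(foreach ...) yields
#   -gencode arch=compute_70,code=compute_70 -gencode arch=compute_70,code=sm_70
#   -gencode arch=compute_80,code=compute_80 -gencode arch=compute_80,code=sm_80
# i.e. embedded SASS plus forward-compatible PTX for each architecture in the
# comma-separated list.]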
+ CUOPTFLAGS = -lineinfo + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + export GPUCC + export GPUFLAGS + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + +else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) + #=== Configure the HIP compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If HIP_HOME is not set, try to set it from the location of GPUCC + ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") + endif -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists + ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + GPUCC = $(HIP_HOME)/bin/hipcc + + # Should maybe find something equivelant to this in HIP + #USE_NVTX ?=-DUSE_NVTX + + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + + # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP + # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + + else ifneq ($(origin REQUIRE_HIP),undefined) + # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + else + # No hip. Switch hip compilation off and go to common random numbers in C++ + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif #------------------------------------------------------------------------------- @@ -179,9 +245,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif @@ -205,7 +271,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) 
is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+ # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists
+ ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
+ GPUCC = $(HIP_HOME)/bin/hipcc
+
+ # Should maybe find something equivalent to this in HIP
+ #USE_NVTX ?=-DUSE_NVTX
+
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+ HIPINC = -I$(HIP_HOME)/include/
+
+ # -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP
+ # (but only for single precision; see line 208 of https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+ GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+ ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+ GPUFLAGS += -std=c++17
+ # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+
+ CUBUILDRULEFLAGS = -fPIC -c
+ CCBUILDRULEFLAGS = -fPIC -c
+
+ else ifneq ($(origin REQUIRE_HIP),undefined)
+ # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443)
+ $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
+ else
+ # No hip. Switch hip compilation off and go to common random numbers in C++
+ $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
+ override GPUCC=
+ override USE_NVTX=
+ override CUINC=
+ override CURANDLIBFLAGS=
+ endif
+
+ # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+ ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+ GPUFLAGS += -allow-unsupported-compiler
+ endif
-# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-CUFLAGS += -allow-unsupported-compiler
endif

#-------------------------------------------------------------------------------
@@ -179,9 +245,9 @@ endif
#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
# override AR:=ccache $(AR)
#endif
-ifneq ($(NVCC),)
- ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
- override NVCC:=ccache $(NVCC)
+ifneq ($(GPUCC),)
- ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
+ ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
+ override GPUCC:=ccache $(GPUCC)
endif
endif

@@ -205,7 +271,7 @@
# PowerPC-specific CUDA compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -215,10 +281,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -269,7 +335,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -344,13 +413,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -359,7 +428,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -368,7 +437,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -420,11 +489,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -437,7 +506,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) 
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -468,15 +537,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -485,11 +555,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -505,10 +578,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -516,8 +589,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -544,7 +617,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -556,11 +629,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -577,16 +650,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -612,17 +685,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -634,7 +707,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -647,7 +720,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -659,12 +732,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -688,14 +761,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -798,9 +871,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -819,7 +892,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc index f93c05b0b3..2b956730d4 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? 
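// [Editor's note: a minimal sketch of the GpuRuntime helper that replaces
// CudaRuntime in the hunks above and below; setUp/tearDown are the names used by
// this patch, while the bodies indicated here are assumptions based on the
// check_sa.cc comments ("cudaSetDevice(0) in the constructor, cudaDeviceReset()
// in the destructor"):
//   struct GpuRuntime final
//   {
//     GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); }
//     ~GpuRuntime() { tearDown( m_debug ); }
//     static void setUp( const bool debug = true );    // e.g. cudaSetDevice/hipSetDevice( 0 )
//     static void tearDown( const bool debug = true ); // e.g. cudaDeviceReset/hipDeviceReset()
//     const bool m_debug;
//   };
// ]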
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc index df7488178e..4c9bc9ee6b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -117,7 +117,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -252,7 +252,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc index e40f635e46..016bc0f472 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -76,7 +76,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -321,7 +321,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model.pkl b/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model.pkl index c67363a41f7befb95905219d5f8068b362abbe2f..b6989c1453094d7f45cf2ee4b2124efa29e9064b 100644 GIT binary patch delta 44 zcmX?hj%n{XrVZZ9zi?a2z^s`D*Gt>1a7cIL20CztS A_y7O^ delta 53 zcmdmcj_KGrrVZZ9)Uy~E81z#TOA_@H%Mx=Ei;FY$-2+0642+EReceqHeVz5wGZM>m JCuc6Z0ssYc66F8@ diff --git a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h index 03afcd6a5f..2af1c4d232 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc index 22fdd96a68..01e7d9bcf2 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h index 11fd9e3c74..b44537e599 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk index 554d7a704c..f73ff6fa03 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk @@ -38,13 +38,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -246,20 +246,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h index cacab1031a..e540c8587c 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,13 +10,27 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 
6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +66,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +103,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must 
CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -132,7 +155,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -143,7 +166,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -173,9 +196,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -187,8 +210,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h index b56348bc58..6ae0c42ecb 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
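Taken together, the complex-number macros above select a single cxtype per build. A simplified sketch of their net effect is shown here; the mgOnGpuCxtypes.h diff that continues below also covers the cuComplex option and the SIMD-vectorised cxtype_v, and fptype/cxsmpl are types defined in this codebase:

// Sketch only: net effect of the complex-type configuration macros on cxtype.
#ifdef MGONGPUCPP_GPUIMPL // CUDA or HIP build
#if defined MGONGPU_CUCXTYPE_THRUST // CUDA default
#include <thrust/complex.h>
typedef thrust::complex<fptype> cxtype;
#elif defined MGONGPU_CUCXTYPE_CXSMPL // the only possible option in HIP
typedef mgOnGpu::cxsmpl<fptype> cxtype;
#endif
#else // C++ build
#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX
#include <complex>
typedef std::complex<fptype> cxtype;
#else // MGONGPU_CPPCXTYPE_CXSMPL, the new default
typedef mgOnGpu::cxsmpl<fptype> cxtype;
#endif
#endif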
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -209,14 +209,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -249,7 +249,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -301,7 +301,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -337,11 +337,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -556,11 +556,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -604,7 +604,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -627,7 +627,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace 
mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ 
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttg.mad/src/rambo.h b/epochX/cudacpp/gg_ttg.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/rambo.h +++ b/epochX/cudacpp/gg_ttg.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 7ec640308e..8658acc7f4 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg.mg +import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0051500797271728516  +DEBUG: model prefixing takes 0.0037899017333984375  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.019 s +1 processes with 16 diagrams generated in 0.017 s Total: 1 processes with 16 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -163,7 +163,7 @@ Load PLUGIN.CUDACPP_SA_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg +INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -173,12 +173,12 @@ INFO: Processing color information for process: g g > t t~ g @1 DEBUG: type(me)= me=0 [output.py at line 189]  DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: proc_id =  0 [model_handling.py at line 1045]  -INFO: Creating files in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg +INFO: Creating files in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is not yet defined: this is standalone_cudacpp mode [model_handling.py at line 1301]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]  DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]  DEBUG: self.include_multi_channel =  False [model_handling.py at line 1144]  @@ -186,14 +186,14 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1162]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1163]  DEBUG: multi_channel_map =  None [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1709]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 4, 1, 4, 4) 
[model_handling.py at line 1822]  -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. +DEBUG: diag_to_config =  {} [model_handling.py at line 1711]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1824]  +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]  @@ -201,8 +201,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxg.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (16 diagrams) in 0.034 s +DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxg.txt [model_handling.py at line 1335]  +Generated helas calls for 1 subprocesses (16 diagrams) in 0.028 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -210,7 +210,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.269 s +ALOHA: aloha creates 5 routines in 0.242 s VVV1 VVV1 FFV1 @@ -220,8 +220,8 @@ ALOHA: aloha creates 5 routines in 0.269 s VVVV1 VVVV3 VVVV4 -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) @@ -230,13 +230,13 @@ super_write_set_parameters_onlyfixMajorana (hardcoded=True) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. +INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m1.092s -user 0m0.905s -sys 0m0.108s +real 0m1.318s +user 0m0.623s +sys 0m0.179s diff --git a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h index d7e629cacd..f37c972b24 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
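Bridge.h, whose diff continues below, implements the class behind the extern "C" fbridge entry points patched earlier in this series. A hypothetical standalone driver would use them as sketched here; in production these routines are called from Fortran MadEvent, and the nevt/npar/np4 values below are illustrative only, not taken from this diff:

// Sketch only: a hypothetical C++ driver for the extern "C" fbridge interface.
extern "C"
{
  struct CppObjectInFortran; // opaque handle, as in fbridge.cc
  void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F );
  void fbridgedelete_( CppObjectInFortran** ppbridge );
}

int main()
{
  CppObjectInFortran* bridge = nullptr;
  const int nevt = 512; // must be a multiple of the minimum number of GPU threads (see the Bridge constructor below)
  const int npar = 5;   // illustrative: g g > t t~ g has 5 external particles
  const int np4 = 4;    // four-momentum components per particle
  fbridgecreate_( &bridge, &nevt, &npar, &np4 ); // invokes GpuRuntime::setUp() on GPU builds
  // ... fill momenta and call fbridgesequence_ to compute the matrix elements ...
  fbridgedelete_( &bridge ); // invokes GpuRuntime::tearDown() on GPU builds
  return 0;
}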
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
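In the Bridge.h hunks above, CUDA's kernel<<<blocks, threads>>>( args ) launch syntax is replaced by gpuLaunchKernel( kernel, blocks, threads, args... ). A minimal sketch of such a macro follows, assuming it matches the GpuAbstraction.h header that BridgeKernels.cc starts including below (its full contents are not shown in this diff):

// Sketch only: a portable kernel-launch macro for CUDA and HIP.
#if defined __CUDACC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#include "hip/hip_runtime.h"
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif
// Usage as in Bridge::gpu_sequence above:
//   gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads,
//                    m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );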
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h index ffe3b84d53..176338151a 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc index 30257195b6..d6d6c4f179 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
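The two hunks above replace every CUDA-specific triple-chevron launch (kernel<<<blocks,threads>>> and its shared-memory variant) and every checkCuda() call with backend-neutral wrappers. The GpuAbstraction.h that defines them is not shown in this patch; the following is only a sketch, assuming variadic macros and reusing the names that appear in the diff, of how the launch and error-check wrappers could map onto CUDA and HIP:

    #include <cassert>
    #include <cstdio>
    #if defined __CUDACC__ // CUDA backend
    #define gpuError_t cudaError_t
    #define gpuSuccess cudaSuccess
    #define gpuGetErrorString cudaGetErrorString
    #define gpuPeekAtLastError cudaPeekAtLastError
    #define gpuDeviceSynchronize cudaDeviceSynchronize
    // Expand back to the native triple-chevron launch
    #define gpuLaunchKernel( kernel, blocks, threads, ... ) \
      kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
    #define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
      kernel<<<( blocks ), ( threads ), ( sharedMem )>>>( __VA_ARGS__ )
    #elif defined __HIPCC__ // HIP backend
    #define gpuError_t hipError_t
    #define gpuSuccess hipSuccess
    #define gpuGetErrorString hipGetErrorString
    #define gpuPeekAtLastError hipPeekAtLastError
    #define gpuDeviceSynchronize hipDeviceSynchronize
    // HIP provides an explicit launch function instead of the chevron syntax
    #define gpuLaunchKernel( kernel, blocks, threads, ... ) \
      hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
    #define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
      hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), ( sharedMem ), 0, __VA_ARGS__ )
    #endif
    // Same shape as the assertCuda/checkCuda pair deleted with CudaRuntime.h above
    #define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
    inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
    {
      if( code != gpuSuccess )
      {
        printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
        if( abort ) assert( code == gpuSuccess );
      }
    }

With definitions of this shape, call sites such as gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ) in the diff compile identically under nvcc and hipcc.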
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
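Every header in these hunks now keys its GPU/CPU namespace split on MGONGPUCPP_GPUIMPL instead of the compiler-specific __CUDACC__. The definition itself is not part of this patch; a minimal sketch, assuming the switch is simply raised by whichever GPU compiler is active, is:

    // Hypothetical sketch of the implementation-agnostic GPU switch
    // (the real definition lives in the common config headers, not shown here)
    #if defined __CUDACC__ || defined __HIPCC__
    #define MGONGPUCPP_GPUIMPL 1
    #endif

One guard then selects mg5amcGpu for both CUDA and HIP builds and mg5amcCpu otherwise, which is exactly the substitution performed mechanically throughout this patch.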
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
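The neppM comment in the MemoryAccessMomenta.h hunk above ties the AOSOA page size to the 32-byte cacheline for coalesced global-memory access. As a worked example (illustrative numbers only, not the generated code): 32 bytes hold 4 doubles or 8 floats, so

    constexpr int neppM_double = 32 / sizeof( double ); // = 4 events per page for FPTYPE=double
    constexpr int neppM_float = 32 / sizeof( float );   // = 8 events per page for FPTYPE=float

with an optional further power-of-2 multiplier on top, as the comment suggests.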
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer 
: public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
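In the PinnedHostBufferBase and DeviceBufferBase hunks above, the explicit checkCuda( cudaMallocHost(...) ) pattern disappears from the call sites, so the error check presumably moves inside the wrappers. A sketch of plausible definitions, reusing the checkGpu helper sketched earlier (note that HIP names pinned allocation hipHostMalloc, not hipMallocHost):

    #if defined __CUDACC__
    #define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) // pinned host memory
    #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )         // device memory
    #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
    #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
    #elif defined __HIPCC__
    #define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) )
    #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
    #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
    #define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
    #endif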
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index 09575d4a91..b723717621 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( 
int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -499,7 +500,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -556,7 +557,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -615,7 +616,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -678,8 +679,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -720,9 +721,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -759,7 +760,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -824,12 +825,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -850,7 +851,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -979,9 +980,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1005,7 +1006,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1026,7 +1027,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1040,9 +1041,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1070,7 +1074,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1274,7 +1278,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index 0edca1b52a..ff2cb4ab9a 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
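The copy helpers in the preceding hunks follow the same pattern: gpuMemcpy with the gpuMemcpyHostToDevice/gpuMemcpyDeviceToHost direction flags in MemoryBuffers.h, and gpuMemcpyToSymbol for the constant-memory copies in CPPProcess.cc. A sketch of plausible definitions (again an assumption, with checkGpu as sketched earlier; HIP requires wrapping the symbol in HIP_SYMBOL):

    #if defined __CUDACC__
    #define gpuMemcpy( dst, src, bytes, dir ) checkGpu( cudaMemcpy( dst, src, bytes, dir ) )
    #define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( cudaMemcpyToSymbol( sym, src, bytes ) )
    #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
    #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
    #elif defined __HIPCC__
    #define gpuMemcpy( dst, src, bytes, dir ) checkGpu( hipMemcpy( dst, src, bytes, dir ) )
    #define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( sym ), src, bytes ) )
    #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
    #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
    #endif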
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. 
Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime GpuRuntime( debug );
 #endif

   // --- 0a. Initialise physics process
@@ -292,7 +293,7 @@ main( int argc, char** argv )
   timermap.start( alloKey );

   // Memory buffers for random numbers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferRndNumMomenta hstRndmom( nevt );
 #else
   PinnedHostBufferRndNumMomenta hstRndmom( nevt );
@@ -300,7 +301,7 @@ main( int argc, char** argv )
 #endif

   // Memory buffers for sampling weights
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferWeights hstWeights( nevt );
 #else
   PinnedHostBufferWeights hstWeights( nevt );
@@ -308,7 +309,7 @@ main( int argc, char** argv )
 #endif

   // Memory buffers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferMomenta hstMomenta( nevt );
 #else
   PinnedHostBufferMomenta hstMomenta( nevt );
@@ -316,7 +317,7 @@ main( int argc, char** argv )
 #endif

   // Memory buffers for Gs
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferGs hstGs( nevt );
 #else
   PinnedHostBufferGs hstGs( nevt );
@@ -333,7 +334,7 @@ main( int argc, char** argv )
   }

   // Memory buffers for matrix elements
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferMatrixElements hstMatrixElements( nevt );
 #else
   PinnedHostBufferMatrixElements hstMatrixElements( nevt );
@@ -342,7 +343,7 @@ main( int argc, char** argv )

   // Memory buffers for random numbers for helicity selection
   // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferRndNumHelicity hstRndHel( nevt );
 #else
   PinnedHostBufferRndNumHelicity hstRndHel( nevt );
@@ -351,7 +352,7 @@ main( int argc, char** argv )

   // Memory buffers for random numbers for color selection
   // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferRndNumColor hstRndCol( nevt );
 #else
   PinnedHostBufferRndNumColor hstRndCol( nevt );
@@ -359,7 +360,7 @@ main( int argc, char** argv )
 #endif

   // Memory buffers for helicity selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferSelectedHelicity hstSelHel( nevt );
 #else
   PinnedHostBufferSelectedHelicity hstSelHel( nevt );
@@ -367,7 +368,7 @@ main( int argc, char** argv )
 #endif

   // Memory buffers for color selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferSelectedColor hstSelCol( nevt );
 #else
   PinnedHostBufferSelectedColor hstSelCol( nevt );
@@ -403,7 +404,7 @@ main( int argc, char** argv )
 #else
   else
   {
-    throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
+    throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement)
   }
 #endif
 #else
@@ -421,7 +422,7 @@ main( int argc, char** argv )
   }
   else
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     prsk.reset( new RamboSamplingKernelDevice( energy,
devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? 
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
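check_sa.cc above now instantiates a GpuRuntime where it previously instantiated a CudaRuntime, and the CudaRuntime.h deleted earlier in this patch shows exactly what that helper did. A backend-neutral replacement could look like the following sketch, reconstructed from the deleted file rather than copied from the actual GpuRuntime.h, reusing checkGpu and assuming gpuSetDevice/gpuDeviceReset aliases in the same style as the other wrappers:

    #include <iostream>
    #if defined __CUDACC__
    #define gpuSetDevice cudaSetDevice
    #define gpuDeviceReset cudaDeviceReset
    #elif defined __HIPCC__
    #define gpuSetDevice hipSetDevice
    #define gpuDeviceReset hipDeviceReset
    #endif
    struct GpuRuntime final
    {
      GpuRuntime( const bool debug = true )
        : m_debug( debug ) { setUp( m_debug ); }
      ~GpuRuntime() { tearDown( m_debug ); }
      GpuRuntime( const GpuRuntime& ) = delete;
      GpuRuntime& operator=( const GpuRuntime& ) = delete;
      bool m_debug;
      // Select device 0 up front so initialisation cost is booked here,
      // not hidden inside the first GPU API call (same rationale as before)
      static void setUp( const bool debug = true )
      {
        if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl;
        checkGpu( gpuSetDevice( 0 ) );
      }
      // Reset the device on shutdown; only needed for leak-checking tools
      static void tearDown( const bool debug = true )
      {
        if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl;
        checkGpu( gpuDeviceReset() );
      }
    };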
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index 43cee0977e..77334d2c04 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -103,69 +103,135 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. 
- # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. 
Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of nvcc + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
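+ # For example, exporting MADGRAPH_CUDA_ARCHITECTURE=70,80 expands CUARCHFLAGS above to
+ # -gencode arch=compute_70,code=compute_70 -gencode arch=compute_70,code=sm_70
+ # -gencode arch=compute_80,code=compute_80 -gencode arch=compute_80,code=sm_80
+ # i.e. embedded SASS plus forward-compatible PTX for each architecture in the list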
+ CUOPTFLAGS = -lineinfo
+ GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
+ ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+ ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
+ GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h
+ # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+ CUBUILDRULEFLAGS = -Xcompiler -fPIC -c
+ CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu
+ CUDATESTFLAGS = -lcuda
+ else ifneq ($(origin REQUIRE_CUDA),undefined)
+ # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443)
+ $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH))
+ else
+ # No cuda. Switch cuda compilation off and go to common random numbers in C++
+ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
+ override GPUCC=
+ override USE_NVTX=
+ override CUINC=
+ override CURANDLIBFLAGS=
+ endif
+ export GPUCC
+ export GPUFLAGS
+
+ # Set the host C++ compiler for GPUCC via "-ccbin "
+ # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
+ GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+
+ # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+ ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+ GPUFLAGS += -allow-unsupported-compiler
+ endif
+
+else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc)
+ #=== Configure the HIP compiler
+
+ # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable HIP builds (issue #505)
+ # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below
+ ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
+ $(warning HIP builds are not supported for multi-word CXX "$(CXX)")
+ override HIP_HOME=disabled
+ endif
+
+ # If HIP_HOME is not set, try to set it from the location of hipcc
+ ifndef HIP_HOME
+ HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH))
+ $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+ endif
-# Set the host C++ compiler for nvcc via "-ccbin "
-# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..."
is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+ # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists
+ ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
+ GPUCC = $(HIP_HOME)/bin/hipcc
+
+ # Should maybe find something equivalent to this in HIP
+ #USE_NVTX ?=-DUSE_NVTX
+
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+ HIPINC = -I$(HIP_HOME)/include/
+
+ # -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP
+ # (But only for single precision, see line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+ GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+ ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+ GPUFLAGS += -std=c++17
+ # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+
+ CUBUILDRULEFLAGS = -fPIC -c
+ CCBUILDRULEFLAGS = -fPIC -c
+
+ else ifneq ($(origin REQUIRE_HIP),undefined)
+ # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443)
+ $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
+ else
+ # No hip. Switch hip compilation off and go to common random numbers in C++
+ $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
+ override GPUCC=
+ override USE_NVTX=
+ override CUINC=
+ override CURANDLIBFLAGS=
+ endif
+
+ # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+ ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+ GPUFLAGS += -allow-unsupported-compiler
+ endif
-
-# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-CUFLAGS += -allow-unsupported-compiler
 endif

#-------------------------------------------------------------------------------
@@ -179,9 +245,9 @@ endif
 #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
 # override AR:=ccache $(AR)
 #endif
-ifneq ($(NVCC),)
-  ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
-    override NVCC:=ccache $(NVCC)
+ifneq ($(GPUCC),)
+  ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
+    override GPUCC:=ccache $(GPUCC)
 endif
 endif

@@ -205,7 +271,7 @@ endif
 # PowerPC-specific CUDA compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le)
- CUFLAGS+= -Xcompiler -mno-float128
+ GPUFLAGS+= -Xcompiler -mno-float128
endif

#-------------------------------------------------------------------------------
@@ -215,10 +281,10 @@ endif
 # Set the default OMPFLAGS choice
 ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
 override OMPFLAGS = -fopenmp
-###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578)
+###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578)
 else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),)
 override OMPFLAGS = -fopenmp
-###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578)
+###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578)
 else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),)
 override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578)
 else
@@ -269,7 +335,10 @@ endif
 # Set the default RNDGEN (random number generator) choice
 ifeq ($(RNDGEN),)
-  ifeq ($(NVCC),)
+  ifeq ($(GPUCC),)
+    override RNDGEN = hasNoCurand
+  # Edge case for HIP compilation
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
     override RNDGEN = hasNoCurand
   else ifeq ($(RNDGEN),)
     override RNDGEN = hasCurand
@@ -344,13 +413,13 @@ CXXFLAGS+= $(AVXFLAGS)
 $(info FPTYPE=$(FPTYPE))
 ifeq ($(FPTYPE),d)
   CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
-  CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
+  GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
 else ifeq ($(FPTYPE),f)
   CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
-  CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
+  GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
 else ifeq ($(FPTYPE),m)
   CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
-  CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
+  GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
 else
   $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported)
 endif
@@ -359,7 +428,7 @@ endif
 $(info HELINL=$(HELINL))
 ifeq ($(HELINL),1)
   CXXFLAGS += -DMGONGPU_INLINE_HELAMPS
-  CUFLAGS += -DMGONGPU_INLINE_HELAMPS
+  GPUFLAGS += -DMGONGPU_INLINE_HELAMPS
 else ifneq ($(HELINL),0)
   $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported)
 endif
@@ -368,7 +437,7 @@ endif
 $(info HRDCOD=$(HRDCOD))
 ifeq ($(HRDCOD),1)
   CXXFLAGS += -DMGONGPU_HARDCODE_PARAM
-  CUFLAGS += -DMGONGPU_HARDCODE_PARAM
+  GPUFLAGS += -DMGONGPU_HARDCODE_PARAM
 else ifneq ($(HRDCOD),0)
   $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported)
 endif
@@ -420,11 +489,11 @@ ifeq ($(UNAME_S),Darwin)
 override CULIBFLAGSRPATH2 =
 else
 # RPATH to cuda/cpp libs when linking executables
-  override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH)
-  override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH)
+  override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH)
+  override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH)
 # RPATH to common lib when linking cuda/cpp libs
-  override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN'
-  override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN'
+  override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN'
+  override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN'
 endif

 # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac)
@@ -437,7 +506,7 @@
 override RUNTIME =

 cxx_main=$(BUILDDIR)/check.exe
 fcxx_main=$(BUILDDIR)/fcheck.exe

-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 cu_main=$(BUILDDIR)/gcheck.exe
 fcu_main=$(BUILDDIR)/fgcheck.exe
 else
@@ -468,15 +537,16 @@ $(BUILDDIR)/.build.$(TAG):
 @touch $(BUILDDIR)/.build.$(TAG)

 # Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
- $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+ $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@

 $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
- $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+ $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
 endif
+# NB: the -x cu flag from the rule above is now part of CCBUILDRULEFLAGS (CUDA only)

 # Generic target and build rules: objects from C++ compilation
 # (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -485,11 +555,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@

 # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516)
+# Added edge case for HIP compilation
 ifeq ($(shell $(CXX) --version | grep ^nvc++),)
 $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS))
 $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
 endif
 endif

@@ -505,10 +578,10 @@ ifeq ($(RNDGEN),hasCurand)
 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
 endif

-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592)
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592)
 ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),)
-ifneq ($(NVCC),)
-CUFLAGS += -Xcompiler -Wno-deprecated-builtins
+ifneq ($(GPUCC),)
+GPUFLAGS += -Wno-deprecated-builtins
 endif
 endif

@@ -516,8 +589,8 @@ endif
 # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -544,7 +617,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -556,11 +629,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -577,16 +650,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -612,17 +685,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -634,7 +707,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -647,7 +720,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -659,12 +732,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -688,14 +761,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
 $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
 $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
 else # link both runTest.o and runTest_cu.o
 $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
- $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+ $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
 endif

 # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215
@@ -798,9 +871,9 @@ ifeq ($(USECCACHE),1)
 ccache --version | head -1
 endif
 @echo ""
- @echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
- $(NVCC) --version
+ @echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+ $(GPUCC) --version
 endif
 @echo ""
 @echo CXX=$(CXX)
@@ -819,7 +892,7 @@ endif
 # Target: check (run the C++ test executable)
 # [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 check: runTest cmpFcheck cmpFGcheck
 else
 check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc
index f93c05b0b3..2b956730d4 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

 extern "C"
 {
@@ -22,7 +22,7 @@ extern "C"
 * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
 * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
 */
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using namespace mg5amcGpu;
 #else
 using namespace mg5amcCpu;
@@ -46,8 +46,8 @@ extern "C"
 */
 void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
 {
-#ifdef __CUDACC__
- CudaRuntime::setUp();
+#ifdef MGONGPUCPP_GPUIMPL
+ GpuRuntime::setUp();
 #endif
 // Create a process object, read parm card and set parameters
 // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
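// A minimal sketch of the GpuRuntime helper introduced above, assuming GpuRuntime.h
// maps gpu* aliases onto the corresponding CUDA or HIP runtime calls (the alias names
// are illustrative; only GpuRuntime::setUp and GpuRuntime::tearDown appear in the patch):
#ifdef __CUDACC__
#define gpuFree cudaFree
#define gpuDeviceReset cudaDeviceReset
#elif defined __HIPCC__
#include "hip/hip_runtime.h"
#define gpuFree hipFree
#define gpuDeviceReset hipDeviceReset
#endif
struct GpuRuntime final
{
  static void setUp() { gpuFree( 0 ); }        // free-nullptr idiom: force early creation of the device context
  static void tearDown() { gpuDeviceReset(); } // e.g. needed by cuda-memcheck --leak-check full
};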
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc index df7488178e..4c9bc9ee6b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -117,7 +117,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -252,7 +252,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc index e40f635e46..016bc0f472 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -76,7 +76,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -321,7 +321,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h index 03afcd6a5f..2af1c4d232 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc index 22fdd96a68..01e7d9bcf2 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h index 11fd9e3c74..b44537e599 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk index 554d7a704c..f73ff6fa03 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk @@ -38,13 +38,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -246,20 +246,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
- $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+ $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@

#-------------------------------------------------------------------------------

 cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
 endif

 # Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
 @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
- $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects)
+ $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects)
 else
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
 @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h
index c0f067f1d8..205accb85b 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.

 #ifndef MGONGPUCONFIG_H
 #define MGONGPUCONFIG_H 1

@@ -10,13 +10,27 @@
 // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel)
 #undef MGONGPU_SUPPORTS_MULTICHANNEL

+// Is this a GPU (CUDA, HIP) or CPU implementation?
+#ifdef __CUDACC__
+#define MGONGPUCPP_GPUIMPL cuda
+#elif defined __HIPCC__
+#define MGONGPUCPP_GPUIMPL hip
+#include "hip/hip_runtime.h"
+#else
+#undef MGONGPUCPP_GPUIMPL
+#endif
+
 // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12"
 // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs)

 // Choose if curand is supported for generating random numbers
+// For CUDA, by default, it is supported
+// For HIP, by default, it is not supported
 // For C++, by default, it is supported, but allow this macro to be set from outside with e.g.
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +66,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +103,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif 
#endif
@@ -132,7 +155,7 @@ namespace mgOnGpu
 // Alignment requirement for using reinterpret_cast with SIMD vectorized code
 // (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
 // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
 #endif

@@ -143,7 +166,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;

 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -173,9 +196,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
-#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
+#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -187,8 +210,8 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */

-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
 #define __global__
 #define __host__
 #define __device__
diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h
index b56348bc58..6ae0c42ecb 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
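// A condensed illustration (a sketch, not a plugin source file) of the pattern these
// header changes converge on: every GPU-vs-CPU branch now tests the single
// MGONGPUCPP_GPUIMPL switch from mgOnGpuConfig.h instead of testing __CUDACC__ and
// __HIPCC__ separately in each file:
#include "mgOnGpuConfig.h"
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // device code, whether the build is CUDA or HIP
#else
namespace mg5amcCpu // host C++/SIMD code
#endif
{
  // ... identical source lines compiled into the GPU or CPU namespace ...
}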
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -209,14 +209,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -249,7 +249,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -301,7 +301,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -337,11 +337,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -556,11 +556,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -604,7 +604,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -627,7 +627,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace 
mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef 
__CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttg.sa/src/rambo.h b/epochX/cudacpp/gg_ttg.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 8c2a3bf79e..17d1780ff2 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg.mg +import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0053708553314208984  +DEBUG: model prefixing takes 0.0033278465270996094  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.144 s +1 processes with 123 diagrams generated in 0.114 s Total: 1 processes with 123 diagrams output madevent CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -166,10 +166,10 @@ Load PLUGIN.CUDACPP_SA_OUTPUT INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/Cards  -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/SubProcesses  +WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg  +INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg +WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/Cards  +WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxgg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -192,15 +192,15 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [model_handling.py at line 1163]  DEBUG: multi_channel =  {1: [1], 2: [2], 3: [3], 4: [4], 5: [5], 6: [6], 7: [7], 8: [8], 9: [9], 10: [10], 11: [11], 12: [12], 13: [13], 14: [14], 15: [15], 16: [16], 17: [17], 18: [18], 19: [19], 20: [20], 21: [21], 22: [22], 23: [23], 24: [24], 25: [25], 26: [26], 27: [27], 28: [28], 29: [29], 30: [30], 31: [32], 32: [33], 33: [34], 34: [35], 35: [36], 36: [37], 37: [38], 38: [39], 39: [40], 40: [41], 41: [42], 42: [43], 43: [44], 44: [45], 45: [46], 46: [48], 47: [49], 48: [50], 49: [51], 50: [52], 51: [53], 52: [54], 53: [55], 54: [56], 55: [58], 56: [59], 57: [60], 58: [61], 59: [62], 60: [63], 61: [64], 62: [65], 63: [66], 64: [67], 65: [68], 66: [69], 67: [70], 68: [71], 69: [72], 70: [74], 71: [75], 72: [76], 73: [77], 74: [78], 75: [79], 76: [80], 77: [81], 78: [82], 79: [83], 80: [84], 81: [85], 82: [86], 83: [87], 84: [88], 85: [89], 86: [90], 87: [91], 88: [93], 89: [94], 90: [95], 91: [96], 92: [97], 93: [98], 94: [100], 95: [101], 96: [102], 97: [103], 98: [104], 99: [105], 100: [107], 101: [108], 102: [109], 103: [110], 104: [111], 105: [112]} [model_handling.py at line 1169]  DEBUG: multi_channel_map =  {1: [1], 2: [2], 3: [3], 4: [4], 5: [5], 6: [6], 7: [7], 8: [8], 9: [9], 10: [10], 11: [11], 12: [12], 13: [13], 14: [14], 15: [15], 16: [16], 17: [17], 18: [18], 19: [19], 20: [20], 21: [21], 22: [22], 23: [23], 24: [24], 25: [25], 26: [26], 27: [27], 28: [28], 29: [29], 30: [30], 31: [32], 32: [33], 33: [34], 34: [35], 35: [36], 36: [37], 37: [38], 38: [39], 39: [40], 40: [41], 41: [42], 42: [43], 43: [44], 44: [45], 45: [46], 46: [48], 47: [49], 48: [50], 49: [51], 50: [52], 51: [53], 52: [54], 53: [55], 54: [56], 55: [58], 56: [59], 57: [60], 58: [61], 59: [62], 60: [63], 61: [64], 62: [65], 63: [66], 64: [67], 65: [68], 66: [69], 67: [70], 68: [71], 69: [72], 70: [74], 71: [75], 72: [76], 73: [77], 74: [78], 75: [79], 76: [80], 77: [81], 78: [82], 79: [83], 80: [84], 81: [85], 82: [86], 83: [87], 84: [88], 85: [89], 86: [90], 87: [91], 88: [93], 89: [94], 90: [95], 91: [96], 92: [97], 93: [98], 94: [100], 95: [101], 96: [102], 97: [103], 98: [104], 99: [105], 100: [107], 101: [108], 102: [109], 103: [110], 104: [111], 105: [112]} [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, 11: 8, 12: 9, 13: 10, 14: 11, 15: 12, 16: 13, 17: 14, 18: 15, 19: 16, 20: 17, 21: 18, 22: 19, 23: 20, 24: 21, 25: 22, 26: 23, 27: 24, 28: 25, 29: 26, 30: 27, 31: 28, 32: 29, 33: 30, 37: 31, 38: 32, 39: 33, 40: 34, 41: 35, 42: 36, 43: 37, 44: 38, 45: 39, 46: 40, 47: 41, 48: 42, 49: 43, 50: 44, 51: 45, 55: 46, 56: 47, 57: 48, 58: 49, 59: 50, 60: 51, 61: 52, 62: 53, 63: 54, 67: 55, 68: 56, 69: 57, 70: 58, 71: 59, 72: 60, 73: 
61, 74: 62, 75: 63, 76: 64, 77: 65, 78: 66, 79: 67, 80: 68, 81: 69, 85: 70, 86: 71, 87: 72, 88: 73, 89: 74, 90: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 100: 85, 101: 86, 102: 87, 106: 88, 107: 89, 108: 90, 109: 91, 110: 92, 111: 93, 115: 94, 116: 95, 117: 96, 118: 97, 119: 98, 120: 99, 124: 100, 125: 101, 126: 102, 127: 103, 128: 104, 129: 105} [model_handling.py at line 1709]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1822]  +DEBUG: diag_to_config =  {4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, 11: 8, 12: 9, 13: 10, 14: 11, 15: 12, 16: 13, 17: 14, 18: 15, 19: 16, 20: 17, 21: 18, 22: 19, 23: 20, 24: 21, 25: 22, 26: 23, 27: 24, 28: 25, 29: 26, 30: 27, 31: 28, 32: 29, 33: 30, 37: 31, 38: 32, 39: 33, 40: 34, 41: 35, 42: 36, 43: 37, 44: 38, 45: 39, 46: 40, 47: 41, 48: 42, 49: 43, 50: 44, 51: 45, 55: 46, 56: 47, 57: 48, 58: 49, 59: 50, 60: 51, 61: 52, 62: 53, 63: 54, 67: 55, 68: 56, 69: 57, 70: 58, 71: 59, 72: 60, 73: 61, 74: 62, 75: 63, 76: 64, 77: 65, 78: 66, 79: 67, 80: 68, 81: 69, 85: 70, 86: 71, 87: 72, 88: 73, 89: 74, 90: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 100: 85, 101: 86, 102: 87, 106: 88, 107: 89, 108: 90, 109: 91, 110: 92, 111: 93, 115: 94, 116: 95, 117: 96, 118: 97, 119: 98, 120: 99, 124: 100, 125: 101, 126: 102, 127: 103, 128: 104, 129: 105} [model_handling.py at line 1711]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1824]  INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  @@ -210,22 +210,22 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxgg.txt [model_handling.py at line 1335]  +DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxgg.txt [model_handling.py at line 1335]  DEBUG: proc_id =  1 [export_cpp.py at line 710]  DEBUG: config_map =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.385 s -Wrote files for 222 helas calls in 0.669 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.305 s +Wrote files for 222 helas calls in 0.676 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.277 s +ALOHA: aloha creates 5 routines in 0.233 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.268 s +ALOHA: aloha creates 10 routines in 0.211 s VVV1 VVV1 FFV1 @@ -246,8 +246,8 @@ ALOHA: aloha creates 10 routines in 0.268 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/src/. +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) @@ -256,22 +256,22 @@ super_write_set_parameters_onlyfixMajorana (hardcoded=True) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/src/. +INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/src/. and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +save configuration file to /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -Output to directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg done. +Output to directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/README +/afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. 
quit -real 0m3.653s -user 0m3.071s -sys 0m0.363s +real 0m5.318s +user 0m2.189s +sys 0m1.585s diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt index 00d7c6f8d6..9e9ed9d752 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt @@ -234,7 +234,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo +#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo +#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc index ec923afd6d..1b5bf6ec54 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV1_1.o VVVV4_0.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3_0.o VVVV1_0.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o +ALOHARoutine = VVV1_0.o VVV1P0_1.o FFV1_0.o FFV1_1.o FFV1_2.o FFV1P0_3.o VVVV1_0.o VVVV1P0_1.o VVVV3_0.o VVVV3P0_1.o VVVV4_0.o VVVV4P0_1.o diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model.pkl b/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model.pkl index c67363a41f7befb95905219d5f8068b362abbe2f..b6989c1453094d7f45cf2ee4b2124efa29e9064b 100644 GIT binary patch delta 44 zcmX?hj%n{XrVZZ9zi?a2z^s`D*Gt>1a7cIL20CztS A_y7O^ delta 53 zcmdmcj_KGrrVZZ9)Uy~E81z#TOA_@H%Mx=Ei;FY$-2+0642+EReceqHeVz5wGZM>m JCuc6Z0ssYc66F8@ diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 7204c0dd4c..02e4d54a65 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg.mg +import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005178928375244141  +DEBUG: model prefixing takes 0.0037162303924560547  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.145 s +1 processes with 123 diagrams generated in 0.122 s Total: 1 processes with 123 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -163,7 +163,7 @@ Load PLUGIN.CUDACPP_SA_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg +INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -173,12 +173,12 @@ INFO: Processing color information for process: g g > t t~ g g @1 DEBUG: type(me)= me=0 [output.py at line 189]  DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: proc_id =  0 [model_handling.py at line 1045]  -INFO: Creating files in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg +INFO: Creating files in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is not yet defined: this is standalone_cudacpp mode [model_handling.py at line 1301]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]  DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]  DEBUG: self.include_multi_channel =  False [model_handling.py at line 1144]  @@ -186,16 +186,16 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1162]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1163]  DEBUG: multi_channel_map =  None [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1709]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: 
('ZERO', 4, 1, 4, 4) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1822]  -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. +DEBUG: diag_to_config =  {} [model_handling.py at line 1711]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1824]  +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]  @@ -203,8 +203,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxgg.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.378 s +DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxgg.txt [model_handling.py at line 1335]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.305 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -212,7 +212,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.271 s +ALOHA: aloha creates 5 routines in 0.234 s VVV1 VVV1 FFV1 @@ -225,8 +225,8 @@ ALOHA: aloha creates 5 routines in 0.271 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. 
+FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) @@ -235,13 +235,13 @@ super_write_set_parameters_onlyfixMajorana (hardcoded=True) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. +INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m1.686s -user 0m1.484s -sys 0m0.120s +real 0m1.437s +user 0m1.070s +sys 0m0.150s diff --git a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h index d7e629cacd..f37c972b24 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
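Note: almost every source hunk in this patch series makes the same one-line substitution, the guard #ifdef __CUDACC__ becomes #ifdef MGONGPUCPP_GPUIMPL, so the GPU/CPU split no longer tests for the NVIDIA compiler specifically but for a generic GPU-implementation flag. A minimal sketch of the recurring dual-namespace pattern that these guards select (namespace names taken from the hunks themselves; the body is elided):

#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU build: declarations compiled for the device
#else
namespace mg5amcCpu // CPU build: the same declarations compiled host-side
#endif
{
  // one common source body follows in either namespace; keeping the two
  // builds in distinct namespaces lets CPU-only and GPU objects be linked
  // into a single executable without multiply-defined symbols (see the
  // references to issues #318 and #725 repeated throughout these hunks)
}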
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
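Note: the Bridge.h gpu_sequence hunks above replace raw CUDA calls, checkCuda( cudaMemcpy( ... ) ) and the kernel<<<blocks, threads>>> launch syntax, with gpuMemcpy and gpuLaunchKernel; the BridgeKernels.cc hunk just below pulls these in through the new GpuAbstraction.h include. The header body is not part of this excerpt, so the following CUDA-side definitions are a sketch of what it plausibly contains, not a quotation:

// Hypothetical CUDA branch of GpuAbstraction.h, reconstructed from the call
// sites in this patch; the real header presumably also carries an AMD branch.
#include <cuda_runtime.h>

#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpy( dst, src, bytes, kind ) cudaMemcpy( dst, src, bytes, kind )
// (whether gpuMemcpy folds in an error check, as the dropped checkCuda wrapper
// suggests, is an assumption; see the checkGpu sketch at the end of this section)

// Variadic launch macro: hiding the triple-chevron syntax behind a macro lets
// an AMD build map the identical call site onto hipLaunchKernelGGL instead.
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )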
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h index ffe3b84d53..176338151a 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc index 30257195b6..d6d6c4f179 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
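Note: computeMatrixElements above launches sigmaKin with an explicit shared-memory size, so the abstraction needs a second launch macro. A sketch of the CUDA side, with the macro name taken from the hunks and the body assumed:

// Shared-memory-aware variant of the launch wrapper (CUDA branch only)
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  kernel<<<( blocks ), ( threads ), ( sharedMem )>>>( __VA_ARGS__ )

// matching the call site in the hunk above:
//   gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, ... );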
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
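The neppM remark in the MemoryAccessMomenta.h hunk above is easier to follow with the AOSOA indexing written out. A sketch under assumed values (np4 and npar match this process; neppM = 4 is just an example; the real accessors live in MemoryAccessMomenta.h):

// Illustrative AOSOA indexing (simplified; not the plugin's actual accessor)
#include <cstddef>
constexpr int np4 = 4;   // E, px, py, pz
constexpr int npar = 6;  // g g > t t~ g g has 6 external particles
constexpr int neppM = 4; // events per page (example value, a power of 2)

inline std::size_t momentumIndex( int ievt, int ipar, int ip4 )
{
  const int ipage = ievt / neppM; // which AOSOA page
  const int ieppM = ievt % neppM; // event slot inside the page
  return ( ( static_cast<std::size_t>( ipage ) * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}

Consecutive events in a page differ only in the last index, so adjacent GPU threads read adjacent memory (coalesced access) and CPU SIMD lanes can load one vector per momentum component.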
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class 
HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
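The gpuMallocHost / gpuMalloc / gpuFree / gpuFreeHost calls in the buffer classes above suggest thin wrappers over the vendor APIs. A hypothetical sketch of the mapping (checkGpu is the error-checking helper seen elsewhere in this patch; the CUDA-to-HIP pairing is an assumption, not the plugin's verified header):

// Hypothetical sketch of memory-management wrappers (illustration only)
#if defined __CUDACC__
#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#elif defined __HIPCC__
#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
#endif

Folding checkGpu into the macro is what allows the constructors and destructors above to drop the explicit checkCuda(...) wrapper without losing error checking.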
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index 53ef4c5751..0d88d93225 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu 
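Taken together, copyDeviceFromHost and copyHostFromDevice above give the usual host-to-device round trip. A small usage sketch with the gpu* wrappers from this patch (wrapper signatures are inferred from the call sites here; the function and buffer names are illustrative):

// Hypothetical round trip using the gpu* wrappers (illustration only)
#include <vector>
void roundTrip( const int nevt )
{
  std::vector<double> hstMEs( nevt, 0. ); // host-side buffer
  double* devMEs = nullptr;
  gpuMalloc( &devMEs, nevt * sizeof( double ) ); // device-side buffer
  gpuMemcpy( devMEs, hstMEs.data(), nevt * sizeof( double ), gpuMemcpyHostToDevice );
  // ... launch a kernel that fills devMEs ...
  gpuMemcpy( hstMEs.data(), devMEs, nevt * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuFree( devMEs );
}

As the NB (PR #45) comments above note, the copy is cheaper when the host side is pinned memory, which is why the buffer classes also provide PinnedHostBuffer variants.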
#endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -2474,7 +2475,7 @@ namespace mg5amcCpu { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -2531,7 +2532,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -2590,7 +2591,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -2685,8 +2686,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -2728,9 +2729,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -2768,7 +2769,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -2833,12 +2834,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -2859,7 +2860,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -2988,9 +2989,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -3014,7 +3015,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -3035,7 +3036,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -3049,9 +3050,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -3079,7 +3083,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -3283,7 +3287,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
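The computeDependentCouplings signature above illustrates the single-source pattern used throughout the patch: the same function body compiles either as a GPU kernel (one event per thread, no nevt argument) or as a CPU function that loops over an explicit nevt. Stripped down to its skeleton (hypothetical function, not from the patch):

// Illustrative single-source kernel/function pattern (hypothetical example)
#ifdef MGONGPUCPP_GPUIMPL
__global__ void scaleGs( const double* allgs, double* out )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
  out[ievt] = 2. * allgs[ievt];
}
#else
void scaleGs( const double* allgs, double* out, const int nevt )
{
  for( int ievt = 0; ievt < nevt; ievt++ ) out[ievt] = 2. * allgs[ievt]; // explicit event loop
}
#endif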
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h index b3323a7a84..5fa603d43c 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. 
Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -292,7 +293,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +301,7 @@ #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +309,7 @@ #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +317,7 @@ #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +334,7 @@ } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +343,7 @@ // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +352,7 @@ // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +360,7 @@ #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +368,7 @@ #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -403,7 +404,7 @@ #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -421,7 +422,7 @@ } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy,
devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE?
if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
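The "Complex type" printout above now folds CUDA, HIP and C++ into one preprocessor chain. In isolation, the selection mirrors this sketch (the macro names are the ones used in the hunk; the helper function is hypothetical):

// Sketch of the complex-type selection shown above (illustration only)
#include <string>
inline std::string complexTypeName()
{
#if defined MGONGPU_CUCXTYPE_CUCOMPLEX
  return "CUCOMPLEX";
#elif defined MGONGPU_CUCXTYPE_THRUST
  return "THRUST::COMPLEX";
#elif defined MGONGPU_CUCXTYPE_CXSMPL
  return "STD::COMPLEX";
#else
  return "???"; // no path to this statement
#endif
}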
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index 43cee0977e..77334d2c04 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -103,69 +103,135 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. 
- # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. 
Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of nvcc + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+ CUOPTFLAGS = -lineinfo + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + export GPUCC + export GPUFLAGS + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + +else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) + #=== Configure the HIP compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If HIP_HOME is not set, try to set it from the location of GPUCC + ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") + endif -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+ # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists
+ ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
+ GPUCC = $(HIP_HOME)/bin/hipcc
+
+ # Should maybe find something equivalent to this in HIP
+ #USE_NVTX ?=-DUSE_NVTX
+
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+ HIPINC = -I$(HIP_HOME)/include/
+
+ # -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP
+ # (but only for single precision, see line 208 of https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+ GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+ ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+ GPUFLAGS += -std=c++17
+ # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+
+ CUBUILDRULEFLAGS = -fPIC -c
+ CCBUILDRULEFLAGS = -fPIC -c
+
+ else ifneq ($(origin REQUIRE_HIP),undefined)
+ # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443)
+ $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
+ else
+ # No hip. Switch hip compilation off and go to common random numbers in C++
+ $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
+ override GPUCC=
+ override USE_NVTX=
+ override CUINC=
+ override CURANDLIBFLAGS=
+ endif
+
+ # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+ ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+ GPUFLAGS += -allow-unsupported-compiler
+ endif
-# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-CUFLAGS += -allow-unsupported-compiler
endif

#-------------------------------------------------------------------------------
@@ -179,9 +245,9 @@ endif
#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
# override AR:=ccache $(AR)
#endif
-ifneq ($(NVCC),)
-  ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
-    override NVCC:=ccache $(NVCC)
+ifneq ($(GPUCC),)
+  ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
+    override GPUCC:=ccache $(GPUCC)
  endif
endif

@@ -205,7 +271,7 @@ endif
# PowerPC-specific CUDA compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -215,10 +281,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -269,7 +335,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -344,13 +413,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -359,7 +428,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -368,7 +437,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -420,11 +489,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -437,7 +506,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) 
cu_main=$(BUILDDIR)/gcheck.exe
fcu_main=$(BUILDDIR)/fgcheck.exe
else
@@ -468,15 +537,16 @@ $(BUILDDIR)/.build.$(TAG):
	@touch $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@

$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
endif
+# (NB: for the %_cu.o rule, the "-x cu" option is now carried by CCBUILDRULEFLAGS, set in the CUDA configuration above)

# Generic target and build rules: objects from C++ compilation
# (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -485,11 +555,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@

# Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516)
+# Added edge case for HIP compilation (hipcc takes -fno-fast-math directly, without -Xcompiler)
ifeq ($(shell $(CXX) --version | grep ^nvc++),)
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS))
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
endif
endif

@@ -505,10 +578,10 @@ ifeq ($(RNDGEN),hasCurand)
$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
endif

-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592)
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592)
ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),)
-ifneq ($(NVCC),)
-CUFLAGS += -Xcompiler -Wno-deprecated-builtins
+ifneq ($(GPUCC),)
+GPUFLAGS += -Wno-deprecated-builtins
endif
endif

@@ -516,8 +589,8 @@ endif
# This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -544,7 +617,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -556,11 +629,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -577,16 +650,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
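+# [Editor's illustration, not part of the original patch; paths are hypothetical]
+# The backend is now selected from whichever compiler is visible in PATH (nvcc wins
+# if both are found, since the CUDA branch of the configuration is tested first), e.g.:
+#   PATH=/usr/local/cuda/bin:$PATH make   # -> GPUCC=nvcc, links tests with -lcuda via CUDATESTFLAGS
+#   PATH=/opt/rocm/bin:$PATH make         # -> GPUCC=hipcc, same targets and build rules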
#------------------------------------------------------------------------------- @@ -612,17 +685,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -634,7 +707,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -647,7 +720,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -659,12 +732,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -688,14 +761,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
endif

# Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215
@@ -798,9 +871,9 @@ ifeq ($(USECCACHE),1)
	ccache --version | head -1
endif
	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
endif
	@echo ""
	@echo CXX=$(CXX)
@@ -819,7 +892,7 @@ endif

# Target: check (run the C++ test executable)
# [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
check: runTest cmpFcheck cmpFGcheck
else
check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc
index f93c05b0b3..2b956730d4 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

#include "Bridge.h"
#include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

extern "C"
{
@@ -22,7 +22,7 @@ extern "C"
 * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
 * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
 */
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
using namespace mg5amcGpu;
#else
using namespace mg5amcCpu;
@@ -46,8 +46,8 @@ extern "C"
 */
void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
{
-#ifdef __CUDACC__
-  CudaRuntime::setUp();
+#ifdef MGONGPUCPP_GPUIMPL
+  GpuRuntime::setUp();
#endif
  // Create a process object, read parm card and set parameters
  // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
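+// [Editor's sketch, not part of the original patch; the Fortran spelling is illustrative only]
+// Seen from Fortran, the lifecycle that GpuRuntime now brackets is:
+//   CALL FBRIDGECREATE( PBRIDGE, NEVT, NPAR, NP4 )    ! GpuRuntime::setUp() on GPU builds
+//   CALL FBRIDGESEQUENCE( PBRIDGE, MOMENTA, GS, ... ) ! gpu_sequence() on GPU, host implementation otherwise
+//   CALL FBRIDGEDELETE( PBRIDGE )                     ! GpuRuntime::tearDown() on GPU builds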
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc index df7488178e..4c9bc9ee6b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
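+// [Editor's illustration, not part of the original patch] The suites below are
+// instantiated per backend (XTESTID_CPU vs XTESTID_GPU), so the gtest filter can
+// select one side of the build, e.g.:
+//   ./runTest.exe --gtest_filter='*CPU*'   # C++ suites only
+//   ./runTest.exe --gtest_filter='*GPU*'   # CUDA/HIP suites only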
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -117,7 +117,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -252,7 +252,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc index e40f635e46..016bc0f472 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -76,7 +76,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -321,7 +321,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h index 6db5ca82f3..f4ea9f0a8a 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc index 22fdd96a68..01e7d9bcf2 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h index 11fd9e3c74..b44537e599 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk index 554d7a704c..f73ff6fa03 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk @@ -38,13 +38,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -246,20 +246,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@

#-------------------------------------------------------------------------------

cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
endif

# Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects)
else
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
index c0f067f1d8..205accb85b 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.

#ifndef MGONGPUCONFIG_H
#define MGONGPUCONFIG_H 1

@@ -10,13 +10,27 @@
// There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel)
#undef MGONGPU_SUPPORTS_MULTICHANNEL

+// Is this a GPU (CUDA, HIP) or CPU implementation?
+#ifdef __CUDACC__
+#define MGONGPUCPP_GPUIMPL cuda
+#elif defined __HIPCC__
+#define MGONGPUCPP_GPUIMPL hip
+#include "hip/hip_runtime.h"
+#else
+#undef MGONGPUCPP_GPUIMPL
+#endif
+
// ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12"
// ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs)

// Choose if curand is supported for generating random numbers
+// For CUDA, by default, it is supported
+// For HIP, by default, it is not supported
// For C++, by default, it is supported, but allow this macro to be set from outside with e.g.
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +66,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +103,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif 
#endif
@@ -132,7 +155,7 @@ namespace mgOnGpu
// Alignment requirement for using reinterpret_cast with SIMD vectorized code
// (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
// Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
#endif

@@ -143,7 +166,7 @@ using mgOnGpu::fptype;
using mgOnGpu::fptype2;

// C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
#undef MGONGPU_CPPSIMD
#elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
#ifdef MGONGPU_FPTYPE_DOUBLE
@@ -173,9 +196,9 @@ using mgOnGpu::fptype2;
#undef MGONGPU_CPPSIMD
#endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
-#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
+#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ (NSIGHT debug remains CUDA-only, see above)
#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -187,8 +210,8 @@ using mgOnGpu::fptype2;
#define mgDebugFinalise() { /*noop*/ }
#endif /* clang-format on */

-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
#define __global__
#define __host__
#define __device__
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
index b56348bc58..6ae0c42ecb 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
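+// [Editor's note, not part of the original patch] The new macro, defined in
+// mgOnGpuConfig.h above, is effectively a three-way switch:
+//   #ifdef __CUDACC__       -> MGONGPUCPP_GPUIMPL defined   (CUDA build with nvcc)
+//   #elif defined __HIPCC__ -> MGONGPUCPP_GPUIMPL defined   (HIP build with hipcc, includes hip/hip_runtime.h)
+//   #else                   -> MGONGPUCPP_GPUIMPL undefined (C++ build, possibly SIMD)
+// so "#ifdef MGONGPUCPP_GPUIMPL" now reads as "any GPU backend", while truly
+// backend-specific code can still test __CUDACC__ or __HIPCC__ directly.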
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -209,14 +209,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -249,7 +249,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -301,7 +301,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -337,11 +337,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -556,11 +556,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -604,7 +604,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -627,7 +627,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace 
mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ 
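+// [Editor's note, not part of the original patch] Concretely: in a C++ SIMD build
+// one fptype_sv packs neppV events into a compiler vector, while on CUDA/HIP builds
+// neppV=1 (no SIMD, see above) and fptype_sv collapses to a plain scalar fptype,
+// one event per GPU thread.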
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttgg.sa/src/rambo.h b/epochX/cudacpp/gg_ttgg.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index f8ad8149f8..a335b4d0a4 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg.mg +import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00522160530090332  +DEBUG: model prefixing takes 0.003568410873413086  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.741 s +1 processes with 1240 diagrams generated in 1.395 s Total: 1 processes with 1240 diagrams output madevent CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -166,20 +166,20 @@ Load PLUGIN.CUDACPP_SA_OUTPUT INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/Cards  -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/SubProcesses  +WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg  +INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg +WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/Cards  +WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1592 term in 32s. Introduce 2768 contraction +INFO: Color-Flow passed to 1592 term in 24s. Introduce 2768 contraction DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . 
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]
DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]
@@ -194,17 +194,17 @@ FileWriter fo
DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 0, 3, 4, 0, 5, 6, 0, ... 943, 944, 945, 0, ..., 0] (per-amplitude channel list, 0 = no dedicated channel; 945 non-zero configs, full dump elided) [model_handling.py at line 1163]
DEBUG: multi_channel =  {1: [0], 2: [1], 3: [3], 4: [4], 5: [6], 6: [7], ..., 945: [1121]} (945 configs, each mapped to its amplitude indices; full dump elided) [model_handling.py at line 1169]
DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [3], ..., 945: [1121]} (identical to multi_channel above; full dump elided) [model_handling.py at line 1654]
-DEBUG: diag_to_config =  {1: 1, 2: 2, 6: 3, 7: 4, 11: 5, 12: 6, ..., 1482: 945} (945 diagram-to-config entries; full dump elided) [model_handling.py at line 1709]
-DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]
-DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1822]
(... four more -DEBUG call/argument pairs for the massless external legs 1, 4, 5 and 6 ...)
+DEBUG: diag_to_config =  {1: 1, 2: 2, 6: 3, 7: 4, 11: 5, 12: 6, ..., 1482: 945} (dump identical to the '-' version above) [model_handling.py at line 1711]
+DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]
+DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]
(... four more +DEBUG call/argument pairs for legs 1, 4, 5 and 6; the old and new logs differ only in the model_handling.py line references, 1709/1821/1822 vs 1711/1823/1824 ...)
INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
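(For orientation, since the dumps above are dense: config_map assigns each helas amplitude either a multichannel config number or 0, meaning "no dedicated channel", and the multi_channel dump is simply its inverse, grouping amplitude indices by config. The following is a minimal illustrative sketch of that inversion — hypothetical names, not the plugin's actual code — using the first entries of the dumps above:)

// invert_config_map.cc - illustrative only, not part of the MG5aMC code base.
// Builds the multi_channel view (config -> amplitude indices) from a
// config_map-style array (one entry per amplitude, 0 = no dedicated channel).
#include <iostream>
#include <map>
#include <vector>

int main()
{
  const std::vector<int> config_map = { 1, 2, 0, 3, 4, 0, 5, 6, 0 }; // first entries of the dump above
  std::map<int, std::vector<int>> multi_channel;
  for( int i = 0; i < (int)config_map.size(); ++i )
    if( config_map[i] != 0 ) multi_channel[config_map[i]].push_back( i );
  for( const auto& [config, amps] : multi_channel )
    std::cout << config << ": [" << amps[0] << "] "; // prints 1: [0] 2: [1] 3: [3] 4: [4] 5: [6] 6: [7]
  std::cout << std::endl;
  return 0;
}

(This reproduces the leading entries of the multi_channel dump; diag_to_config is the analogous map keyed by diagram number rather than amplitude index.)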
DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]
DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]
@@ -214,22 +214,22 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]
DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]
DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]
-DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxggg.txt [model_handling.py at line 1335]
+DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxggg.txt [model_handling.py at line 1335]
DEBUG: proc_id =  1 [export_cpp.py at line 710]
DEBUG: config_map =  [1, 2, 0, 3, 4, 0, 5, 6, 0, ...] (same per-amplitude channel dump as above; full dump elided) [export_cpp.py at line 711]
DEBUG: subproc_number =  0 [export_cpp.py at line 712]
DEBUG: Done [export_cpp.py at line 713]
INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1
INFO: Finding symmetric diagrams for subprocess group gg_ttxggg
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.012 s
-Wrote files for 2281 helas calls in 41.927 s
+Generated helas calls for 1 subprocesses (1240 diagrams) in 4.737 s
+Wrote files for 2281 helas calls in 32.091 s
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 routines
ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates VVVV1 routines
ALOHA: aloha creates VVVV3 routines
ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 5 routines in 0.328 s
+ALOHA: aloha creates 5 routines in 0.233 s
DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 routines
@@ -237,7 +237,7 @@ ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates VVVV1 routines
ALOHA: aloha creates VVVV3 routines
ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 10 routines in 0.269 s
+ALOHA: aloha creates 10 routines in 0.226 s
VVV1
VVV1
FFV1
@@ -250,8 +250,8 @@ ALOHA: aloha creates 10 routines in 0.269 s
VVVV3
VVVV4
VVVV4
-FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h
-INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/src/.
+FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h
+INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/src/.
super_write_set_parameters_onlyfixMajorana (hardcoded=False)
DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]
super_write_set_parameters_onlyfixMajorana (hardcoded=True)
@@ -260,22 +260,22 @@ super_write_set_parameters_onlyfixMajorana (hardcoded=True)
DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]
DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]
DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]
-FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h
-FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc
+FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h
+FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc
INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
-INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/src/.
+INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/src/. and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/src/.
The option zerowidth_tchannel is modified [True] but will not be written in the configuration files.
If you want to make this value the default for future session, you can run 'save options --all'
-save configuration file to /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt
+save configuration file to /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt
INFO: Use Fortran compiler gfortran
INFO: Use c++ compiler g++
INFO: Generate web pages
-Output to directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg done.
+Output to directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg done.
Type "launch" to generate events from this process, or see
-/data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/README
+/afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/README
Run "open index.html" to see more information about this process.
quit

-real	0m52.327s
-user	0m50.885s
-sys	0m1.141s
+real	0m43.400s
+user	0m38.584s
+sys	0m3.126s
diff --git a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT
index a134b5fef9..84a883fbb0 100644
--- a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT
+++ b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT
@@ -15,6 +15,7 @@ The full development team currently includes the following authors :
 Stephan Hageboeck (CERN)
 Olivier Mattelaer (Universite Catholique de Louvain, original author)
 Stefan Roiser (CERN, original author)
+ Joergen Teig (CERN)
 Andrea Valassi (CERN, original author)
 Zenny Wettersten (CERN)
 See https://github.com/madgraph5/madgraph4gpu for more details. For the full
diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt
index 00d7c6f8d6..9e9ed9d752 100644
--- a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt
+++ b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt
@@ -234,7 +234,7 @@
 # pineappl = pineappl

-#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo
+#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo
 # MG5 MAIN DIRECTORY
-#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo
+#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo
diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc
index ec923afd6d..1b5bf6ec54 100644
--- a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc
+++ b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc
@@ -1 +1 @@
-ALOHARoutine = FFV1_1.o VVVV4_0.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3_0.o VVVV1_0.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o
+ALOHARoutine = VVV1_0.o VVV1P0_1.o FFV1_0.o FFV1_1.o FFV1_2.o FFV1P0_3.o VVVV1_0.o VVVV1P0_1.o VVVV3_0.o VVVV3P0_1.o VVVV4_0.o VVVV4P0_1.o
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h
index d7e629cacd..f37c972b24 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -82,7 +82,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -149,7 +149,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -186,12 +186,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -208,7 +208,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -232,7 +232,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -250,11 +250,11 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL process.initProc( "../../Cards/param_card.dat" ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -268,7 +268,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -283,14 +283,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -333,7 +333,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -388,7 +388,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
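The Bridge.h hunks above show the central idiom of this patch: the compiler-specific __CUDACC__ guard is replaced by a project-owned MGONGPUCPP_GPUIMPL, because HIP compilers never define __CUDACC__ and would otherwise silently fall into the CPU branch. A minimal sketch of the dual-namespace pattern (backendName is an illustrative helper, not part of the diff):

// One translation unit, compiled twice: GPU builds define MGONGPUCPP_GPUIMPL
// and land in mg5amcGpu, CPU builds land in mg5amcCpu, so both backends can
// be linked into a single executable without symbol clashes.
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  inline const char* backendName()
  {
#ifdef MGONGPUCPP_GPUIMPL
    return "gpu"; // nvcc or hipcc build
#else
    return "cpu"; // host C++ build
#endif
  }
}

Keying the split on a project-owned macro is what opens the sources to AMD GPUs; __NVCC__ and __CUDACC__ survive only where a genuinely CUDA-specific answer is needed, as in the processor-name hunk further down.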
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h index ffe3b84d53..176338151a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index 30257195b6..d6d6c4f179 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
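In the MatrixElementKernels.cc hunks above, every <<<blocks, threads>>> launch becomes gpuLaunchKernel(...) and every checkCuda becomes checkGpu. A plausible CUDA-side sketch of these wrappers, modelled on the deleted assertCuda helper (an assumption: the actual GpuAbstraction.h/GpuRuntime.h may spell them differently, and on HIP the launch macros would map to hipLaunchKernelGGL):

#include <cassert>
#include <cstdio>
// Sketch only (assumes compilation as CUDA, where cudaError_t and the
// chevron syntax are available): fold the error check into one macro and
// expand the launch wrappers back to plain <<<...>>> launches.
inline void assertGpu( cudaError_t code, const char* file, int line, bool abort = true )
{
  if( code != cudaSuccess )
  {
    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
    if( abort ) assert( code == cudaSuccess );
  }
}
#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  kernel<<<( blocks ), ( threads ), ( sharedMem )>>>( __VA_ARGS__ )

Under such definitions, gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ) expands to exactly the chevron launch it replaced, which is why the rewrite is behaviour-neutral on CUDA.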
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h 
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
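The KernelAccessHelper hunk above preserves the one-event-per-GPU-thread convention: device code derives its event index from the grid coordinates, while host code receives nevt and loops explicitly. A minimal self-contained sketch (fillEventIds is illustrative, not from the diff; the Bridge constructor guarantees nevt == gpublocks * gputhreads, so no bounds check is needed):

#ifdef MGONGPUCPP_GPUIMPL
__global__ void fillEventIds( int* out ) // nevt is implicit: gridDim.x * blockDim.x
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
  out[ievt] = ievt; // one event per thread
}
#else
void fillEventIds( int* out, const int nevt ) // CPU: one call handles all events
{
  for( int ievt = 0; ievt < nevt; ++ievt )
    out[ievt] = ievt;
}
#endif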
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
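The neppM comment above encodes the AOSOA layout momenta[npagM][npar][np4][neppM] with nevt = npagM * neppM. A worked sketch of the resulting flat index (an illustrative helper; the real access goes through the MemoryAccessMomenta classes):

// The neppM events of one page are contiguous in the last dimension, so
// consecutive GPU threads reading the same (ipar, ip4) component of
// consecutive events touch consecutive addresses: coalesced global loads.
inline int aosoaIndex( const int ievt, const int ipar, const int ip4,
                       const int npar, const int np4, const int neppM )
{
  const int ipagM = ievt / neppM; // page holding this event
  const int ieppM = ievt % neppM; // event position within the page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}

On GPUs neppM is therefore best chosen so that one page spans whole 32-byte cachelines, as the comment says; in the C++ builds the analogous constraint comes from the SIMD vector width instead.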
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events 
template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
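The buffer classes above tie allocation lifetime to C++ object lifetime: PinnedHostBufferBase and DeviceBufferBase acquire with gpuMallocHost/gpuMalloc in the constructor and release in the destructor. One plausible CUDA-side spelling of these gpu* memory wrappers (an assumption: the repository's GpuAbstraction.h may wrap the error check differently):

// Sketch only: each wrapper folds the error check into the call, so the
// buffer constructors and destructors in the hunks above stay one-liners.
#define gpuMalloc( ptr, bytes ) checkGpu( cudaMalloc( ptr, bytes ) )
#define gpuMallocHost( ptr, bytes ) checkGpu( cudaMallocHost( ptr, bytes ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost

With wrappers of this shape, the copyDeviceFromHost/copyHostFromDevice helpers further down reduce to a size check plus a single gpuMemcpy in the appropriate direction, and pinned host buffers keep the fast-path DMA behaviour noted in the PR #45 comment.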
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index 5459588505..caf3f4c49d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g g WEIGHTED<=5 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 
0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -30018,7 +30019,7 @@ namespace mg5amcCpu { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -30075,7 +30076,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -30134,7 +30135,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -30293,8 +30294,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1, 1 }, { 1, 1, 1, -1, 1, 1, -1 }, { 1, 1, 1, -1, 1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -30337,9 +30338,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... 
}; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -30378,7 +30379,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -30443,12 +30444,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -30469,7 +30470,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -30598,9 +30599,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -30624,7 +30625,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -30645,7 +30646,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -30659,9 +30660,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -30689,7 +30693,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities)
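The cHel/cIPD/cNGoodHel/cGoodHel hunks above all follow one mirroring pattern: tables computed on the host land either in __constant__ device memory (via gpuMemcpyToSymbol) or in file-scope static memory (via plain memcpy), so sigmaKin reads identical names in both builds. A self-contained sketch with illustrative sizes (cHelSketch and copyHelicities are not names from the diff; gpuMemcpyToSymbol is assumed to map to cudaMemcpyToSymbol on CUDA):

#include <cstring>
#ifdef MGONGPUCPP_GPUIMPL
__device__ __constant__ short cHelSketch[4][2]; // lives in GPU constant memory
#else
static short cHelSketch[4][2]; // C++ emulation in file-scope static memory
#endif

inline void copyHelicities( const short tHel[4][2] )
{
#ifdef MGONGPUCPP_GPUIMPL
  cudaMemcpyToSymbol( cHelSketch, tHel, 4 * 2 * sizeof( short ) ); // gpuMemcpyToSymbol in the abstraction
#else
  memcpy( cHelSketch, tHel, 4 * 2 * sizeof( short ) );
#endif
}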
-#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -30893,7 +30897,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index d1dd4d6150..b1f469b1c9 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc +++ 
b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 
1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -292,7 +293,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +301,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +309,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +317,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +334,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +343,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +352,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +360,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +368,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor 
hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -403,7 +404,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -421,7 +422,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? 
#ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index 43cee0977e..77334d2c04 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
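A note on the gpuLaunchKernel wrapper used in the RamboSamplingKernels.cc hunks above: it replaces CUDA's kernel<<<blocks,threads>>> chevron syntax with an ordinary call form that can be mapped to either backend. Its definition is outside this diff; a plausible sketch, assuming these kernels need no dynamic shared memory and run on the default stream (true for the RAMBO launches shown):

    // Sketch only: portable kernel launch, default stream, no shared memory.
    #ifdef __CUDACC__
    #define gpuLaunchKernel( kernel, blocks, threads, ... ) \
      kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
    #elif defined __HIPCC__
    #define gpuLaunchKernel( kernel, blocks, threads, ... ) \
      hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
    #endif

Under such a macro, the old getMomentaFinalDevice<<<m_gpublocks, m_gputhreads>>>( ... ) launch and its HIP counterpart both reduce to the single gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, ... ) line seen in the diff.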
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -103,69 +103,135 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of nvcc + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
+ MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! + CUOPTFLAGS = -lineinfo + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + export GPUCC + export GPUFLAGS + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + +else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) + #=== Configure the HIP compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If HIP_HOME is not set, try to set it from the location of GPUCC + ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") + endif -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists + ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + GPUCC = $(HIP_HOME)/bin/hipcc + + # Should maybe find something equivalent to this in HIP + #USE_NVTX ?=-DUSE_NVTX + + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + + # -DHIP_FAST_MATH equivalent to -use_fast_math in HIP + # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + + else ifneq ($(origin REQUIRE_HIP),undefined) + # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + else + # No hip. 
Switch hip compilation off and go to common random numbers in C++ + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif #------------------------------------------------------------------------------- @@ -179,9 +245,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) - override GPUCC:=ccache $(GPUCC) endif endif @@ -205,7 +271,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -215,10 +281,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -269,7 +335,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edge case for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -344,13 +413,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -359,7 +428,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown 
HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -368,7 +437,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -420,11 +489,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -437,7 +506,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -468,15 +537,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# (the -x cu flag from the rule above is now part of CCBUILDRULEFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -485,11 +555,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edge case for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -505,10 +578,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -516,8 +589,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -544,7 +617,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -556,11 +629,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -577,16 +650,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -612,17 +685,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -634,7 +707,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -647,7 +720,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -659,12 +732,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link 
with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -688,14 +761,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -798,9 +871,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -819,7 +892,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc index f93c05b0b3..2b956730d4 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // Create a process object, read parm card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? 
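GpuRuntime, used by fbridgecreate_ here (and fbridgedelete_ just below) and instantiated in check_sa.cc above, generalises the old CudaRuntime: set the device up when the application or Fortran bridge starts, reset it at teardown. Its definition lives in the new GpuRuntime.h, outside this diff; a hedged sketch of the expected shape, based on the behaviour documented above for CudaRuntime (gpuSetDevice is an assumed wrapper over cudaSetDevice/hipSetDevice; gpuDeviceReset appears in the runTest.cc hunk further below):

    // Sketch only: RAII setup/teardown of the GPU runtime for CUDA or HIP.
    struct GpuRuntime final
    {
      GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); }
      ~GpuRuntime() { tearDown( m_debug ); }
      static void setUp( const bool /*debug*/ = true ) { gpuSetDevice( 0 ); }   // assume device 0, as in CudaRuntime
      static void tearDown( const bool /*debug*/ = true ) { gpuDeviceReset(); } // e.g. for cuda-memcheck --leak-check full
      const bool m_debug;
    };

The static setUp/tearDown entry points are what the Fortran bridge calls, while check_sa.cc instantiates the class so teardown also happens automatically when main exits.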
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc index df7488178e..4c9bc9ee6b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -117,7 +117,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -252,7 +252,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc index e40f635e46..016bc0f472 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -76,7 +76,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -321,7 +321,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model.pkl b/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model.pkl index c67363a41f7befb95905219d5f8068b362abbe2f..b6989c1453094d7f45cf2ee4b2124efa29e9064b 100644 GIT binary patch delta 44 zcmX?hj%n{XrVZZ9zi?a2z^s`D*Gt>1a7cIL20CztS A_y7O^ delta 53 zcmdmcj_KGrrVZZ9)Uy~E81z#TOA_@H%Mx=Ei;FY$-2+0642+EReceqHeVz5wGZM>m JCuc6Z0ssYc66F8@ diff --git a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h index 6db5ca82f3..f4ea9f0a8a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-06-09 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc index 22fdd96a68..01e7d9bcf2 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h index 11fd9e3c74..b44537e599 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk index 554d7a704c..f73ff6fa03 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk @@ -38,13 +38,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -246,20 +246,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h index cacab1031a..e540c8587c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,13 +10,27 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 
6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +66,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +103,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must 
CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
+// SANITY CHECKS (C++ complex number implementation)
+#ifndef MGONGPUCPP_GPUIMPL
+#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++
#endif
#endif
@@ -132,7 +155,7 @@ namespace mgOnGpu
// Alignment requirement for using reinterpret_cast with SIMD vectorized code
// (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
// Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
#endif
@@ -143,7 +166,7 @@ using mgOnGpu::fptype;
using mgOnGpu::fptype2;
// C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
#undef MGONGPU_CPPSIMD
#elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
#ifdef MGONGPU_FPTYPE_DOUBLE
@@ -173,9 +196,9 @@ using mgOnGpu::fptype2;
#undef MGONGPU_CPPSIMD
#endif
-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
-#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
+#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -187,8 +210,8 @@ using mgOnGpu::fptype2;
#define mgDebugFinalise() { /*noop*/ }
#endif /* clang-format on */
-// Define empty CUDA declaration specifiers for C++
+// Define empty CUDA/HIP declaration specifiers for C++
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
#define __global__
#define __host__
#define __device__
diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h
index b56348bc58..6ae0c42ecb 100644
--- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
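
The empty __global__, __host__ and __device__ definitions above are what allow one source file to compile unchanged as CUDA, as HIP, or as plain C++. A self-contained illustration of the pattern follows; the kernel is hypothetical and not part of the patch.

// Hypothetical example (not from the patch): with the empty specifiers defined
// above for C++-only builds, this is a GPU kernel under nvcc or hipcc and an
// ordinary host function otherwise.
__global__ void scaleEnergies( double* e, const double fac, const int nevt )
{
#ifdef MGONGPUCPP_GPUIMPL
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
  if( ievt < nevt ) e[ievt] *= fac;
#else
  for( int ievt = 0; ievt < nevt; ievt++ ) e[ievt] *= fac; // C++: explicit loop over events
#endif
}
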
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -209,14 +209,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -249,7 +249,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -301,7 +301,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -337,11 +337,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -556,11 +556,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -604,7 +604,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -627,7 +627,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace 
mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or 
scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttggg.mad/src/rambo.h b/epochX/cudacpp/gg_ttggg.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/rambo.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 1c454bccf7..b6f4bb1e36 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg.mg +import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005292177200317383  +DEBUG: model prefixing takes 0.0034203529357910156  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.707 s +1 processes with 1240 diagrams generated in 1.389 s Total: 1 processes with 1240 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -163,7 +163,7 @@ Load PLUGIN.CUDACPP_SA_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg +INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 @@ -173,12 +173,12 @@ INFO: Processing color information for process: g g > t t~ g g g @1 DEBUG: type(me)= me=0 [output.py at line 189]  DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: proc_id =  0 [model_handling.py at line 1045]  -INFO: Creating files in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg +INFO: Creating files in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is not yet defined: this is standalone_cudacpp mode [model_handling.py at line 1301]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]  DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]  DEBUG: self.include_multi_channel =  False [model_handling.py at line 1144]  @@ -186,18 +186,18 @@ FileWriter fo DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1162]  DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1163]  DEBUG: multi_channel_map =  None [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1709]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); 
[model_handling.py at line 1821]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1822]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1821]  -DEBUG: ('ZERO', 6, 1, 6, 6) [model_handling.py at line 1822]  -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. +DEBUG: diag_to_config =  {} [model_handling.py at line 1711]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1824]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  +DEBUG: ('ZERO', 6, 1, 6, 6) [model_handling.py at line 1824]  +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]  @@ -205,8 +205,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxggg.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.005 s +DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxggg.txt [model_handling.py at line 1335]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 4.586 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -214,7 +214,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.266 s +ALOHA: aloha creates 5 routines in 0.252 s VVV1 VVV1 FFV1 @@ -227,8 +227,8 @@ ALOHA: 
aloha creates 5 routines in 0.266 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/. +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) @@ -237,13 +237,13 @@ super_write_set_parameters_onlyfixMajorana (hardcoded=True) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h +FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/. +INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/. and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m12.140s -user 0m11.838s -sys 0m0.186s +real 0m9.536s +user 0m8.980s +sys 0m0.292s diff --git a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h index d7e629cacd..f37c972b24 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
#ifndef BRIDGE_H
#define BRIDGE_H 1
@@ -22,7 +22,7 @@
#include
#include
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -82,7 +82,7 @@ namespace mg5amcCpu
Bridge& operator=( const Bridge& ) = delete;
Bridge& operator=( Bridge&& ) = delete;
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
/**
* Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads
* (this is needed for BridgeKernel tests rather than for actual production use in Fortran)
*/
@@ -149,7 +149,7 @@
unsigned int m_nevt; // number of events
int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
int m_gputhreads; // number of gpu threads (default set from number of events, can be modified)
int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified)
DeviceBuffer m_devMomentaF;
@@ -186,12 +186,12 @@ namespace mg5amcCpu
// Forward declare transposition methods
//
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
template<typename Tin, typename Tout>
__global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
template<typename Tin, typename Tout>
void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );
@@ -208,7 +208,7 @@ namespace mg5amcCpu
Bridge<FORTRANFPTYPE>::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F )
: m_nevt( nevtF )
, m_nGoodHel( -1 )
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
, m_gputhreads( 256 ) // default number of gpu threads
, m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads
, m_devMomentaF( m_nevt )
@@ -232,7 +232,7 @@ namespace mg5amcCpu
{
if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" );
if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) )
throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) );
while( m_nevt != m_gpublocks * m_gputhreads )
@@ -250,11 +250,11 @@ namespace mg5amcCpu
std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
CPPProcess process( /*verbose=*/false );
m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
process.initProc( "../../Cards/param_card.dat" );
}
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
template<typename FORTRANFPTYPE>
void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads )
{
@@ -268,7 +268,7 @@ namespace mg5amcCpu
}
#endif
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
template<typename FORTRANFPTYPE>
void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta,
const FORTRANFPTYPE* gs,
@@ -283,14 +283,14 @@ namespace mg5amcCpu
constexpr int neppM = MemoryAccessMomenta::neppM;
if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
{
- checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
+ gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
}
else
{
- checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
+ gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
//const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
- dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+ gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
}
if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
{
@@ -333,7 +333,7 @@ namespace mg5amcCpu
}
#endif
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
template<typename FORTRANFPTYPE>
void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta,
const FORTRANFPTYPE* gs,
@@ -388,7 +388,7 @@ namespace mg5amcCpu
// - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
//
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
template<typename Tin, typename Tout>
__global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
{
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc
index d58066c9c1..eaf4037a24 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc
@@ -1,17 +1,18 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
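
The gpuLaunchKernel calls above replace CUDA's kernel<<<blocks, threads>>>( args ) launch syntax, which cannot be written directly in code that must also build for HIP. One possible shape for such a wrapper is sketched below; this is an assumption for illustration, as the actual GpuAbstraction.h is not shown in this excerpt.

// Illustrative sketch only: a variadic launch wrapper of the kind implied by
// the gpuLaunchKernel call sites above; the actual macro may differ.
#ifdef __CUDACC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif

A gpuLaunchKernelSharedMem variant, as used for sigmaKin further below, would additionally forward a dynamic shared-memory size into the launch configuration.
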
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h index ffe3b84d53..176338151a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow,
@@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME )
}
}
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
#endif /* MADGRAPHTEST_H_ */
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc
index 30257195b6..d6d6c4f179 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc
@@ -1,12 +1,12 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#include "MatrixElementKernels.h"
#include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation
#include "MemoryAccessMomenta.h"
#include "MemoryBuffers.h"
#include
@@ -14,7 +14,7 @@
//============================================================================
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
namespace mg5amcCpu
{
@@ -143,7 +143,7 @@ namespace mg5amcCpu
//============================================================================
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
{
@@ -202,13 +202,13 @@ namespace mg5amcGpu
PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
DeviceBufferHelicityMask devIsGoodHel( ncomb );
// ... 0d1. Compute good helicity mask on the device
- computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+ gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+ gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
#else
- sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+ gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
#endif
- checkCuda( cudaPeekAtLastError() );
+ checkGpu( gpuPeekAtLastError() );
// ... 0d2. Copy back good helicity mask to the host
copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
// ... 0d3. Copy back good helicity list to constant memory on the device
@@ -219,19 +219,19 @@
void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
{
- computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+ gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
#ifndef MGONGPU_NSIGHT_DEBUG
constexpr unsigned int sharedMemSize = 0;
#else
constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+ gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
#else
- sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+ gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
#endif
- checkCuda( cudaPeekAtLastError() );
- checkCuda( cudaDeviceSynchronize() );
+ checkGpu( gpuPeekAtLastError() );
+ checkGpu( gpuDeviceSynchronize() );
}
//--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
index 23e84757a2..72bd8f195b 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
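
The checkGpu( gpuPeekAtLastError() ) and checkGpu( gpuDeviceSynchronize() ) calls above take over the role of the checkCuda/assertCuda pair from the deleted CudaRuntime.h. Below is a backend-neutral sketch modelled directly on that removed code, assuming the gpuError_t, gpuSuccess and gpuGetErrorString aliases sketched earlier; the real definitions belong to GpuRuntime.h, which this excerpt does not show.

// Illustrative sketch only, modelled on the removed checkCuda/assertCuda:
// print a diagnostic and assert on any unsuccessful GPU runtime return code.
#include <cassert>
#include <cstdio>
inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
{
  if( code != gpuSuccess )
  {
    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
    if( abort ) assert( code == gpuSuccess );
  }
}
#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
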
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
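The mechanical __CUDACC__ to MGONGPUCPP_GPUIMPL replacement running through all of these headers hinges on one idea: __CUDACC__ is defined only by nvcc, while the new macro is meant to be set whenever any GPU implementation (CUDA or HIP) is being built. Its actual definition lives in mgOnGpuConfig.h, which is not shown in this diff; plausibly it amounts to something like the sketch below (assumed, including the C++ no-op fallbacks for CUDA keywords that the RandomNumberKernels.h comment alludes to further below).

    // Sketch of the mgOnGpuConfig.h logic (assumed, not shown in this patch)
    #if defined( __CUDACC__ ) || defined( __HIPCC__ )
    #define MGONGPUCPP_GPUIMPL 1 // building the GPU implementation (CUDA or HIP)
    #else
    // On pure C++ builds, turn the CUDA/HIP function-space keywords into no-ops
    // so that the same source files compile unchanged on the host
    #define __global__
    #define __device__
    #define __host__
    #endif

    // The guard then selects one of the two twin namespaces:
    #ifdef MGONGPUCPP_GPUIMPL
    namespace mg5amcGpu
    #else
    namespace mg5amcCpu
    #endif
    {
      // identical code, compiled once for GPU and once for CPU (see #318 and #725)
    }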
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
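On the neppM remark in the MemoryAccessMomenta.h hunk above: the momenta buffer is an AOSOA, i.e. events are grouped into pages of neppM events so that the neppM values of a given momentum component sit contiguously in memory (coalesced loads on a GPU, SIMD vectors on a CPU). A small sketch of the indexing this implies, with illustrative constants; the real classes derive the index through the MemoryAccessHelpers templates, so take this only as a picture of the layout.

    // AOSOA indexing sketch, assuming the layout momenta[npagM][npar][np4][neppM]
    #include <cstddef>
    constexpr int np4 = 4;   // E, px, py, pz
    constexpr int npar = 7;  // e.g. a 2->5 process such as gg_ttggg
    constexpr int neppM = 4; // events per page, e.g. 32-byte cacheline / sizeof(double)
    inline std::size_t momentaIndex( int ievt, int ipar, int ip4 )
    {
      const int ipagM = ievt / neppM; // page containing this event
      const int ieppM = ievt % neppM; // position of this event inside the page
      return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * static_cast<std::size_t>( neppM ) + ieppM;
    }
    // Consecutive ievt values with fixed (ipar, ip4) map to consecutive addresses
    // within a page, which is what makes the GPU global-memory accesses coalesced.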
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template 
class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
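The typedef ladder above pairs each physics quantity with three buffer flavours: a plain C++ host buffer, a pinned (page-locked) host buffer, and a device buffer. A hypothetical usage snippet, mirroring the allocation pattern that check_sa.cc adopts later in this same patch:

    // Hypothetical allocation pattern, mirroring check_sa.cc further below
    void allocateBuffers()
    {
      const std::size_t nevt = 16384;
    #ifndef MGONGPUCPP_GPUIMPL
      HostBufferMomenta hstMomenta( nevt );       // ordinary C++ host buffer
    #else
      PinnedHostBufferMomenta hstMomenta( nevt ); // pinned: gpuMallocHost under the hood
      DeviceBufferMomenta devMomenta( nevt );     // device: gpuMalloc under the hood
      // ... fill hstMomenta on the host, then:
      copyDeviceFromHost( devMomenta, hstMomenta ); // gpuMemcpy( ..., gpuMemcpyHostToDevice )
    #endif
    } // all buffers are freed here by their destructors (RAII)

Pinned host memory is what makes the H2D/D2H copies fast (and is why PR #45 notes that copying from a non-pinned host array costs an extra intermediate memcpy).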
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index b7a16f1170..a2f1fc1dc2 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g g WEIGHTED<=5 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace 
mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -31908,7 +31909,7 @@ namespace mg5amcCpu { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -31965,7 +31966,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -32024,7 +32025,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -32183,8 +32184,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1, 1 }, { 1, 1, 1, -1, 1, 1, -1 }, { 1, 1, 1, -1, 1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -32227,9 +32228,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... 
}; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -32268,7 +32269,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -32333,12 +32334,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -32359,7 +32360,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -32488,9 +32489,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -32514,7 +32515,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -32535,7 +32536,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -32549,9 +32550,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -32579,7 +32583,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) 
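The cHel/cIPD hunks above also route constant-memory copies through a gpuMemcpyToSymbol wrapper. One portability subtlety worth recording: HIP wants the device symbol wrapped in HIP_SYMBOL(...). A sketch of what the wrapper presumably looks like (assumed; the actual GpuAbstraction.h is not in this diff):

    // Sketch of a constant-memory copy wrapper (illustrative)
    #if defined( __CUDACC__ )
    #define gpuMemcpyToSymbol( symbol, src, count ) \
      checkGpu( cudaMemcpyToSymbol( symbol, src, count ) )
    #elif defined( __HIPCC__ )
    // NB: HIP requires HIP_SYMBOL(...) around the __constant__ variable name
    #define gpuMemcpyToSymbol( symbol, src, count ) \
      checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, count ) )
    #endif
    // This also explains why the call sites above drop the explicit checkCuda(...):
    // the error check can live inside the wrapper itself.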
-#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -32783,7 +32787,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h index d1dd4d6150..b1f469b1c9 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc index f1e75b9252..1bad694d1c 100644 --- 
a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -63,7 +64,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +78,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -103,11 +104,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -115,7 +116,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -148,7 +149,7 @@ main( int argc, char** argv ) #ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -165,7 +166,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +240,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 
1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,14 +264,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -292,7 +293,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +301,7 @@ #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +309,7 @@ #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +317,7 @@ #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +334,7 @@ } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +343,7 @@ // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +352,7 @@ // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +360,7 @@ #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +368,7 @@ #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor 
hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -403,7 +404,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -421,7 +422,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +433,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +441,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +483,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +515,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +560,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +589,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -731,15 +732,19 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? 
#ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -789,7 +800,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +856,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -866,6 +877,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,21 +905,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +950,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1048,7 +1061,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index 43cee0977e..77334d2c04 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -103,69 +103,135 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of nvcc + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
+ MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! + CUOPTFLAGS = -lineinfo + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + export GPUCC + export GPUFLAGS + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported)
+ GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+
+ # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+ ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+ GPUFLAGS += -allow-unsupported-compiler
+ endif
+
+else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc)
+ #=== Configure the HIP compiler
+
+ # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505)
+ # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below
+ ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
+ $(warning CUDA builds are not supported for multi-word CXX "$(CXX)")
+ override CUDA_HOME=disabled
+ endif
+
+ # If HIP_HOME is not set, try to set it from the location of hipcc
+ ifndef HIP_HOME
+ HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH))
+ $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+ endif

-# Set the host C++ compiler for nvcc via "-ccbin "
-# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+ # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists
+ ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
+ GPUCC = $(HIP_HOME)/bin/hipcc
+
+ # Should maybe find something equivalent to this in HIP
+ #USE_NVTX ?=-DUSE_NVTX
+
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+ HIPINC = -I$(HIP_HOME)/include/
+
+ # -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP
+ # (But only for single precision, see line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+ GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+ ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+ GPUFLAGS += -std=c++17
+ # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+ ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+
+ CUBUILDRULEFLAGS = -fPIC -c
+ CCBUILDRULEFLAGS = -fPIC -c
+
+ else ifneq ($(origin REQUIRE_HIP),undefined)
+ # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443)
+ $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
+ else
+ # No hip. Switch hip compilation off and go to common random numbers in C++
+ $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
+ override GPUCC=
+ override USE_NVTX=
+ override CUINC=
+ override CURANDLIBFLAGS=
+ endif
+
+ # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+ ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+ GPUFLAGS += -allow-unsupported-compiler
+ endif

-# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-CUFLAGS += -allow-unsupported-compiler
endif

#-------------------------------------------------------------------------------

@@ -179,9 +245,9 @@ endif
#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
# override AR:=ccache $(AR)
#endif
-ifneq ($(NVCC),)
-  ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
-    override NVCC:=ccache $(NVCC)
+ifneq ($(GPUCC),)
+  ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
+    override GPUCC:=ccache $(GPUCC)
  endif
endif

@@ -205,7 +271,7 @@ endif

# PowerPC-specific CUDA compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le)
-  CUFLAGS+= -Xcompiler -mno-float128
+  GPUFLAGS+= -Xcompiler -mno-float128
endif

#-------------------------------------------------------------------------------

@@ -215,10 +281,10 @@ endif
# Set the default OMPFLAGS choice
ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
override OMPFLAGS = -fopenmp
-###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578)
+###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578)
else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),)
override OMPFLAGS = -fopenmp
-###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578)
+###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578)
else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),)
override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578)
else
@@ -269,7 +335,10 @@ endif

# Set the default RNDGEN (random number generator) choice
ifeq ($(RNDGEN),)
-  ifeq ($(NVCC),)
+  ifeq ($(GPUCC),)
+    override RNDGEN = hasNoCurand
+  # Edge case for HIP compilation
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
    override RNDGEN = hasNoCurand
  else ifeq ($(RNDGEN),)
    override RNDGEN = hasCurand
@@ -344,13 +413,13 @@ CXXFLAGS+= $(AVXFLAGS)
$(info FPTYPE=$(FPTYPE))
ifeq ($(FPTYPE),d)
  CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
-  CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
+  GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
else ifeq ($(FPTYPE),f)
  CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
-  CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
+  GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
else ifeq ($(FPTYPE),m)
  CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
-  CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
+  GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
else
  $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported)
endif

@@ -359,7 +428,7 @@ endif
$(info HELINL=$(HELINL))
ifeq ($(HELINL),1)
  CXXFLAGS += -DMGONGPU_INLINE_HELAMPS
-  CUFLAGS += -DMGONGPU_INLINE_HELAMPS
+  GPUFLAGS += -DMGONGPU_INLINE_HELAMPS
else ifneq ($(HELINL),0)
  $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported)
endif

@@ -368,7 +437,7 @@ endif
$(info HRDCOD=$(HRDCOD))
ifeq ($(HRDCOD),1)
  CXXFLAGS += -DMGONGPU_HARDCODE_PARAM
-  CUFLAGS += -DMGONGPU_HARDCODE_PARAM
+  GPUFLAGS += -DMGONGPU_HARDCODE_PARAM
else ifneq ($(HRDCOD),0)
  $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported)
endif

@@ -420,11 +489,11 @@ ifeq ($(UNAME_S),Darwin)
  override CULIBFLAGSRPATH2 =
else
  # RPATH to cuda/cpp libs when linking executables
-  override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH)
-  override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH)
+  override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH)
+  override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH)
  # RPATH to common lib when linking cuda/cpp libs
-  override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN'
-  override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN'
+  override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN'
+  override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN'
endif

# Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac)
@@ -437,7 +506,7 @@ override RUNTIME =

cxx_main=$(BUILDDIR)/check.exe
fcxx_main=$(BUILDDIR)/fcheck.exe

-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_main=$(BUILDDIR)/gcheck.exe
fcu_main=$(BUILDDIR)/fgcheck.exe
else
@@ -468,15 +537,16 @@ $(BUILDDIR)/.build.$(TAG):
	@touch $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@

$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
endif
+# NB: the "-x cu" flag from the old rule above is now included in CCBUILDRULEFLAGS (for nvcc)

# Generic target and build rules: objects from C++ compilation
# (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -485,11 +555,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@

# Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516)
+# Added edge case for HIP compilation
ifeq ($(shell $(CXX) --version | grep ^nvc++),)
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS))
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
endif
endif

@@ -505,10 +578,10 @@ ifeq ($(RNDGEN),hasCurand)
$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
endif

-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_...
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -516,8 +589,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -544,7 +617,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -556,11 +629,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -577,16 +650,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -612,17 +685,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -634,7 +707,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -647,7 +720,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -659,12 +732,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link 
with GPUCC (undefined reference to `__svml_cos4_l9')
else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc
endif
@@ -688,14 +761,14 @@ $(testmain): LIBFLAGS += -lgomp
endif
endif

-ifeq ($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
endif

# Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215
@@ -798,9 +871,9 @@ ifeq ($(USECCACHE),1)
	ccache --version | head -1
endif
	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
endif
	@echo ""
	@echo CXX=$(CXX)
@@ -819,7 +892,7 @@ endif

# Target: check (run the C++ test executable)
# [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
check: runTest cmpFcheck cmpFGcheck
else
check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc
index f93c05b0b3..2b956730d4 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

#include "Bridge.h"
#include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

extern "C"
{
@@ -22,7 +22,7 @@ extern "C"
 * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
 * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
 */
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
using namespace mg5amcGpu;
#else
using namespace mg5amcCpu;
@@ -46,8 +46,8 @@ extern "C"
 */
void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
{
-#ifdef __CUDACC__
-  CudaRuntime::setUp();
+#ifdef MGONGPUCPP_GPUIMPL
+  GpuRuntime::setUp();
#endif
// Create a process object, read parm card and set parameters
// FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
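// Sketch of the intended call sequence from the Fortran side (illustrative only, not
// part of this patch; the Fortran argument names are assumptions based on the C
// signatures shown in this file):
//   CALL FBRIDGECREATE( PBRIDGE, NEVT, NPAR, NP4 ) ! creates the Bridge and calls GpuRuntime::setUp()
//   CALL FBRIDGESEQUENCE( PBRIDGE, MOMENTA, ... )  ! computes matrix elements on the GPU (or CPU)
//   CALL FBRIDGEDELETE( PBRIDGE )                  ! deletes the Bridge and calls GpuRuntime::tearDown()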
@@ -69,8 +69,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -100,7 +100,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc index df7488178e..4c9bc9ee6b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -117,7 +117,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -252,7 +252,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc index e40f635e46..016bc0f472 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -76,7 +76,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -321,7 +321,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h index 6db5ca82f3..f4ea9f0a8a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc index 22fdd96a68..01e7d9bcf2 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-06-09 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h index 11fd9e3c74..b44537e599 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk index 554d7a704c..f73ff6fa03 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk @@ -38,13 +38,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -246,20 +246,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@

#-------------------------------------------------------------------------------

cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
endif

# Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects)
else
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h
index c0f067f1d8..205accb85b 100644
--- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.

#ifndef MGONGPUCONFIG_H
#define MGONGPUCONFIG_H 1

@@ -10,13 +10,27 @@
// There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel)
#undef MGONGPU_SUPPORTS_MULTICHANNEL

+// Is this a GPU (CUDA, HIP) or CPU implementation?
+#ifdef __CUDACC__
+#define MGONGPUCPP_GPUIMPL cuda
+#elif defined __HIPCC__
+#define MGONGPUCPP_GPUIMPL hip
+#include "hip/hip_runtime.h"
+#else
+#undef MGONGPUCPP_GPUIMPL
+#endif
+
// ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12"
// ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs)

// Choose if curand is supported for generating random numbers
+// For CUDA, by default, it is supported
+// For HIP, by default, it is not supported
// For C++, by default, it is supported, but allow this macro to be set from outside with e.g.
-DMGONGPU_HAS_NO_CURAND #ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND +#elif defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -52,23 +66,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -84,17 +103,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif 
#endif @@ -132,7 +155,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -143,7 +166,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -173,9 +196,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -187,8 +210,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h index b56348bc58..6ae0c42ecb 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
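// Recap of the complex-number type selected per backend in mgOnGpuConfig.h (a summary
// comment for the reader, not part of this patch):
//   CUDA: thrust::complex (default), cuComplex or cxsmpl
//   HIP:  cxsmpl (the only option)
//   C++:  std::complex or cxsmpl (cxsmpl is the current default)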
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -209,14 +209,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -249,7 +249,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -301,7 +301,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -337,11 +337,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -556,11 +556,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -604,7 +604,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -627,7 +627,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace 
mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in 
C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttggg.sa/src/rambo.h b/epochX/cudacpp/gg_ttggg.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. 
) From b79fe00868de6188a27598b4aa8e8edf9d2d9151 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 9 Aug 2023 15:09:47 +0200 Subject: [PATCH 444/509] Reverted changes to FC env variable --- .github/workflows/c-cpp.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index fe26071c99..b38d44bd13 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -67,6 +67,7 @@ jobs: env: CUDA_HOME: /usr/local/cuda/ REQUIRE_CUDA: 1 + FC: gfortran strategy: matrix: folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum, @@ -79,11 +80,8 @@ jobs: steps: - uses: actions/checkout@v2 - name: make info - run: FC=`which gfortran`; - make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info + run: make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info - name: make - run: FC=`which gfortran`; - make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} + run: make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} - name: make check - run: FC=`which gfortran`; - make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check + run: make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check From 3b8ce7e33d44b7956a6820278c13e5e8f9f4b11a Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Wed, 9 Aug 2023 18:06:13 +0200 Subject: [PATCH 445/509] Changed position of exporting GPUCC and GPUFLAGS in cudacpp.mk --- epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index 77334d2c04..81900d1807 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -163,8 +163,6 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) override CUINC= override CURANDLIBFLAGS= endif - export GPUCC - export GPUFLAGS # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported)
@@ -234,6 +232,9 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc)

endif

+export GPUCC
+export GPUFLAGS
+
#-------------------------------------------------------------------------------

#=== Configure ccache for C++ and CUDA builds

From 609d548b71458d58bd52b2f514ab1f0b985c97fd Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Thu, 10 Aug 2023 09:38:28 +0200
Subject: [PATCH 446/509] Added correct HIP_PLATFORM when compiling for HIP in
 cudacpp_src.mk

---
 epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk
index f73ff6fa03..0cddb2b945 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk
+++ b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk
@@ -85,6 +85,11 @@ endif
###$(info OMPFLAGS=$(OMPFLAGS))
CXXFLAGS += $(OMPFLAGS)

+# Add correct -DHIP_PLATFORM when compiling for HIP
+ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  CUFLAGS += -DHIP_PLATFORM=amd
+endif
+
# Set the build flags appropriate to each AVX choice (example: "make AVX=none")
# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]

From e1bb7456e28c0a191faf39a3c5bd0d505e615c05 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Thu, 10 Aug 2023 09:48:30 +0200
Subject: [PATCH 447/509] Moved "-c -x cu" behind an ifeq nvcc

---
 epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk
index 0cddb2b945..ec08ee2293 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk
+++ b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk
@@ -86,8 +86,10 @@ endif
CXXFLAGS += $(OMPFLAGS)

# Add correct -DHIP_PLATFORM when compiling for HIP
-ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
-  CUFLAGS += -DHIP_PLATFORM=amd
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  CUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  CUFLAGS += -DHIP_PLATFORM=amd -Xcompiler -fPIC
endif

# Set the build flags appropriate to each AVX choice (example: "make AVX=none")
@@ -251,7 +253,7 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
$(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(CUFLAGS) $< -o $@
 
 #-------------------------------------------------------------------------------
 

From 5097e9d4f10dcad469ff1057519482f28fef6c50 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Thu, 10 Aug 2023 09:58:03 +0200
Subject: [PATCH 448/509] Changed ifdef back to __CUDACC__ in mgOnGpuCxtypes.h

---
 epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h
index 6ae0c42ecb..d627607e8d 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h
@@ -19,7 +19,7 @@
 #include 
 
 // Complex type in cuda: thrust or cucomplex or cxsmpl
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_THRUST
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661)

From 762122b3c4c8f2ede9990a92e6bfa899da383549 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Thu, 10 Aug 2023 10:01:12 +0200
Subject: [PATCH 449/509] Revert changes from last commit because it is handled elsewhere in code

---
 epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h
index d627607e8d..6ae0c42ecb 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h
@@ -19,7 +19,7 @@
 #include 
 
 // Complex type in cuda: thrust or cucomplex or cxsmpl
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 #if defined MGONGPU_CUCXTYPE_THRUST
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661)

From ab3a60bf6e6b5e8f57c9d526c61288792388785a Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Thu, 10 Aug 2023 11:15:59 +0200
Subject: [PATCH 450/509] Added back -c to HIP compilation in src mkfile

---
 epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk
index ec08ee2293..b762c727c1 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk
+++ b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk
@@ -89,7 +89,7 @@ CXXFLAGS += $(OMPFLAGS)
 ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
   CUFLAGS += -Xcompiler -fPIC -c -x cu
 else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
-  CUFLAGS += -DHIP_PLATFORM=amd -Xcompiler -fPIC
+  CUFLAGS += -DHIP_PLATFORM=amd -fPIC -c
 endif
 
 # Set the build flags appropriate to each AVX choice (example: "make AVX=none")

From e3e11dc6db708a065acc0f0347508dc4e9831994 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Thu, 10 Aug 2023 12:26:50 +0200
Subject: [PATCH 451/509] Fix for compilation error with std::complex using cxsmpl

---
 epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h
index 6ae0c42ecb..46d9f02733 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h
@@ -92,7 +92,7 @@ namespace mg5amcCpu
   inline __host__
std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } From 87031ad9e928f6eff8fb17a1bb669e4cf8ea6f92 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 10 Aug 2023 15:04:07 +0200 Subject: [PATCH 452/509] Export HIPARCHFLAGS and set AMD ARCH in cudacpp_src.mk, also change CUFLAGS to GPUFLAGS in cudacpp_src.mk --- epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk | 2 ++ epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index 81900d1807..9415b01fad 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -213,6 +213,8 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c + export HIPARCHFLAGS + else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) diff --git a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk index b762c727c1..d132f0877b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -87,9 +87,9 @@ CXXFLAGS += $(OMPFLAGS) # Add correct -DHIP_LATFORM when compiling for HIP ifeq ($(findstring nvcc,$(GPUCC)),nvcc) - CUFLAGS += -Xcompiler -fPIC -c -x cu + GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - CUFLAGS += -DHIP_PLATFORM=amd -fPIC -c + GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") @@ -253,7 +253,7 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(CUFLAGS) $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- From e85915910c0569d48da2c0c89f14f9bd4b2bfcca Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 10 Aug 2023 15:10:26 +0200 Subject: [PATCH 453/509] [CODEGEN] Added changes from gg_ttgg.mad to code generator --- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 2 ++ .../madgraph/iolibs/template_files/gpu/cudacpp_src.mk | 11 +++++++++-- .../iolibs/template_files/gpu/mgOnGpuCxtypes.h | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index f052bf011e..ce1ffd76f9 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -215,6 +215,8 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c + export HIPARCHFLAGS + else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk index f3a26552db..d28c92ec13 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -85,6 +85,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct -DHIP_LATFORM when compiling for HIP +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -246,7 +253,7 @@ $(BUILDDIR)/%%.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%%_cu.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h index 6ae0c42ecb..46d9f02733 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } From 2920ff290ee82dcbd05c2f5b1cad556b8f22d7fe Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 10 Aug 2023 15:44:56 +0200 Subject: [PATCH 454/509] [CODEGEN] Added export of GPUCC and GPUFLAGS to codegen --- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index ce1ffd76f9..bca50cd087 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -163,8 +163,6 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) override CUINC= override CURANDLIBFLAGS= endif - export GPUCC - export GPUFLAGS # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) @@ -236,6 +234,9 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) endif +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds From be4bf041428b7007931b024c949c577bc6d4b07c Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 10 Aug 2023 15:53:51 +0200 Subject: [PATCH 455/509] [CODEGEN] Regenerate all 5 .sa/.mad processes (ee_mumu -> gg_ttggg) based on gg_ttgg.mad --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 12 +++++----- .../ee_mumu.mad/SubProcesses/cudacpp.mk | 7 ++++-- epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk | 11 ++++++++-- .../cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h | 2 +- .../CODEGEN_cudacpp_ee_mumu_log.txt | 12 +++++----- .../ee_mumu.sa/SubProcesses/cudacpp.mk | 7 ++++-- epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk | 11 ++++++++-- .../cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h | 2 +- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 18 +++++++-------- .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 7 ++++-- epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk | 11 ++++++++-- epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h | 2 +- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 14 ++++++------ .../cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 7 ++++-- epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk | 11 ++++++++-- epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h | 2 +- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 16 +++++++------- .../gg_ttg.mad/SubProcesses/cudacpp.mk | 7 ++++-- epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk | 11 ++++++++-- .../cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h | 2 +- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 14 ++++++------ .../cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 7 ++++-- epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk | 11 ++++++++-- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h | 2 +- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 20 ++++++++--------- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 14 ++++++------ .../gg_ttgg.sa/SubProcesses/cudacpp.mk | 7 ++++-- epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk | 11 ++++++++-- .../cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h | 2 +- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 22 +++++++++---------- .../gg_ttggg.mad/SubProcesses/cudacpp.mk | 7 ++++-- .../cudacpp/gg_ttggg.mad/src/cudacpp_src.mk | 11 ++++++++-- .../cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h | 2 +- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 14 ++++++------ .../gg_ttggg.sa/SubProcesses/cudacpp.mk | 7 ++++-- epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk | 11 ++++++++-- .../cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h | 2 +- 37 files changed, 213 insertions(+), 123 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index 50f3467303..c5451c1383 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0032465457916259766  +DEBUG: model prefixing takes 0.0034666061401367188  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
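A note on the export reordering in patches 445 and 454 before the regenerated logs that follow: GNU make evaluates conditional directives at parse time, so an "export" that sits inside the nvcc-only branch is skipped entirely on a hipcc build, leaving GPUCC and GPUFLAGS invisible to the sub-makes. A minimal standalone sketch of the corrected layout (the flag bodies are condensed from cudacpp_src.mk, and the "all" target is added only so the fragment runs):

ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
  GPUFLAGS += -Xcompiler -fPIC -c -x cu
else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
  GPUFLAGS += -DHIP_PLATFORM=amd -fPIC -c
endif

# Placed after the whole chain, the exports run on every parse, so both the
# CUDA and the HIP configuration reach child makes through the environment.
export GPUCC
export GPUFLAGS

all:
	@echo "GPUCC=$(GPUCC) GPUFLAGS=$(GPUFLAGS)"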
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -176,7 +176,7 @@ INFO: Creating files in directory P1_epem_mupmum DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -210,7 +210,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s -Wrote files for 8 helas calls in 0.202 s +Wrote files for 8 helas calls in 0.214 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines @@ -254,6 +254,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m4.503s -user 0m1.166s -sys 0m1.531s +real 0m5.476s +user 0m1.181s +sys 0m1.503s diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 77334d2c04..9415b01fad 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -163,8 +163,6 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) override CUINC= override CURANDLIBFLAGS= endif - export GPUCC - export GPUFLAGS # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -215,6 +213,8 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c + export HIPARCHFLAGS + else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) @@ -234,6 +234,9 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) endif +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds diff --git a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk index f73ff6fa03..d132f0877b 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -85,6 +85,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct -DHIP_LATFORM when compiling for HIP +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -246,7 +253,7 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h index 6ae0c42ecb..46d9f02733 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index eef31d6909..46d6636a3f 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0036804676055908203  +DEBUG: model prefixing takes 0.003462553024291992  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,7 +154,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.004 s +1 processes with 2 diagrams generated in 0.003 s Total: 1 processes with 2 diagrams output standalone_cudacpp CODEGEN_cudacpp_ee_mumu Load PLUGIN.CUDACPP_SA_OUTPUT @@ -202,7 +202,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.215 s +ALOHA: aloha creates 4 routines in 0.191 s FFV1 FFV1 FFV2 @@ -225,6 +225,6 @@ INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. 
and /afs DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.899s -user 0m0.522s -sys 0m0.169s +real 0m0.940s +user 0m0.430s +sys 0m0.166s diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 77334d2c04..9415b01fad 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -163,8 +163,6 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) override CUINC= override CURANDLIBFLAGS= endif - export GPUCC - export GPUFLAGS # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -215,6 +213,8 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c + export HIPARCHFLAGS + else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) @@ -234,6 +234,9 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) endif +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds diff --git a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk index f73ff6fa03..d132f0877b 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -85,6 +85,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct -DHIP_LATFORM when compiling for HIP +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -246,7 +253,7 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h index 6ae0c42ecb..46d9f02733 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 66404e5be8..d9a82265b6 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.003694295883178711  +DEBUG: model prefixing takes 0.003400087356567383  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttx DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -213,17 +213,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.231 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s +Wrote files for 10 helas calls in 0.318 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.122 s +ALOHA: aloha creates 2 routines in 0.116 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.096 s +ALOHA: aloha creates 4 routines in 0.089 s VVV1 FFV1 FFV1 @@ -254,6 +254,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
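As background for the compiler dispatch repeated in every cudacpp_src.mk hunk of this patch: $(findstring FIND,IN) expands to FIND when it occurs in IN and to the empty string otherwise, which turns a substring test into an ifeq comparison. A standalone sketch, where the default GPUCC value is hypothetical and used purely for the demonstration:

GPUCC ?= /usr/local/cuda/bin/nvcc   # hypothetical default for the demo only

ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
  $(info CUDA toolchain selected)
else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
  $(info HIP toolchain selected)
else
  $(info no GPU compiler detected)
endif

all: ;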
quit -real 0m4.821s -user 0m1.257s -sys 0m1.715s +real 0m5.392s +user 0m1.094s +sys 0m1.559s diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 77334d2c04..9415b01fad 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -163,8 +163,6 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) override CUINC= override CURANDLIBFLAGS= endif - export GPUCC - export GPUFLAGS # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -215,6 +213,8 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c + export HIPARCHFLAGS + else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) @@ -234,6 +234,9 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) endif +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds diff --git a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk index f73ff6fa03..d132f0877b 100644 --- a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -85,6 +85,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct -DHIP_LATFORM when compiling for HIP +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -246,7 +253,7 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h index 6ae0c42ecb..46d9f02733 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 32dec0cb3a..75fe3bae7e 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0037648677825927734  +DEBUG: model prefixing takes 0.0033168792724609375  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.007 s +1 processes with 3 diagrams generated in 0.006 s Total: 1 processes with 3 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_SA_OUTPUT @@ -200,12 +200,12 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/wor DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttx.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.108 s +ALOHA: aloha creates 2 routines in 0.104 s VVV1 FFV1 FFV1 @@ -227,6 +227,6 @@ INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. 
and /afs/c DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m1.176s -user 0m0.435s -sys 0m0.168s +real 0m0.826s +user 0m0.385s +sys 0m0.132s diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 77334d2c04..9415b01fad 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -163,8 +163,6 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) override CUINC= override CURANDLIBFLAGS= endif - export GPUCC - export GPUFLAGS # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -215,6 +213,8 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c + export HIPARCHFLAGS + else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) @@ -234,6 +234,9 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) endif +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds diff --git a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk index f73ff6fa03..d132f0877b 100644 --- a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -85,6 +85,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct -DHIP_LATFORM when compiling for HIP +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -246,7 +253,7 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h index 6ae0c42ecb..46d9f02733 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index cb05831ae6..bd90554f35 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0036163330078125  +DEBUG: model prefixing takes 0.0034699440002441406  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -216,14 +216,14 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg Generated helas calls for 1 subprocesses (16 diagrams) in 0.029 s -Wrote files for 36 helas calls in 0.285 s +Wrote files for 36 helas calls in 0.334 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.247 s +ALOHA: aloha creates 5 routines in 0.235 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -231,7 +231,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.229 s +ALOHA: aloha creates 10 routines in 0.215 s VVV1 VVV1 FFV1 @@ -267,6 +267,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
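The rewritten object rule above is worth spelling out once: nvcc needs "-x cu" because these CUDA sources keep a .cc suffix, and "-Xcompiler -fPIC" forwards -fPIC to the host compiler, while hipcc is a clang-based driver that accepts -fPIC and -c directly, so its branch drops the -Xcompiler wrapper. A self-contained sketch of the nvcc variant, where BUILDDIR and the flag defaults are assumptions for the example:

BUILDDIR ?= build
GPUCC    ?= nvcc
CPPFLAGS ?= -I.
GPUFLAGS ?= -Xcompiler -fPIC -c -x cu   # nvcc branch from the hunks above

$(BUILDDIR)/%_cu.o : %.cc
	@if [ ! -d $(BUILDDIR) ]; then mkdir -p $(BUILDDIR); fi
	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@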
quit -real 0m4.906s -user 0m1.545s -sys 0m1.926s +real 0m5.786s +user 0m1.451s +sys 0m1.632s diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index 77334d2c04..9415b01fad 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -163,8 +163,6 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) override CUINC= override CURANDLIBFLAGS= endif - export GPUCC - export GPUFLAGS # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -215,6 +213,8 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c + export HIPARCHFLAGS + else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) @@ -234,6 +234,9 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) endif +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds diff --git a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk index f73ff6fa03..d132f0877b 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -85,6 +85,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct -DHIP_LATFORM when compiling for HIP +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -246,7 +253,7 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h index 6ae0c42ecb..46d9f02733 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 8658acc7f4..a069b5df15 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0037899017333984375  +DEBUG: model prefixing takes 0.0034067630767822266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.017 s +1 processes with 16 diagrams generated in 0.015 s Total: 1 processes with 16 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -202,7 +202,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/wor DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxg.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (16 diagrams) in 0.028 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.027 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -210,7 +210,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.242 s +ALOHA: aloha creates 5 routines in 0.226 s VVV1 VVV1 FFV1 @@ -237,6 +237,6 @@ INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. 
and /afs/ DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m1.318s -user 0m0.623s -sys 0m0.179s +real 0m0.961s +user 0m0.558s +sys 0m0.141s diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index 77334d2c04..9415b01fad 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -163,8 +163,6 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) override CUINC= override CURANDLIBFLAGS= endif - export GPUCC - export GPUFLAGS # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -215,6 +213,8 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c + export HIPARCHFLAGS + else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) @@ -234,6 +234,9 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) endif +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds diff --git a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk index f73ff6fa03..d132f0877b 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -85,6 +85,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct -DHIP_LATFORM when compiling for HIP +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -246,7 +253,7 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h index 6ae0c42ecb..46d9f02733 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 17d1780ff2..ff123fd66d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0033278465270996094  +DEBUG: model prefixing takes 0.003490447998046875  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.114 s +1 processes with 123 diagrams generated in 0.111 s Total: 1 processes with 123 diagrams output madevent CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxgg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
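One more note on the HIPARCHFLAGS export from patch 452 that these regenerated makefiles pick up: the SubProcesses-level cudacpp.mk decides the AMD target architecture once, and the src-level cudacpp_src.mk only consumes the result through the environment of the recursive make. A toy single-file sketch of that hand-off; the architecture value is an assumed example, not taken from the patches:

HIPARCHFLAGS ?= --offload-arch=gfx90a   # assumed example AMD architecture
export HIPARCHFLAGS

toplevel:
	$(MAKE) sublevel   # the child make inherits HIPARCHFLAGS via the environment

sublevel:
	@echo "child sees HIPARCHFLAGS=$(HIPARCHFLAGS)"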
DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.305 s -Wrote files for 222 helas calls in 0.676 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.293 s +Wrote files for 222 helas calls in 0.687 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.233 s +ALOHA: aloha creates 5 routines in 0.229 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.211 s +ALOHA: aloha creates 10 routines in 0.213 s VVV1 VVV1 FFV1 @@ -272,6 +272,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m5.318s -user 0m2.189s -sys 0m1.585s +real 0m5.697s +user 0m2.159s +sys 0m1.553s diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 02e4d54a65..e2b5dfc03f 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0037162303924560547  +DEBUG: model prefixing takes 0.0033235549926757812  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
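Since patch 455 regenerates every process directory from the same CODEGEN templates, the cudacpp.mk and cudacpp_src.mk hunks repeat verbatim; any drift between copies would signal a stale regeneration. A hypothetical helper target (not in the repo) that compares the generated copies against the gg_ttgg.mad reference named in the commit message, assuming it is run from epochX/cudacpp:

PROCDIRS = ee_mumu.mad ee_mumu.sa gg_tt.mad gg_tt.sa gg_ttg.mad gg_ttg.sa gg_ttgg.mad gg_ttgg.sa gg_ttggg.mad gg_ttggg.sa

check-generated:
	for d in $(PROCDIRS); do diff -q gg_ttgg.mad/src/cudacpp_src.mk $$d/src/cudacpp_src.mk || exit 1; done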
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.122 s +1 processes with 123 diagrams generated in 0.111 s Total: 1 processes with 123 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -204,7 +204,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/wor DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxgg.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.305 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.293 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -212,7 +212,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.234 s +ALOHA: aloha creates 5 routines in 0.219 s VVV1 VVV1 FFV1 @@ -242,6 +242,6 @@ INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. and /afs DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m1.437s -user 0m1.070s -sys 0m0.150s +real 0m1.592s +user 0m0.987s +sys 0m0.160s diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index 77334d2c04..9415b01fad 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -163,8 +163,6 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) override CUINC= override CURANDLIBFLAGS= endif - export GPUCC - export GPUFLAGS # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -215,6 +213,8 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c + export HIPARCHFLAGS + else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) @@ -234,6 +234,9 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) endif +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds diff --git a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk index f73ff6fa03..d132f0877b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -85,6 +85,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct -DHIP_LATFORM when compiling for HIP +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -246,7 +253,7 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h index 6ae0c42ecb..46d9f02733 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index a335b4d0a4..45f48dfb83 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.003568410873413086  +DEBUG: model prefixing takes 0.0033910274505615234  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.395 s +1 processes with 1240 diagrams generated in 1.349 s Total: 1 processes with 1240 diagrams output madevent CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -175,11 +175,11 @@ INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1592 term in 24s. Introduce 2768 contraction +INFO: Color-Flow passed to 1592 term in 22s. 
Introduce 2768 contraction DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -221,15 +221,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 4.737 s -Wrote files for 2281 helas calls in 32.091 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 4.555 s +Wrote files for 2281 helas calls in 29.831 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.233 s +ALOHA: aloha creates 5 routines in 0.219 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -237,7 +237,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.226 s +ALOHA: aloha creates 10 routines in 0.227 s VVV1 VVV1 FFV1 @@ -276,6 +276,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m43.400s -user 0m38.584s -sys 0m3.126s +real 0m42.085s +user 0m35.594s +sys 0m2.900s diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index 77334d2c04..9415b01fad 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -163,8 +163,6 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) override CUINC= override CURANDLIBFLAGS= endif - export GPUCC - export GPUFLAGS # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -215,6 +213,8 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c + export HIPARCHFLAGS + else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. 
for CI tests on GPU #443)
 $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
@@ -234,6 +234,9 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc)
 
endif
 
+export GPUCC
+export GPUFLAGS
+
#-------------------------------------------------------------------------------
 
#=== Configure ccache for C++ and CUDA builds
 
diff --git a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk
index f73ff6fa03..d132f0877b 100644
--- a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk
+++ b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk
@@ -19,7 +19,7 @@ SHELL := /bin/bash
 
#=== Configure common compiler flags for CUDA and C++
 
INCFLAGS = -I.
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
 
#-------------------------------------------------------------------------------
 
@@ -85,6 +85,13 @@ endif
###$(info OMPFLAGS=$(OMPFLAGS))
CXXFLAGS += $(OMPFLAGS)
 
+# Add correct -DHIP_PLATFORM when compiling for HIP
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c
+endif
+
# Set the build flags appropriate to each AVX choice (example: "make AVX=none")
# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -246,7 +253,7 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
# Generic target and build rules: objects from CUDA compilation
$(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@
 
#-------------------------------------------------------------------------------
 
diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h
index 6ae0c42ecb..46d9f02733 100644
--- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h
@@ -92,7 +92,7 @@ namespace mg5amcCpu
inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c )
{
- out << std::complex( c.real(), c.imag() );
+ out << std::complex( c.real(), c.imag() );
return out;
}
 
diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
index b6f4bb1e36..9102ff337e 100644
--- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
+++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g g g
No model currently active, so we import the Standard Model
INFO: load particles
INFO: load vertices
-DEBUG: model prefixing takes 0.0034203529357910156 
+DEBUG: model prefixing takes 0.0035371780395507812 
INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions 
DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
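In the cudacpp_src.mk hunks above, the device-object rule now takes every compiler-specific flag from the single exported GPUFLAGS variable instead of hardcoding the nvcc options "-Xcompiler -fPIC -c -x cu" in the recipe. A standalone sketch of that pattern rule, under an assumed directory layout and the nvcc flag set:

    # sketch: build build/foo_cu.o from foo.cc, compiling the C++ source as CUDA
    BUILDDIR := build
    GPUCC    ?= nvcc
    GPUFLAGS ?= -Xcompiler -fPIC -c -x cu   # -x cu makes nvcc treat .cc files as CUDA

    $(BUILDDIR)/%_cu.o : %.cc
    	@mkdir -p $(BUILDDIR)
    	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@

With the flags centralized in GPUFLAGS, the same recipe serves nvcc and hipcc builds; only the variable's definition changes.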
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.389 s +1 processes with 1240 diagrams generated in 1.326 s Total: 1 processes with 1240 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -206,7 +206,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/wor DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxggg.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 4.586 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 4.393 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -214,7 +214,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.252 s +ALOHA: aloha creates 5 routines in 0.250 s VVV1 VVV1 FFV1 @@ -244,6 +244,6 @@ INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/. and /af DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m9.536s -user 0m8.980s -sys 0m0.292s +real 0m9.147s +user 0m8.601s +sys 0m0.323s diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index 77334d2c04..9415b01fad 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -163,8 +163,6 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) override CUINC= override CURANDLIBFLAGS= endif - export GPUCC - export GPUFLAGS # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -215,6 +213,8 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c + export HIPARCHFLAGS + else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) @@ -234,6 +234,9 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) endif +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds diff --git a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk index f73ff6fa03..d132f0877b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
 
#-------------------------------------------------------------------------------
 
@@ -85,6 +85,13 @@ endif
###$(info OMPFLAGS=$(OMPFLAGS))
CXXFLAGS += $(OMPFLAGS)
 
+# Add correct -DHIP_PLATFORM when compiling for HIP
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c
+endif
+
# Set the build flags appropriate to each AVX choice (example: "make AVX=none")
# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -246,7 +253,7 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
# Generic target and build rules: objects from CUDA compilation
$(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@
 
#-------------------------------------------------------------------------------
 
diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h
index 6ae0c42ecb..46d9f02733 100644
--- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h
@@ -92,7 +92,7 @@ namespace mg5amcCpu
inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c )
{
- out << std::complex( c.real(), c.imag() );
+ out << std::complex( c.real(), c.imag() );
return out;
}
 
From 4defb73373f1447904ffce3501327c95ff1a88c0 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Thu, 10 Aug 2023 17:20:16 +0200
Subject: [PATCH 456/509] Fixed warning and changed HIPARCHFLAGS export so it exports to cudacpp_src.mk

---
 .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 10 +++++-----
 epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc | 2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
index bca50cd087..3226401f96 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
@@ -176,11 +176,11 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc)
else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc)
 
#=== Configure the HIP compiler
 
-  # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505)
+  # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505)
  # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below
  ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
-    $(warning CUDA builds are not supported for multi-word CXX "$(CXX)")
-    override CUDA_HOME=disabled
+    $(warning HIP builds are not supported for multi-word CXX "$(CXX)")
+    override HIP_HOME=disabled
  endif
 
  # If HIP_HOME is not set, try to set it from the location of GPUCC
@@ -213,8 +213,6 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc)
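The GPUFLAGS dispatch added to cudacpp_src.mk above keys on the compiler name with $(findstring), which returns its first argument when it occurs anywhere in the second, so full paths such as /usr/local/cuda/bin/nvcc match as well. The same pattern in isolation; the HIPARCHFLAGS value here is an assumption for illustration (an --offload-arch flag for the target AMD GPU):

    GPUCC        ?= hipcc
    HIPARCHFLAGS ?= --offload-arch=gfx90a        # assumed example value

    ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
      GPUFLAGS += -Xcompiler -fPIC -c -x cu
    else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
      GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c
    endif

    show:
    	@echo "GPUFLAGS=$(GPUFLAGS)"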
CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c - export HIPARCHFLAGS - else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) @@ -237,6 +235,8 @@ endif export GPUCC export GPUFLAGS +export HIPARCHFLAGS + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc index 4c9bc9ee6b..0ed26180ca 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc @@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; From 088b329081ec54129b1df764795b86f4e2317569 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 10 Aug 2023 17:22:16 +0200 Subject: [PATCH 457/509] Added -std=c++17 to GPUFLAGS in cudacpp_src.mk --- epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk index d132f0877b..2f90b64b47 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk @@ -89,7 +89,7 @@ CXXFLAGS += $(OMPFLAGS) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c + GPUFLAGS += -std=c++17 $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") From 1899fe34bc61ee219ae97eb11a5389e5ebd54889 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 10 Aug 2023 17:39:57 +0200 Subject: [PATCH 458/509] Revert changes to cudacpp_src.mk, exporting GPUFLAGS now working as expected --- epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk | 2 -- epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index 9415b01fad..81900d1807 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -213,8 +213,6 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c - export HIPARCHFLAGS - else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. 
for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) diff --git a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk index 2f90b64b47..f2804ffb85 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk @@ -89,7 +89,7 @@ CXXFLAGS += $(OMPFLAGS) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += -std=c++17 $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c + GPUFLAGS += -fPIC -c endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") From d38ba00bdf82dd9c5fdceba54a568d8fc66929e8 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 10 Aug 2023 17:43:28 +0200 Subject: [PATCH 459/509] [CODEGEN] Fixed error in runTest.cc and reverted changes in cudacpp_src.mk and cudacpp.mk --- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 2 -- .../madgraph/iolibs/template_files/gpu/cudacpp_src.mk | 2 +- .../madgraph/iolibs/template_files/gpu/runTest.cc | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 3226401f96..563f17c303 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -235,8 +235,6 @@ endif export GPUCC export GPUFLAGS -export HIPARCHFLAGS - #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk index d28c92ec13..7eda8524c0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk @@ -89,7 +89,7 @@ CXXFLAGS += $(OMPFLAGS) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c + GPUFLAGS += -fPIC -c endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc index 4c9bc9ee6b..0ed26180ca 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc @@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; From 62b3e360fa4e605cea7af5d648f290c0f777b086 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 10 Aug 2023 17:49:37 +0200 Subject: [PATCH 460/509] Added changes from CODEGEN into gg_ttgg.mad --- epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk | 6 
+++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
index 81900d1807..59a2c906eb 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
@@ -176,11 +176,11 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc)
else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc)
 
#=== Configure the HIP compiler
 
-  # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505)
+  # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505)
  # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below
  ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
-    $(warning CUDA builds are not supported for multi-word CXX "$(CXX)")
-    override CUDA_HOME=disabled
+    $(warning HIP builds are not supported for multi-word CXX "$(CXX)")
+    override HIP_HOME=disabled
  endif
 
  # If HIP_HOME is not set, try to set it from the location of GPUCC

From efef15d9b6bbea516ba1643099c09913b755b124 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Thu, 10 Aug 2023 18:00:12 +0200
Subject: [PATCH 461/509] [CODEGEN] Regenerated all 5 .sa/.mad processes to remove all errors in code

---
 .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 18 +++++++--------
 .../ee_mumu.mad/SubProcesses/cudacpp.mk | 8 +++----
 .../ee_mumu.mad/SubProcesses/runTest.cc | 2 +-
 epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk | 2 +-
 .../CODEGEN_cudacpp_ee_mumu_log.txt | 12 +++++-----
 .../ee_mumu.sa/SubProcesses/cudacpp.mk | 8 +++----
 .../ee_mumu.sa/SubProcesses/runTest.cc | 2 +-
 epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk | 2 +-
 .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 16 +++++++-------
 .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 8 +++----
 .../cudacpp/gg_tt.mad/SubProcesses/runTest.cc | 2 +-
 epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk | 2 +-
 .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 12 +++++-----
 .../cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 8 +++----
 .../cudacpp/gg_tt.sa/SubProcesses/runTest.cc | 2 +-
 epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk | 2 +-
 .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 20 ++++++++---------
 .../gg_ttg.mad/SubProcesses/cudacpp.mk | 8 +++----
 .../gg_ttg.mad/SubProcesses/runTest.cc | 2 +-
 epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk | 2 +-
 .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 14 ++++++------
 .../cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 8 +++----
 .../cudacpp/gg_ttg.sa/SubProcesses/runTest.cc | 2 +-
 epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk | 2 +-
 .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 16 +++++++-------
 .../CODEGEN_cudacpp_gg_ttgg_log.txt | 14 ++++++------
 .../gg_ttgg.sa/SubProcesses/cudacpp.mk | 8 +++----
 .../gg_ttgg.sa/SubProcesses/runTest.cc | 2 +-
 epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk | 2 +-
 .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 22 +++++++++----------
 .../gg_ttggg.mad/SubProcesses/cudacpp.mk | 8 +++----
 .../gg_ttggg.mad/SubProcesses/runTest.cc | 2 +-
 .../cudacpp/gg_ttggg.mad/src/cudacpp_src.mk | 2 +-
 .../CODEGEN_cudacpp_gg_ttggg_log.txt | 12 +++++-----
 .../gg_ttggg.sa/SubProcesses/cudacpp.mk | 8 +++----
 .../gg_ttggg.sa/SubProcesses/runTest.cc | 2 +-
 epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk | 2 +-
 37 files changed, 123 insertions(+), 141 deletions(-)

diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
index c5451c1383..fa98576109 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0034666061401367188  +DEBUG: model prefixing takes 0.0033915042877197266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -176,7 +176,7 @@ INFO: Creating files in directory P1_epem_mupmum DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -209,20 +209,20 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. WARNING: vector code for lepton pdf not implemented. We removed the option to run dressed lepton  INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum -Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s -Wrote files for 8 helas calls in 0.214 s +Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s +Wrote files for 8 helas calls in 0.221 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.139 s +ALOHA: aloha creates 3 routines in 0.138 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.172 s +ALOHA: aloha creates 7 routines in 0.183 s FFV1 FFV1 FFV2 @@ -254,6 +254,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m5.476s -user 0m1.181s -sys 0m1.503s +real 0m4.967s +user 0m1.195s +sys 0m1.770s diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 9415b01fad..59a2c906eb 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -176,11 +176,11 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) #=== Configure the HIP compiler - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled + $(warning HIP builds are not supported for multi-word CXX "$(CXX)") + override HIP_HOME=disabled endif # If HIP_HOME is not set, try to set it from the location of GPUCC @@ -213,8 +213,6 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c - export HIPARCHFLAGS - else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc index 4c9bc9ee6b..0ed26180ca 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc @@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; diff --git a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk index d132f0877b..f2804ffb85 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk @@ -89,7 +89,7 @@ CXXFLAGS += $(OMPFLAGS) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c + GPUFLAGS += -fPIC -c endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 46d6636a3f..d3ddddc183 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.003462553024291992  +DEBUG: model prefixing takes 0.003403902053833008  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -195,14 +195,14 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/wor DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_epem_mupmum.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s +Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.191 s +ALOHA: aloha creates 4 routines in 0.194 s FFV1 FFV1 FFV2 @@ -225,6 +225,6 @@ INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. and /afs DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.940s -user 0m0.430s -sys 0m0.166s +real 0m0.880s +user 0m0.446s +sys 0m0.188s diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 9415b01fad..59a2c906eb 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -176,11 +176,11 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) #=== Configure the HIP compiler - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled + $(warning HIP builds are not supported for multi-word CXX "$(CXX)") + override HIP_HOME=disabled endif # If HIP_HOME is not set, try to set it from the location of GPUCC @@ -213,8 +213,6 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c - export HIPARCHFLAGS - else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. 
for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc index 4c9bc9ee6b..0ed26180ca 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc @@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; diff --git a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk index d132f0877b..f2804ffb85 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk @@ -89,7 +89,7 @@ CXXFLAGS += $(OMPFLAGS) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c + GPUFLAGS += -fPIC -c endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index d9a82265b6..22062afbf3 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.003400087356567383  +DEBUG: model prefixing takes 0.003465414047241211  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttx DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -214,16 +214,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s -Wrote files for 10 helas calls in 0.318 s +Wrote files for 10 helas calls in 0.254 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.116 s +ALOHA: aloha creates 2 routines in 0.103 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.089 s +ALOHA: aloha creates 4 routines in 0.096 s VVV1 FFV1 FFV1 @@ -254,6 +254,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m5.392s -user 0m1.094s -sys 0m1.559s +real 0m4.770s +user 0m1.113s +sys 0m1.622s diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 9415b01fad..59a2c906eb 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -176,11 +176,11 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) #=== Configure the HIP compiler - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled + $(warning HIP builds are not supported for multi-word CXX "$(CXX)") + override HIP_HOME=disabled endif # If HIP_HOME is not set, try to set it from the location of GPUCC @@ -213,8 +213,6 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c - export HIPARCHFLAGS - else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. 
for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc index 4c9bc9ee6b..0ed26180ca 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc @@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; diff --git a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk index d132f0877b..f2804ffb85 100644 --- a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk @@ -89,7 +89,7 @@ CXXFLAGS += $(OMPFLAGS) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c + GPUFLAGS += -fPIC -c endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 75fe3bae7e..37ba2376ae 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0033168792724609375  +DEBUG: model prefixing takes 0.0033271312713623047  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -200,12 +200,12 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/wor DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttx.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.104 s +ALOHA: aloha creates 2 routines in 0.107 s VVV1 FFV1 FFV1 @@ -227,6 +227,6 @@ INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. 
and /afs/c DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.826s -user 0m0.385s -sys 0m0.132s +real 0m0.819s +user 0m0.408s +sys 0m0.189s diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 9415b01fad..59a2c906eb 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -176,11 +176,11 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) #=== Configure the HIP compiler - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled + $(warning HIP builds are not supported for multi-word CXX "$(CXX)") + override HIP_HOME=disabled endif # If HIP_HOME is not set, try to set it from the location of GPUCC @@ -213,8 +213,6 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c - export HIPARCHFLAGS - else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc index 4c9bc9ee6b..0ed26180ca 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc @@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; diff --git a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk index d132f0877b..f2804ffb85 100644 --- a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk @@ -89,7 +89,7 @@ CXXFLAGS += $(OMPFLAGS) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c + GPUFLAGS += -fPIC -c endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index bd90554f35..1d9203fb02 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0034699440002441406  +DEBUG: model prefixing takes 0.003412008285522461  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.016 s +1 processes with 16 diagrams generated in 0.015 s Total: 1 processes with 16 diagrams output madevent CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -215,15 +215,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.029 s -Wrote files for 36 helas calls in 0.334 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.028 s +Wrote files for 36 helas calls in 0.286 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.235 s +ALOHA: aloha creates 5 routines in 0.227 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -231,7 +231,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.215 s +ALOHA: aloha creates 10 routines in 0.214 s VVV1 VVV1 FFV1 @@ -267,6 +267,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m5.786s -user 0m1.451s -sys 0m1.632s +real 0m5.078s +user 0m1.397s +sys 0m1.624s diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index 9415b01fad..59a2c906eb 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -176,11 +176,11 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) #=== Configure the HIP compiler - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled + $(warning HIP builds are not supported for multi-word CXX "$(CXX)") + override HIP_HOME=disabled endif # If HIP_HOME is not set, try to set it from the location of GPUCC @@ -213,8 +213,6 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c - export HIPARCHFLAGS - else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc index 4c9bc9ee6b..0ed26180ca 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc @@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; diff --git a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk index d132f0877b..f2804ffb85 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk @@ -89,7 +89,7 @@ CXXFLAGS += $(OMPFLAGS) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c + GPUFLAGS += -fPIC -c endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index a069b5df15..79c4b9b67c 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0034067630767822266  +DEBUG: model prefixing takes 0.0035054683685302734  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.015 s +1 processes with 16 diagrams generated in 0.016 s Total: 1 processes with 16 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -202,7 +202,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/wor DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxg.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (16 diagrams) in 0.027 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.026 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -210,7 +210,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.226 s +ALOHA: aloha creates 5 routines in 0.224 s VVV1 VVV1 FFV1 @@ -237,6 +237,6 @@ INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. and /afs/ DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.961s -user 0m0.558s -sys 0m0.141s +real 0m0.919s +user 0m0.552s +sys 0m0.150s diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index 9415b01fad..59a2c906eb 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -176,11 +176,11 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) #=== Configure the HIP compiler - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled + $(warning HIP builds are not supported for multi-word CXX "$(CXX)") + override HIP_HOME=disabled endif # If HIP_HOME is not set, try to set it from the location of GPUCC @@ -213,8 +213,6 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c - export HIPARCHFLAGS - else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. 
for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc index 4c9bc9ee6b..0ed26180ca 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc @@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; diff --git a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk index d132f0877b..f2804ffb85 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk @@ -89,7 +89,7 @@ CXXFLAGS += $(OMPFLAGS) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c + GPUFLAGS += -fPIC -c endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index ff123fd66d..980215fbbe 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.003490447998046875  +DEBUG: model prefixing takes 0.0033049583435058594  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxgg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.293 s -Wrote files for 222 helas calls in 0.687 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.288 s +Wrote files for 222 helas calls in 0.668 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.229 s +ALOHA: aloha creates 5 routines in 0.228 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -272,6 +272,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m5.697s -user 0m2.159s -sys 0m1.553s +real 0m5.769s +user 0m2.121s +sys 0m1.610s diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index e2b5dfc03f..0632d20865 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0033235549926757812  +DEBUG: model prefixing takes 0.003401517868041992  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.111 s +1 processes with 123 diagrams generated in 0.112 s Total: 1 processes with 123 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -204,7 +204,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/wor DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxgg.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.293 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.292 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -212,7 +212,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.219 s +ALOHA: aloha creates 5 routines in 0.221 s VVV1 VVV1 FFV1 @@ -242,6 +242,6 @@ INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. 
and /afs DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m1.592s -user 0m0.987s -sys 0m0.160s +real 0m1.403s +user 0m0.994s +sys 0m0.177s diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index 9415b01fad..59a2c906eb 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -176,11 +176,11 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) #=== Configure the HIP compiler - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled + $(warning HIP builds are not supported for multi-word CXX "$(CXX)") + override HIP_HOME=disabled endif # If HIP_HOME is not set, try to set it from the location of GPUCC @@ -213,8 +213,6 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c - export HIPARCHFLAGS - else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc index 4c9bc9ee6b..0ed26180ca 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc @@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; diff --git a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk index d132f0877b..f2804ffb85 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk @@ -89,7 +89,7 @@ CXXFLAGS += $(OMPFLAGS) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c + GPUFLAGS += -fPIC -c endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 45f48dfb83..37e5758803 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0033910274505615234  +DEBUG: model prefixing takes 0.004005908966064453  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.349 s +1 processes with 1240 diagrams generated in 1.307 s Total: 1 processes with 1240 diagrams output madevent CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -175,11 +175,11 @@ INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1592 term in 22s. Introduce 2768 contraction +INFO: Color-Flow passed to 1592 term in 21s. Introduce 2768 contraction DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -221,15 +221,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 4.555 s -Wrote files for 2281 helas calls in 29.831 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 4.314 s +Wrote files for 2281 helas calls in 28.906 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.219 s +ALOHA: aloha creates 5 routines in 0.218 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -237,7 +237,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.227 s +ALOHA: aloha creates 10 routines in 0.213 s VVV1 VVV1 FFV1 @@ -276,6 +276,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m42.085s -user 0m35.594s -sys 0m2.900s +real 0m39.678s +user 0m34.707s +sys 0m2.723s diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index 9415b01fad..59a2c906eb 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -176,11 +176,11 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) #=== Configure the HIP compiler - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled + $(warning HIP builds are not supported for multi-word CXX "$(CXX)") + override HIP_HOME=disabled endif # If HIP_HOME is not set, try to set it from the location of GPUCC @@ -213,8 +213,6 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c - export HIPARCHFLAGS - else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc index 4c9bc9ee6b..0ed26180ca 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc @@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; diff --git a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk index d132f0877b..f2804ffb85 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk @@ -89,7 +89,7 @@ CXXFLAGS += $(OMPFLAGS) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c + GPUFLAGS += -fPIC -c endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 9102ff337e..00508a246d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0035371780395507812  +DEBUG: model prefixing takes 0.0033860206604003906  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -206,7 +206,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/wor DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxggg.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 4.393 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 4.400 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -214,7 +214,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.250 s +ALOHA: aloha creates 5 routines in 0.266 s VVV1 VVV1 FFV1 @@ -244,6 +244,6 @@ INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/. and /af DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m9.147s -user 0m8.601s -sys 0m0.323s +real 0m9.573s +user 0m8.590s +sys 0m0.331s diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index 9415b01fad..59a2c906eb 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -176,11 +176,11 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) #=== Configure the HIP compiler - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled + $(warning HIP builds are not supported for multi-word CXX "$(CXX)") + override HIP_HOME=disabled endif # If HIP_HOME is not set, try to set it from the location of GPUCC @@ -213,8 +213,6 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c - export HIPARCHFLAGS - else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. 
for CI tests on GPU #443)
 $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))

diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc
index 4c9bc9ee6b..0ed26180ca 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc
@@ -126,7 +126,7 @@ struct CUDATest : public CUDA_CPU_TestBase
   {
     ~DeviceReset()
     {
-      gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full
+      checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full
     }
   } deviceResetter;

diff --git a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk
index d132f0877b..f2804ffb85 100644
--- a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk
+++ b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk
@@ -89,7 +89,7 @@ CXXFLAGS += $(OMPFLAGS)
 ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
   GPUFLAGS += -Xcompiler -fPIC -c -x cu
 else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
-  GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c
+  GPUFLAGS += -fPIC -c
 endif

 # Set the build flags appropriate to each AVX choice (example: "make AVX=none")

From d8890850cb1f311fc41cb6d40c95b00018ce832b Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Tue, 15 Aug 2023 09:42:31 +0200
Subject: [PATCH 462/509] Added warnings if name prefix variable is not set

---
 tools/profiling/buildCUDAProcess.sh | 14 +++++++++++++-
 tools/profiling/buildSYCLProcess.sh |  5 +++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh
index 2dcc7f2ef4..0923aca9ab 100755
--- a/tools/profiling/buildCUDAProcess.sh
+++ b/tools/profiling/buildCUDAProcess.sh
@@ -50,6 +50,11 @@ fi

 # Begin script in case all parameters are correct

+# Warn if neither the CUDA_NAME_PREFIX nor the HIP_NAME_PREFIX variable is set
+if [ -z "$CUDA_NAME_PREFIX" ] && [ -z "$HIP_NAME_PREFIX" ]; then
+    echo "WARNING: neither CUDA_NAME_PREFIX nor HIP_NAME_PREFIX is set. Cannot append system info to JSON file names!"
+fi
+
 ##################################################################
 # Set variables for later use

@@ -120,4 +125,11 @@ $MG_EXE -j $blocksPerGrid $threadsPerBlock $iterations
 echo "${MG_EXE} -j ${blocksPerGrid} ${threadsPerBlock} ${iterations}"

 cd perf/data/
-mv 0-perf-test-run0.json ${REPORT_FOLDER}/test_${MG_PROC}_${CUDA_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json
\ No newline at end of file
+
+if [ -n "$CUDA_NAME_PREFIX" ]; then
+    mv 0-perf-test-run0.json "${REPORT_FOLDER}/test_${MG_PROC}_${CUDA_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json"
+elif [ -n "$HIP_NAME_PREFIX" ]; then
+    mv 0-perf-test-run0.json "${REPORT_FOLDER}/test_${MG_PROC}_${HIP_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json"
+else
+    mv 0-perf-test-run0.json "${REPORT_FOLDER}/test_${MG_PROC}_undefined_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json"
+fi
\ No newline at end of file
diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh
index 5afafb9e0c..efdfd66c71 100755
--- a/tools/profiling/buildSYCLProcess.sh
+++ b/tools/profiling/buildSYCLProcess.sh
@@ -48,6 +48,11 @@ then
   helpFunction
 fi

+# Warn if the SYCL_NAME_PREFIX variable is not set
+if [ -z "$SYCL_NAME_PREFIX" ]; then
+    echo "WARNING: SYCL_NAME_PREFIX is not set. Cannot append system info to JSON file names!"
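+    # (test -z is true when a variable is unset or empty, so an empty prefix also triggers this warning)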
+fi + ################################################################## # Assign correct SM level for NVIDIA GPUs From 22a0ac0fc4ae0899e07564323a1a789602043ed8 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 13 Jul 2023 15:15:41 +0200 Subject: [PATCH 463/509] [CODEGEN] Added GPU abstraction to CODEGEN --- .../iolibs/template_files/gpu/Bridge.h | 30 +- .../template_files/gpu/BridgeKernels.cc | 7 +- .../iolibs/template_files/gpu/BridgeKernels.h | 6 +- .../gpu/CommonRandomNumberKernel.cc | 3 +- .../template_files/gpu/CrossSectionKernels.cc | 5 +- .../template_files/gpu/CrossSectionKernels.h | 4 +- .../iolibs/template_files/gpu/CudaRuntime.h | 4 +- .../gpu/CurandRandomNumberKernel.cc | 10 +- .../template_files/gpu/EventStatistics.h | 2 +- .../template_files/gpu/GpuAbstraction.h | 79 +++++ .../iolibs/template_files/gpu/GpuRuntime.h | 80 +++++ .../iolibs/template_files/gpu/MadgraphTest.h | 6 +- .../gpu/MatrixElementKernels.cc | 24 +- .../template_files/gpu/MatrixElementKernels.h | 6 +- .../template_files/gpu/MemoryAccessHelpers.h | 2 +- .../template_files/gpu/MemoryAccessMomenta.h | 24 +- .../gpu/MemoryAccessRandomNumbers.h | 2 +- .../template_files/gpu/MemoryAccessVectors.h | 2 +- .../iolibs/template_files/gpu/MemoryBuffers.h | 61 ++-- .../gpu/RamboSamplingKernels.cc | 18 +- .../template_files/gpu/RamboSamplingKernels.h | 4 +- .../template_files/gpu/RandomNumberKernels.h | 4 +- .../iolibs/template_files/gpu/check_sa.cc | 74 ++-- .../template_files/gpu/cpp_hel_amps_h.inc | 2 +- .../iolibs/template_files/gpu/cudacpp.mk | 323 +++++++++++------- .../iolibs/template_files/gpu/fbridge.cc | 10 +- .../iolibs/template_files/gpu/fsampler.cc | 6 +- .../iolibs/template_files/gpu/mgOnGpuConfig.h | 25 +- .../template_files/gpu/mgOnGpuCxtypes.h | 16 +- .../template_files/gpu/mgOnGpuFptypes.h | 8 +- .../template_files/gpu/mgOnGpuVectors.h | 18 +- .../iolibs/template_files/gpu/process_cc.inc | 2 +- .../gpu/process_function_definitions.inc | 32 +- .../iolibs/template_files/gpu/process_h.inc | 8 +- .../template_files/gpu/process_matrix.inc | 8 +- .../gpu/process_sigmaKin_function.inc | 9 +- .../iolibs/template_files/gpu/rambo.h | 6 +- .../iolibs/template_files/gpu/runTest.cc | 10 +- .../iolibs/template_files/gpu/testmisc.cc | 2 +- .../iolibs/template_files/gpu/testxxx.cc | 6 +- .../PLUGIN/CUDACPP_SA_OUTPUT/output.py | 6 +- 41 files changed, 601 insertions(+), 353 deletions(-) create mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h create mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h index bf8b5e024d..51241e9840 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ 
-150,7 +150,7 @@ namespace mg5amcCpu
     unsigned int m_nevt;  // number of events
     int m_nGoodHel;       // the number of good helicities (-1 initially when they have not yet been calculated)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     int m_gputhreads;     // number of gpu threads (default set from number of events, can be modified)
     int m_gpublocks;      // number of gpu blocks (default set from number of events, can be modified)
     DeviceBuffer m_devMomentaF;
@@ -187,12 +187,12 @@ namespace mg5amcCpu
   // Forward declare transposition methods
   //

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL

   template<typename Tin, typename Tout>
   __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );

-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL

   template<typename Tin, typename Tout>
   void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );

@@ -209,7 +209,7 @@ namespace mg5amcCpu
   Bridge<FORTRANFPTYPE>::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F )
     : m_nevt( nevtF )
     , m_nGoodHel( -1 )
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     , m_gputhreads( 256 )                  // default number of gpu threads
     , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads
     , m_devMomentaF( m_nevt )
@@ -233,7 +233,7 @@ namespace mg5amcCpu
   {
     if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" );
     if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) )
       throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) );
     while( m_nevt != m_gpublocks * m_gputhreads )
@@ -249,7 +249,7 @@ namespace mg5amcCpu
 #else
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
     m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
     // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads?
@@ -262,7 +262,7 @@ namespace mg5amcCpu
       process.initProc( paramCard );
   }

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads )
   {
@@ -276,7 +276,7 @@ namespace mg5amcCpu
   }
 #endif

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta,
                                             const FORTRANFPTYPE* gs,
@@ -291,14 +291,14 @@ namespace mg5amcCpu
     constexpr int neppM = MemoryAccessMomenta::neppM;
     if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
     {
-      checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
     }
     else
     {
-      checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
       const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
       //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
-      dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+      gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
     }
     if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
     {
@@ -341,7 +341,7 @@ namespace mg5amcCpu
   }
 #endif

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta,
                                             const FORTRANFPTYPE* gs,
@@ -396,7 +396,7 @@ namespace mg5amcCpu
   // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
   //

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<typename Tin, typename Tout>
   __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
   {
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc
index d58066c9c1..6034db93ec 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc
@@ -5,13 +5,14 @@

 #include "BridgeKernels.h"

+#include "GpuAbstraction.h"
 #include "MemoryAccessMomenta.h"

 #include

//============================================================================

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
 #else
namespace mg5amcCpu
@@ -45,7 +46,7 @@ namespace mg5amcCpu

//============================================================================

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
namespace mg5amcCpu
{
@@ -96,7 +97,7 @@ namespace mg5amcCpu

//============================================================================

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
{
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h
index 15eb4bff4d..7c7feb692a 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h
@@ -12,7 +12,7 @@
 #include "MatrixElementKernels.h"
 #include "MemoryBuffers.h"

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
 #else
namespace mg5amcCpu
@@ -49,7 +49,7 @@ namespace mg5amcCpu

 //--------------------------------------------------------------------------

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A Bridge wrapper class encapsulating matrix element calculations on a CPU host
   class BridgeKernelHost final : public BridgeKernelBase
   {
@@ -89,7 +89,7 @@ namespace mg5amcCpu

 //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A Bridge wrapper class encapsulating matrix element calculations on a GPU device
   class BridgeKernelDevice : public BridgeKernelBase
   {
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc
index 985b39f576..f17b9c0ad7 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc
+++ 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc @@ -4,12 +4,13 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc index 0b355a3c8d..36ca2a94d4 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc @@ -5,6 +5,7 @@ #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h index 7933ca4bbf..ff2350a14d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h index 64ce52f4b3..df0c3f3df8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h @@ -15,7 +15,7 @@ //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ #define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) { @@ -29,7 +29,7 @@ inline void assertCuda( cudaError_t code, const char* file, int line, bool abort //-------------------------------------------------------------------------- 
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { // Instantiate a CudaRuntime at the beginnining of the application's main to diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc index eb56333b03..5b33207ad0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc @@ -3,7 +3,7 @@ // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_CUDACC namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_CUDACC if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h index 48b51e0a49..e7d7f3b3c3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h new file mode 100644 index 0000000000..98a0124b55 --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -0,0 +1,79 @@ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +#ifdef MGONGPUCPP_GPUIMPL + #define MGONGPUCPP_CUDACC 1 +#endif + +#ifdef __HIPCC__ + #include "hip/hip_runtime.h" + #define MGONGPUCPP_HIPCC 1 +#endif + +#ifdef MGONGPUCPP_CUDACC + + // Defines correct compiler + #define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL + + //-------------------------------------------------------------------------- + + #define gpuError_t cudaError_t + #define gpuPeekAtLastError cudaPeekAtLastError + #define gpuGetErrorString cudaGetErrorString + #define gpuSuccess 
cudaSuccess
+
+  #define gpuMallocHost(ptr, size) checkGpu( cudaMallocHost(ptr, size) )
+  #define gpuMalloc(ptr, size) checkGpu( cudaMalloc(ptr, size) )
+
+  #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( cudaMemcpy(dstData, srcData, srcBytes, func) )
+  #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+  #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+  #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( cudaMemcpyToSymbol(type1, type2, size) )
+
+  #define gpuFree(ptr) checkGpu( cudaFree(ptr) )
+  #define gpuFreeHost(ptr) checkGpu( cudaFreeHost(ptr) )
+
+  #define gpuSetDevice cudaSetDevice
+  #define gpuDeviceSynchronize cudaDeviceSynchronize
+  #define gpuDeviceReset cudaDeviceReset
+
+  #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__)
+  #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__)
+
+//--------------------------------------------------------------------------
+
+#elif defined MGONGPUCPP_HIPCC
+
+  // Defines correct compiler
+  #define MGONGPUCPP_GPUIMPL __HCC__
+
+  //--------------------------------------------------------------------------
+
+  #define gpuError_t hipError_t
+  #define gpuPeekAtLastError hipPeekAtLastError
+  #define gpuGetErrorString hipGetErrorString
+  #define gpuSuccess hipSuccess
+
+  #define gpuMallocHost(ptr, size) checkGpu( hipHostMalloc(ptr, size) ) // HostMalloc better
+  #define gpuMalloc(ptr, size) checkGpu( hipMalloc(ptr, size) )
+
+  #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( hipMemcpy(dstData, srcData, srcBytes, func) )
+  #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+  #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+  #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( hipMemcpyToSymbol(type1, type2, size) )
+
+  #define gpuFree(ptr) checkGpu( hipFree(ptr) )
+  #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) )
+
+  #define gpuSetDevice hipSetDevice
+  #define gpuDeviceSynchronize hipDeviceSynchronize
+  #define gpuDeviceReset hipDeviceReset
+
+  #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__)
+  #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__)
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
\ No newline at end of file
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
new file mode 100644
index 0000000000..86c9179f4c
--- /dev/null
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
@@ -0,0 +1,80 @@
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include <iostream>
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != gpuSuccess )
+  {
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct GpuRuntime final
+  {
+    GpuRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
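+      // (any GPU runtime call can be checked in the same way: e.g. "checkGpu( gpuDeviceSynchronize() );"
+      // aborts with file and line information if the call does not return gpuSuccess)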
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h index ffe3b84d53..3fa9f13a82 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index 30257195b6..dd3eee4ea3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -6,7 +6,7 @@ #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
+    checkGpu( gpuPeekAtLastError() );
     // ... 0d2. Copy back good helicity mask to the host
     copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
     // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -219,19 +219,19 @@

   void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
   {
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
     constexpr unsigned int sharedMemSize = 0;
 #else
     constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
-    checkCuda( cudaDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() );
+    checkGpu( gpuDeviceSynchronize() );
   }

   //--------------------------------------------------------------------------

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h
index 23e84757a2..4477a385ed 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h
@@ -10,7 +10,7 @@

 #include "MemoryBuffers.h"

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -81,7 +81,7 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a
CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h index c82a6c7635..67306c3922 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h index 29266de32c..dc4bb2aa22 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -28,17 +28,17 @@ namespace mg5amcCpu { public: - // Number of Events Per Page in the momenta AOSOA memory buffer layout - // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ - // ----------------------------------------------------------------------------------------------- - // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline - // --- This is relevant to ensure coalesced access to momenta in global memory - // --- Note that neppR is hardcoded and may differ from neppM and neppV on some platforms - // ----------------------------------------------------------------------------------------------- - //static constexpr int neppM = 64/sizeof(fptype); // 2x 32-byte GPU cache lines (512 bits): 8 (DOUBLE) or 16 (FLOAT) - static constexpr int neppM = 32/sizeof(fptype); // (DEFAULT) 32-byte GPU cache line (256 bits): 4 (DOUBLE) or 8 (FLOAT) - //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** (slower: 1.03E9 instead of 1.11E9 in eemumu) + // Number of Events Per Page in the momenta AOSOA memory buffer layout + // (these are all best kept as a compile-time constants: see issue #23) +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + // ----------------------------------------------------------------------------------------------- + // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline + // --- This is 
relevant to ensure coalesced access to momenta in global memory + // --- Note that neppR is hardcoded and may differ from neppM and neppV on some platforms + // ----------------------------------------------------------------------------------------------- + //static constexpr int neppM = 64/sizeof(fptype); // 2x 32-byte GPU cache lines (512 bits): 8 (DOUBLE) or 16 (FLOAT) + static constexpr int neppM = 32/sizeof(fptype); // (DEFAULT) 32-byte GPU cache line (256 bits): 4 (DOUBLE) or 8 (FLOAT) + //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** (slower: 1.03E9 instead of 1.11E9 in eemumu) #else // ----------------------------------------------------------------------------------------------- // --- CPUs: neppM is best set equal to the number of fptype's (neppV) in a vector register diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h index e2988d39f3..949a42066d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h index e9b197368e..a9ae26b6dc 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h index 48306a9d41..d6ba45dcad 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h @@ -11,12 +11,11 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" #include "Parameters_%(model_name)s.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +86,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +118,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +127,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( 
cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +147,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +174,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +190,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +212,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +231,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +256,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +275,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +295,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +314,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef 
__CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +332,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +351,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +369,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +384,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +402,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +420,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +438,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +456,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +474,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +486,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +503,13 @@ namespace mg5amcCpu throw 
std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +526,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc index da68aa9255..8745b084d3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc @@ -5,7 +5,7 @@ #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h index 184089efd7..fe63a7bb77 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h index 188a72c2c9..0c215f2583 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h @@ -8,7 +8,7 @@ #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index f5f08dc64e..9a39220077 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -63,7 +63,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +77,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
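// ------------------------------------------------------------------------------------------------
// [Editorial aside] The renames above (checkCuda( cudaMalloc( ... ) ) -> gpuMalloc( ... ),
// <<<blocks,threads>>> -> gpuLaunchKernel( ... ), __CUDACC__ -> MGONGPUCPP_GPUIMPL) all funnel
// through the new GpuRuntime.h/GpuAbstraction.h headers, whose contents are not part of this
// patch. A minimal hypothetical sketch of the pattern, assuming a checkGpu error-checking helper
// analogous to the old checkCuda (names and details below are illustrative only):
#if defined( __CUDACC__ ) // nvcc defines __CUDACC__
#define MGONGPUCPP_GPUIMPL 1
#define gpuMalloc( pptr, bytes ) checkGpu( cudaMalloc( pptr, bytes ) )
#define gpuMallocHost( pptr, bytes ) checkGpu( cudaMallocHost( pptr, bytes ) ) // pinned host memory
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined( __HIPCC__ ) // hipcc defines __HIPCC__
#define MGONGPUCPP_GPUIMPL 1
#define gpuMalloc( pptr, bytes ) checkGpu( hipMalloc( pptr, bytes ) )
#define gpuMallocHost( pptr, bytes ) checkGpu( hipHostMalloc( pptr, bytes ) ) // NB: HIP names the pinned allocation hipHostMalloc
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( hipMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ ) // hip-clang also accepts the triple-chevron syntax
#endif
// ------------------------------------------------------------------------------------------------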
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -102,7 +102,7 @@ main( int argc, char** argv ) CurandHost = 1, CurandDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand @@ -115,7 +115,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -145,7 +145,7 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rndgen = RandomNumberMode::CurandDevice; #else throw std::runtime_error( "CurandDevice is not supported on CPUs" ); @@ -165,7 +165,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +239,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,7 +263,7 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 00. 
Initialise cuda // Instantiate a CudaRuntime at the beginning of the application's main to @@ -292,7 +292,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +300,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +308,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +316,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +333,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +342,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +351,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +359,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +367,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -394,7 +394,7 @@ main( int argc, char** argv ) const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL else { const bool onDevice = true; @@ -421,7 +421,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +432,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +440,7 @@ main( int argc, char**
argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +482,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +514,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +559,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +588,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -729,7 +729,7 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rndgentxt += " (CUDA code)"; #else rndgentxt += " (C++ code)"; @@ -738,7 +738,7 @@ main( int argc, char** argv ) // Workflow description summary std::string wrkflwtxt; // -- CUDA or C++? -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL wrkflwtxt += "CUD:"; #else wrkflwtxt += "CPP:"; @@ -754,7 +754,7 @@ main( int argc, char** argv ) wrkflwtxt += "???+"; // no path to this statement #endif // -- CUCOMPLEX or THRUST or STD complex numbers? -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; #elif defined MGONGPU_CUCXTYPE_THRUST @@ -789,7 +789,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +845,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -864,7 +864,7 @@ main( int argc, char** argv ) #endif // Dump all configuration parameters and all results std::cout << std::string( SEP79, '*' ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" @@ -892,7 +892,7 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST @@ -906,7 +906,7 @@ main( int argc, char** argv ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +937,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,7 +1033,7 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST @@ -1048,7 +1048,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc index 562af241af..594fb770c5 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc @@ -26,7 +26,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 222d75f846..d98dca1eb3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -103,71 +103,139 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %%bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of NVCC + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
+ MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! + CUOPTFLAGS = -lineinfo + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + + CUDATESTFLAGS = -lcuda + + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) +else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) + #=== Configure the HIP compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable HIP builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning HIP builds are not supported for multi-word CXX "$(CXX)") + override HIP_HOME=disabled + endif + + # If HIP_HOME is not set, try to set it from the location of GPUCC + ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") + endif + + # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists + ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + GPUCC = $(HIP_HOME)/bin/hipcc + + # Should maybe find something equivalent to this in HIP + #USE_NVTX ?=-DUSE_NVTX + + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + + # -DHIP_FAST_MATH equivalent to -use_fast_math in HIP + # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + + else ifneq ($(origin REQUIRE_HIP),undefined) + # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + else + # No hip.
Switch hip compilation off and go to common random numbers in C++ + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -179,9 +247,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif @@ -191,11 +259,11 @@ endif # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) - CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3%% both for none and sse4 + CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3% both for none and sse4 # Throughput references without the extra flags below: none=1.41-1.42E6, sse4=2.15-2.19E6 ###CXXFLAGS+= -DNO_WARN_X86_INTRINSICS # no change ###CXXFLAGS+= -fpeel-loops # no change - ###CXXFLAGS+= -funroll-loops # gains ~1%% for none, loses ~1%% for sse4 + ###CXXFLAGS+= -funroll-loops # gains ~1% for none, loses ~1% for sse4 ###CXXFLAGS+= -ftree-vectorize # no change ###CXXFLAGS+= -flto # would increase to none=4.08-4.12E6, sse4=4.99-5.03E6! else @@ -205,7 +273,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -215,10 +283,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -269,7 +337,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edge case for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -344,13 +415,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -359,7 +430,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -368,7 +439,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -420,11 +491,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -437,7 +508,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),)
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -468,28 +539,32 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) -$(BUILDDIR)/%%.o : %%.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +ifneq ($(GPUCC),) +$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ -$(BUILDDIR)/%%_cu.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# NB: the -x cu flag formerly hardcoded in the rule above is now included via $(CCBUILDRULEFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%%.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edge case for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -505,10 +580,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -516,8 +591,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -544,7 +619,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -556,17 +631,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- # Target (and build rules): Fortran include files -###$(INCDIR)/%%.inc : ../%%.inc +###$(INCDIR)/%.inc : ../%.inc ### @if [ ! 
-d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### \cp $< $@ @@ -577,27 +652,27 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(cu_main): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%%.o : %%.f *.inc +$(BUILDDIR)/%.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%%.o : %%.f *.inc +###$(BUILDDIR)/%.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ @@ -612,17 +687,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -634,7 +709,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -647,7 +722,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -659,14 +734,14 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(testmain): 
$(BUILDDIR)/runTest_cu.o $(testmain): cu_objects_exe += $(BUILDDIR)/runTest_cu.o @@ -688,14 +763,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -798,9 +873,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -819,7 +894,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck @@ -850,14 +925,14 @@ cmpFcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: cmpFGcheck (compare ME results from the CUDA and Fortran with CUDA MEs standalone executables, with a small number of events) cmpFGcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/CUDA) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) memcheck: all.$(TAG) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc index 2d2b36d560..34ca33ded6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc @@ -5,7 +5,7 @@ #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,7 +46,7 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL CudaRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) @@ -65,7 +65,7 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL CudaRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc index 2fb445372d..acffa7c19e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 02bfdcf8f5..5b04029787 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -6,6 +6,8 @@ #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 +#include "GpuRuntime.h" // Includes the GPU abstraction + // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) %(mgongpu_supports_multichannel)s @@ -15,9 +17,10 @@ // Choose if curand is supported for generating random numbers // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_CUDACC #undef MGONGPU_HAS_NO_CURAND -#else +#elif defined MGONGPUCPP_HIPCC +#define MGONGPU_HAS_NO_CURAND 1 //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 #endif @@ -53,20 +56,20 @@ ////#define MGONGPU_HARDCODE_PARAM 1 // Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif // Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) #endif // Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 #endif @@ -85,14 +88,14 @@ #endif // SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL #endif #endif // SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif @@ -132,7 +135,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -143,7 +146,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA implementation has no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,7 +178,7 @@ using mgOnGpu::fptype2; // Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined MGONGPUCPP_GPUIMPL && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -188,7 +191,7 @@ using mgOnGpu::fptype2; #endif /* clang-format on */ // Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h index b56348bc58..b5e1f1a495 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -216,7 +216,7 @@ namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -301,7 +301,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -337,11 +337,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -556,11 +556,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX 
// c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -604,7 +604,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h index 905c97d700..d9a955c235 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h index e1299ba81e..de12c1d24f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h @@ -9,6 +9,8 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuFptypes.h" +#include "GpuAbstraction.h" + #include //========================================================================== @@ -131,7 +133,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +155,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +807,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +855,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL 
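// A minimal sketch of the guard pattern that the mgOnGpuCxtypes.h, mgOnGpuFptypes.h
// and mgOnGpuVectors.h hunks above all apply: the vendor-neutral MGONGPUCPP_GPUIMPL
// macro now selects the device code path for both CUDA and HIP builds, and its
// absence selects the host C++ path. The helper below (toySqrt, a hypothetical name)
// only illustrates the pattern; it is not taken from the patched headers.
#ifdef MGONGPUCPP_GPUIMPL
__host__ __device__ inline fptype toySqrt( const fptype& f ) { return sqrt( f ); } // device math, CUDA or HIP
#else
inline fptype toySqrt( const fptype& f ) { return std::sqrt( f ); } // host math in C++ builds
#endif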
//========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +881,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc index 778e210468..9dceb45708 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -14,7 +14,7 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" %(hel_amps_h)s #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index c3c0812b94..d4e999733f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -10,7 +10,9 @@ // Class member functions for calculating the matrix elements for %(process_lines)s -#ifdef __CUDACC__ +#include "GpuRuntime.h" + +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -44,7 +46,7 @@ namespace mg5amcCpu %(cipdhrdcod)s %(cipchrdcod)s #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL %(cipddevice)s %(cipcdevice)s #else @@ -54,7 +56,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -80,8 +82,8 @@ namespace mg5amcCpu // Helicities for the process [NB do keep 'static' for this constexpr array, see issue #283] // *** NB There is no automatic check yet that these are in the same order as Fortran! #569 *** %(all_helicities)s -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -117,7 +119,7 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory %(cipdassign)s %(cipcassign)s -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL %(cipd2tipdSym)s %(cipc2tipcSym)s #else @@ -150,7 +152,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
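// An aside on the cHel hunk above, with the expansion spelled out: gpuMemcpyToSymbol
// is one of the GpuAbstraction.h macros introduced in this series, so a call such as
//   gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) );
// expands to checkGpu( cudaMemcpyToSymbol( cHel, tHel, ... ) ) in CUDA builds and to
// checkGpu( hipMemcpyToSymbol( cHel, tHel, ... ) ) in HIP builds; the error check that
// the old checkCuda( cudaMemcpyToSymbol( ... ) ) call made explicit is therefore now
// folded into the macro itself.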
 // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file]
 // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712]
 #ifdef __NVCC__
@@ -215,12 +217,12 @@ namespace mg5amcCpu
   __global__ void /* clang-format off */
   computeDependentCouplings( const fptype* allgs,  // input: Gs[nevt]
                              fptype* allcouplings  // output: couplings[nevt*ndcoup*2]
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
                              , const int nevt      // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
                              ) /* clang-format on */
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     using namespace mg5amcGpu;
     using G_ACCESS = DeviceAccessGs;
     using C_ACCESS = DeviceAccessCouplings;
@@ -241,7 +243,7 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
   __global__ void
   sigmaKin_getGoodHel( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
@@ -367,9 +369,9 @@ namespace mg5amcCpu
         nGoodHel++;
       }
     }
-#ifdef __CUDACC__
-    checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) );
-    checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) );
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) );
+    gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) );
 #else
     cNGoodHel = nGoodHel;
     for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel];
@@ -393,7 +395,7 @@ namespace mg5amcCpu
 #endif
             int* allselhel, // output: helicity selection[nevt]
             int* allselcol  // output: color selection[nevt]
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
             , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
             ) /* clang-format on */
@@ -414,7 +416,7 @@ namespace mg5amcCpu
     // Denominators: spins, colors and identical particles
     constexpr int helcolDenominators[1] = { %(den_factors)s }; // assume nprocesses == 1 (#272 and #343)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     // Remember: in CUDA this is a kernel for one event, in c++ this processes n events
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
 #else
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc
index 893f7f3215..8a9de336f2 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc
@@ -23,7 +23,7 @@

 //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -32,7 +32,7 @@ namespace mg5amcCpu
 %(process_class_definitions)s
   //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   __global__ void
   computeDependentCouplings( const fptype* allgs,    // input: Gs[nevt]
                              fptype* allcouplings ); // output: couplings[nevt*ndcoup*2]
@@ -45,7 +45,7 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
   __global__ void
   sigmaKin_getGoodHel(
const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -75,7 +75,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc index 1e473edcf8..241c50a9d1 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -7,6 +7,8 @@ ! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== +#include "GpuAbstraction.h" + // *** COLOR CHOICE BELOW *** // Store the leading color flows for choice of color if( jamp2_sv ) // disable color choice if nullptr @@ -17,7 +19,7 @@ // (This method used to be called %(process_class_name)s::matrix_%(proc_name)s(%(matrix_args)s)?) %(color_matrix_lines)s -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -74,7 +76,7 @@ #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -133,7 +135,7 @@ MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%%6d ihel=%%2d me_running=%%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index 9fcd58196b..59c1623c5a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -6,9 +6,12 @@ ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. ! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
!========================================================================== + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -36,7 +39,7 @@ // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -240,7 +243,7 @@ // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h index e02ea52496..3a331b979a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. 
) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc index d4a760a71b..6f20a7248a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc index 895d6eeb56..5d00e2c06c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc index 3e6569b553..6f8736c120 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -76,7 +76,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt %% neppM == 0 ); // nevt must be a multiple of neppM assert( nevt %% neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -321,7 +321,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), 
testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index aebab6f1a7..a947f262b0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -86,9 +86,9 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): 'CMake': [s+'CMake/Compilers.txt', s+'CMake/Platforms.txt', s+'CMake/Macros.txt'], 'src': [s+'gpu/rambo.h', s+'read_slha.h', s+'read_slha.cc', s+'gpu/mgOnGpuFptypes.h', s+'gpu/mgOnGpuCxtypes.h', s+'gpu/mgOnGpuVectors.h', - s+'CMake/src/CMakeLists.txt'], + s+'CMake/src/CMakeLists.txt', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h'], 'SubProcesses': [s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', - s+'gpu/ompnumthreads.h', s+'gpu/CudaRuntime.h', + s+'gpu/ompnumthreads.h', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h', s+'gpu/MemoryAccessHelpers.h', s+'gpu/MemoryAccessVectors.h', s+'gpu/MemoryAccessMatrixElements.h', s+'gpu/MemoryAccessMomenta.h', s+'gpu/MemoryAccessRandomNumbers.h', s+'gpu/MemoryAccessWeights.h', @@ -109,7 +109,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): s+'CMake/SubProcesses/CMakeLists.txt'], 'test': [s+'gpu/cudacpp_test.mk']} to_link_in_P = ['nvtx.h', 'timer.h', 'timermap.h', - 'ompnumthreads.h', 'CudaRuntime.h', + 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', From 81cf765fb205389563a1627f595215474c87d983 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 13 Jul 2023 15:55:32 +0200 Subject: [PATCH 464/509] [jthip] change % to %% in CODEGEN cudacpp.mk --- .../iolibs/template_files/gpu/cudacpp.mk | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index d98dca1eb3..f024f15ce7 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -118,7 +118,7 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) # If CUDA_HOME is not set, try to set it from the location of NVCC ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %%bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif @@ -188,7 +188,7 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) # If HIP_HOME is not set, try to set it from the location of GPUCC ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + HIP_HOME = $(patsubst %%bin/hipcc,%%,$(HIP_COMPILER_PATH)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif @@ -259,11 +259,11 @@ endif # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) - CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3% both for none and sse4 + CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3%% both for none and sse4 # Throughput references without the extra flags below: none=1.41-1.42E6, 
sse4=2.15-2.19E6 ###CXXFLAGS+= -DNO_WARN_X86_INTRINSICS # no change ###CXXFLAGS+= -fpeel-loops # no change - ###CXXFLAGS+= -funroll-loops # gains ~1% for none, loses ~1% for sse4 + ###CXXFLAGS+= -funroll-loops # gains ~1%% for none, loses ~1%% for sse4 ###CXXFLAGS+= -ftree-vectorize # no change ###CXXFLAGS+= -flto # would increase to none=4.08-4.12E6, sse4=4.99-5.03E6! else @@ -540,11 +540,11 @@ $(BUILDDIR)/.build.$(TAG): # Generic target and build rules: objects from CUDA compilation ifneq ($(GPUCC),) -$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%%.o : %%.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ -$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%%_cu.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif @@ -552,7 +552,7 @@ endif # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%%.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ @@ -641,7 +641,7 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): Fortran include files -###$(INCDIR)/%.inc : ../%.inc +###$(INCDIR)/%%.inc : ../%%.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### \cp $< $@ @@ -657,7 +657,7 @@ ifneq ($(shell $(CXX) --version | grep ^Intel),) $(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') $(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(cu_main): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o @@ -667,12 +667,12 @@ endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%%.o : %%.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%%.o : %%.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ @@ -741,7 +741,7 @@ ifneq ($(shell $(CXX) --version | grep ^Intel),) $(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') $(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(testmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(testmain): $(BUILDDIR)/runTest_cu.o $(testmain): cu_objects_exe += $(BUILDDIR)/runTest_cu.o @@ -925,14 +925,14 @@ cmpFcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: cmpFGcheck (compare ME results from the CUDA and Fortran with CUDA MEs standalone executables, with a small number of events) cmpFGcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! 
Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) memcheck: all.$(TAG) From b83f8c94157989c99937013fa3f2756de07a99e9 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 13 Jul 2023 16:25:15 +0200 Subject: [PATCH 465/509] [jthip] clang-format GpuAbstraction.h both in CODEGEN and in ggttgg.mad --- .../template_files/gpu/GpuAbstraction.h | 86 +++++++++---------- .../gg_ttgg.mad/SubProcesses/GpuAbstraction.h | 79 +++++++++++++++++ 2 files changed, 122 insertions(+), 43 deletions(-) create mode 100644 epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 98a0124b55..2f000e33d1 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -4,75 +4,75 @@ #include #ifdef MGONGPUCPP_GPUIMPL - #define MGONGPUCPP_CUDACC 1 +#define MGONGPUCPP_CUDACC 1 #endif #ifdef __HIPCC__ - #include "hip/hip_runtime.h" - #define MGONGPUCPP_HIPCC 1 +#include "hip/hip_runtime.h" +#define MGONGPUCPP_HIPCC 1 #endif #ifdef MGONGPUCPP_CUDACC - // Defines correct compiler - #define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL +// Defines correct compiler +#define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL - //-------------------------------------------------------------------------- +//-------------------------------------------------------------------------- - #define gpuError_t cudaError_t - #define gpuPeekAtLastError cudaPeekAtLastError - #define gpuGetErrorString cudaGetErrorString - #define gpuSuccess cudaSuccess +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess - #define gpuMallocHost(ptr, size) checkGpu( cudaMallocHost(ptr, size) ) - #define gpuMalloc(ptr, size) checkGpu( cudaMalloc(ptr, size) ) +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) - #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( cudaMemcpy(dstData, srcData, srcBytes, func) ) - #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice - #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost - #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( cudaMemcpyToSymbol(type1, type2, size) ) +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) - #define gpuFree(ptr) checkGpu( cudaFree(ptr) ) - #define gpuFreeHost(ptr) checkGpu( cudaFreeHost(ptr) ) +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) - 
#define gpuSetDevice cudaSetDevice - #define gpuDeviceSynchronize cudaDeviceSynchronize - #define gpuDeviceReset cudaDeviceReset +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset - #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<>> (__VA_ARGS__) - #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<>>(__VA_ARGS__) +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) //-------------------------------------------------------------------------- #elif defined MGONGPUCPP_HIPCC - // Defines correct compiler - #define MGONGPUCPP_GPUIMPL __HCC__ +// Defines correct compiler +#define MGONGPUCPP_GPUIMPL __HCC__ - //-------------------------------------------------------------------------- +//-------------------------------------------------------------------------- - #define gpuError_t hipError_t - #define gpuPeekAtLastError hipPeekAtLastError - #define gpuGetErrorString hipGetErrorString - #define gpuSuccess hipSuccess +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess - #define gpuMallocHost(ptr, size) checkGpu( hipHostMalloc(ptr, size) ) // HostMalloc better - #define gpuMalloc(ptr, size) checkGpu( hipMalloc(ptr, size) ) +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) - #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( hipMemcpy(dstData, srcData, srcBytes, func) ) - #define gpuMemcpyHostToDevice hipMemcpyHostToDevice - #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost - #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( hipMemcpyToSymbol(type1, type2, size) ) +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) - #define gpuFree(ptr) checkGpu( hipFree(ptr) ) - #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) ) +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) - #define gpuSetDevice hipSetDevice - #define gpuDeviceSynchronize hipDeviceSynchronize - #define gpuDeviceReset hipDeviceReset +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset - #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<>> (__VA_ARGS__) - #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<>>(__VA_ARGS__) +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... 
) kernel<<>>( __VA_ARGS__ ) #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..2f000e33d1 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,79 @@ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +#ifdef MGONGPUCPP_GPUIMPL +#define MGONGPUCPP_CUDACC 1 +#endif + +#ifdef __HIPCC__ +#include "hip/hip_runtime.h" +#define MGONGPUCPP_HIPCC 1 +#endif + +#ifdef MGONGPUCPP_CUDACC + +// Defines correct compiler +#define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL + +//-------------------------------------------------------------------------- + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined MGONGPUCPP_HIPCC + +// Defines correct compiler +#define MGONGPUCPP_GPUIMPL __HCC__ + +//-------------------------------------------------------------------------- + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... 
) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
\ No newline at end of file

From 1afbafc2a84c6a8d2d4c9e867e3fb8baae0843c5 Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Thu, 13 Jul 2023 16:27:55 +0200
Subject: [PATCH 466/509] [jthip] clang-format GpuRuntime.h both in CODEGEN and
 in ggttgg.mad

---
 .../madgraph/iolibs/template_files/gpu/GpuRuntime.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
index 86c9179f4c..895a662e52 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
@@ -59,7 +59,7 @@ namespace mg5amcGpu
       // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
       // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
       if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
-      checkGpu ( gpuSetDevice( 0 ) ); // SLOW!
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
     }

     // Tear down CUDA application (call cudaDeviceReset)

From d1f5c5ba18ab19e76d818c052528c13d3af3d756 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Mon, 17 Jul 2023 14:47:32 +0200
Subject: [PATCH 467/509] Made the code-generated files the same as the
 templated files in gg_ttgg

---
 .../madgraph/iolibs/template_files/gpu/Bridge.h   | 4 ++--
 .../iolibs/template_files/gpu/MatrixElementKernels.cc | 4 ++--
 .../madgraph/iolibs/template_files/gpu/fbridge.cc | 4 ++--
 .../cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
index 51241e9840..bcdfe29154 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
@@ -291,11 +291,11 @@ namespace mg5amcCpu
     constexpr int neppM = MemoryAccessMomenta::neppM;
     if constexpr( neppM == 1 && std::is_same_v )
     {
-      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice );
+      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
     }
     else
     {
-      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice );
+      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
       const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
       //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread...
this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index dd3eee4ea3..a9e20e114f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -226,9 +226,9 @@ namespace mg5amcGpu constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernelSharedmem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - gpuLaunchKernelSharedmem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif checkGpu( gpuPeekAtLastError() ); checkGpu( gpuDeviceSynchronize() ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc index 34ca33ded6..592a8c74bb 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc @@ -47,7 +47,7 @@ extern "C" void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { #ifdef MGONGPUCPP_GPUIMPL - CudaRuntime::setUp(); + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -66,7 +66,7 @@ extern "C" if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; #ifdef MGONGPUCPP_GPUIMPL - CudaRuntime::tearDown(); + GpuRuntime::tearDown(); #endif } diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index 30257195b6..7a3d4c1b75 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -6,7 +6,7 @@ #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -202,7 +202,7 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); 
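// The same substitution appears in the hunk just below: with the GpuAbstraction.h
// macros, a kernel launch is written once for both backends, e.g. (hypothetical
// kernel and buffer names)
//   gpuLaunchKernel( myKernel, m_gpublocks, m_gputhreads, in.data(), out.data() );
// in place of the CUDA-only triple-chevron form
//   myKernel<<<m_gpublocks, m_gputhreads>>>( in.data(), out.data() );
// while gpuLaunchKernelSharedMem inserts the dynamic shared-memory size as an extra
// fixed argument before the kernel arguments, matching the sharedMemSize usage in
// the MatrixElementKernels.cc hunk above.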
DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else From 1b5c0fdff6208b18ecb2e292571c5aea9f482a23 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 18:11:04 +0200 Subject: [PATCH 468/509] [jthip] backport to CODEGEN from ggttgg.mad --- .../template_files/cpp_model_parameters_h.inc | 14 ++- .../iolibs/template_files/gpu/CudaRuntime.h | 85 ------------------- .../iolibs/template_files/gpu/check_sa.cc | 59 ++++++++----- .../iolibs/template_files/gpu/mgOnGpuConfig.h | 73 ++++++++++------ .../cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h | 2 + .../cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h | 2 + 6 files changed, 96 insertions(+), 139 deletions(-) delete mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc index 0250c160ed..ef3d99d07c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc @@ -172,7 +172,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -194,9 +194,9 @@ namespace mg5amcCpu %(dcoupsetdcoup)s } %(eftspecial2)s - return out; - } -#ifdef __CUDACC__ + return out; + } +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -212,6 +212,12 @@ namespace mg5amcCpu //========================================================================== +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ #pragma GCC diagnostic push #ifndef __clang__ #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h deleted file mode 100644 index df0c3f3df8..0000000000 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
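// For orientation while reading the deletion that follows: the GpuRuntime.h
// replacement introduced earlier in this series keeps the same RAII shape as the
// CudaRuntime class removed here, but routes through the gpu* macros. A condensed
// sketch (not the verbatim header, and with the debug printouts omitted):
struct GpuRuntime final
{
  GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); }
  ~GpuRuntime() { tearDown( m_debug ); }
  static void setUp( const bool debug = true ) { checkGpu( gpuSetDevice( 0 ) ); }   // cudaSetDevice or hipSetDevice
  static void tearDown( const bool debug = true ) { checkGpu( gpuDeviceReset() ); } // still needed for leak checking
  bool m_debug;
};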
- -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef MGONGPUCPP_GPUIMPL -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! 
-    }
-
-    // Tear down CUDA application (call cudaDeviceReset)
-    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
-    // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck
-    // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
-    static void tearDown( const bool debug = true )
-    {
-      if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl;
-      checkCuda( cudaDeviceReset() );
-    }
-  };
-
-}
-#endif
-
-//--------------------------------------------------------------------------
-
-#endif // MG5AMC_CUDARUNTIME_H
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc
index 9a39220077..491dfc02e1 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc
@@ -12,6 +12,7 @@
 #include "BridgeKernels.h"
 #include "CPPProcess.h"
 #include "CrossSectionKernels.h"
+#include "GpuRuntime.h"
 #include "MatrixElementKernels.h"
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
@@ -102,12 +103,12 @@ main( int argc, char** argv )
     CurandHost = 1,
     CurandDevice = 2
   };
-#ifdef MGONGPUCPP_GPUIMPL
-  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU
+#ifdef __CUDACC__
+  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU
 #elif not defined MGONGPU_HAS_NO_CURAND
   RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
 #else
-  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU
 #endif
   // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
   enum class RamboSamplingMode
@@ -145,10 +146,10 @@ main( int argc, char** argv )
     }
     else if( arg == "--curdev" )
     {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
       rndgen = RandomNumberMode::CurandDevice;
 #else
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" );
 #endif
     }
     else if( arg == "--curhst" )
@@ -265,12 +266,12 @@ main( int argc, char** argv )
 #ifdef MGONGPUCPP_GPUIMPL
-  // --- 00. Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime GpuRuntime( debug );
 #endif

   // --- 0a.
Initialise physics process @@ -394,7 +395,7 @@ main( int argc, char** argv ) const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ else { const bool onDevice = true; @@ -403,7 +404,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -729,17 +730,21 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? -#ifdef MGONGPUCPP_GPUIMPL + // -- CUDA or HIP or C++? +#ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -754,7 +759,7 @@ main( int argc, char** argv ) wrkflwtxt += "???+"; // no path to this statement #endif // -- CUCOMPLEX or THRUST or STD complex numbers? -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; #elif defined MGONGPU_CUCXTYPE_THRUST @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -864,8 +875,10 @@ main( int argc, char** argv ) #endif // Dump all configuration parameters and all results std::cout << std::string( SEP79, '*' ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,14 +905,14 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... 
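// (Aside on the branches above: the MGONGPU_CUCXTYPE macros they test are chosen in
// mgOnGpuConfig.h, shown next, where CUDA builds may select THRUST, CUCOMPLEX or
// CXSMPL, HIP builds are restricted to CXSMPL, and C++ builds choose between
// STDCOMPLEX and CXSMPL; the report therefore needs one branch per option plus
// the "???" fallback.)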
#endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 5b04029787..1811de4699 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -6,21 +6,31 @@ #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 -#include "GpuRuntime.h" // Includes the GPU abstraction - // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) %(mgongpu_supports_multichannel)s +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -#ifdef MGONGPUCPP_CUDACC +#ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND -#elif defined MGONGPUCPP_HIPCC +#elif defined __HIPCC__ #define MGONGPU_HAS_NO_CURAND 1 +#else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 #endif @@ -55,23 +65,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef MGONGPUCPP_GPUIMPL -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) -#ifdef MGONGPUCPP_GPUIMPL +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +#ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation -#ifdef MGONGPUCPP_GPUIMPL -#undef MGONGPU_NSIGHT_DEBUG // default +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +#ifdef __CUDACC__ +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but 
color algebra #537) @@ -87,17 +102,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -146,7 +165,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef MGONGPUCPP_GPUIMPL // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -176,9 +195,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined MGONGPUCPP_GPUIMPL && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -190,7 +209,7 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ +// Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h index cacab1031a..ed3e219f8a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h @@ -69,6 +69,8 @@ #ifdef __CUDACC__ #undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating 
point precision for everything but color algebra #537) diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h index e1299ba81e..e91f5927d6 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h @@ -9,6 +9,8 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuFptypes.h" +#include "GpuAbstraction.h" + #include //========================================================================== From 0f1b8115d006f512a5081586916498f54fe1b90c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 18:25:34 +0200 Subject: [PATCH 469/509] [jthip] complete backport to CODEGEN from ggttgg.mad, including a few improvements --- .../gpu/CurandRandomNumberKernel.cc | 4 +-- .../template_files/gpu/GpuAbstraction.h | 27 +++++-------------- .../iolibs/template_files/gpu/MemoryBuffers.h | 1 + .../template_files/gpu/mgOnGpuVectors.h | 2 -- .../iolibs/template_files/gpu/process_cc.inc | 1 - .../gpu/process_function_definitions.inc | 2 -- .../template_files/gpu/process_matrix.inc | 2 -- .../CUDACPP_SA_OUTPUT/model_handling.py | 20 +++++++------- .../SubProcesses/P1_gg_ttxgg/check_sa.cc | 4 ++- 9 files changed, 24 insertions(+), 39 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc index 5b33207ad0..98ec214eaf 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef MGONGPUCPP_CUDACC +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef MGONGPUCPP_CUDACC +#ifdef __CUDACC__ if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 2f000e33d1..427c82c05d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -3,22 +3,10 @@ #include -#ifdef MGONGPUCPP_GPUIMPL -#define MGONGPUCPP_CUDACC 1 -#endif - -#ifdef __HIPCC__ -#include "hip/hip_runtime.h" -#define MGONGPUCPP_HIPCC 1 -#endif - -#ifdef MGONGPUCPP_CUDACC - -// Defines correct compiler -#define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL - //-------------------------------------------------------------------------- +#ifdef __CUDACC__ + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -44,12 +32,9 @@ //-------------------------------------------------------------------------- -#elif defined MGONGPUCPP_HIPCC - -// Defines correct compiler -#define MGONGPUCPP_GPUIMPL __HCC__ +#elif defined __HIPCC__ -//-------------------------------------------------------------------------- +#include "hip/hip_runtime.h" #define gpuError_t hipError_t 
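// (Illustrative usage sketch, not part of this header.) With the gpu* aliases
// defined in the CUDA and HIP branches of this file, application code is written
// once against the gpu* names and compiles unchanged under nvcc or hipcc.
// Assuming a hypothetical kernel 'void myKernel( double* devBuf )' launched via
// the gpuLaunchKernel macro below, error handling would read:
//   gpuError_t err = gpuPeekAtLastError();
//   if( err ) std::cout << gpuGetErrorString( err ) << std::endl;
// Each gpu* name expands to the corresponding cuda* or hip* runtime API call.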
#define gpuPeekAtLastError hipPeekAtLastError @@ -74,6 +59,8 @@ #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) #define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//-------------------------------------------------------------------------- + #endif -#endif // MG5AMC_GPUABSTRACTION_H \ No newline at end of file +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h index d6ba45dcad..522e6ce100 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h @@ -11,6 +11,7 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" +#include "GpuRuntime.h" #include "Parameters_%(model_name)s.h" #include diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h index de12c1d24f..fbfe68f6c1 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h @@ -9,8 +9,6 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuFptypes.h" -#include "GpuAbstraction.h" - #include //========================================================================== diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc index 9dceb45708..95400f42db 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -14,7 +14,6 @@ #include "mgOnGpuConfig.h" -#include "GpuRuntime.h" %(hel_amps_h)s #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index d4e999733f..aa8f899798 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -10,8 +10,6 @@ // Class member functions for calculating the matrix elements for %(process_lines)s -#include "GpuRuntime.h" - #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc index 241c50a9d1..3cfbf668ca 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -7,8 +7,6 @@ ! Further modified by: A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== -#include "GpuAbstraction.h" - // *** COLOR CHOICE BELOW *** // Store the leading color flows for choice of color if( jamp2_sv ) // disable color choice if nullptr diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 8bf85c5a55..abfd2428b6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -1092,7 +1092,7 @@ def get_process_function_definitions(self, write=True): %(len(coupling_indep), ' ), cxmake( m_pars->'.join(coupling_indep)) # AV only indep! replace_dict['cipcdevice'] = '__device__ __constant__ fptype cIPC[%i];'%(2*len(coupling_indep)) replace_dict['cipcstatic'] = 'static fptype cIPC[%i];'%(2*len(coupling_indep)) - replace_dict['cipc2tipcSym'] = 'checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) ) );'%len(coupling_indep) + replace_dict['cipc2tipcSym'] = 'gpuMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) );'%len(coupling_indep) replace_dict['cipc2tipc'] = 'memcpy( cIPC, tIPC, %i * sizeof( cxtype ) );'%len(coupling_indep) replace_dict['cipcdump'] = '\n //for ( i=0; i<%i; i++ ) std::cout << std::setprecision(17) << "tIPC[i] = " << tIPC[i] << std::endl;'%len(coupling_indep) coup_str_hrd = '__device__ const fptype cIPC[%s] = { ' % (len(coupling_indep)*2) @@ -1103,7 +1103,7 @@ def get_process_function_definitions(self, write=True): replace_dict['cipcassign'] = '//const cxtype tIPC[0] = { ... }; // nicoup=0' replace_dict['cipcdevice'] = '__device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0' replace_dict['cipcstatic'] = 'static fptype* cIPC = nullptr; // unused as nicoup=0' - replace_dict['cipc2tipcSym'] = '//checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) ) ); // nicoup=0'%len(coupling_indep) + replace_dict['cipc2tipcSym'] = '//gpuMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) ); // nicoup=0'%len(coupling_indep) replace_dict['cipc2tipc'] = '//memcpy( cIPC, tIPC, %i * sizeof( cxtype ) ); // nicoup=0'%len(coupling_indep) replace_dict['cipcdump'] = '' replace_dict['cipchrdcod'] = '__device__ const fptype* cIPC = nullptr; // unused as nicoup=0' @@ -1112,7 +1112,7 @@ def get_process_function_definitions(self, write=True): %(len(params), ', (fptype)m_pars->'.join(params)) replace_dict['cipddevice'] = '__device__ __constant__ fptype cIPD[%i];'%(len(params)) replace_dict['cipdstatic'] = 'static fptype cIPD[%i];'%(len(params)) - replace_dict['cipd2tipdSym'] = 'checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) ) );'%len(params) + replace_dict['cipd2tipdSym'] = 'gpuMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) );'%len(params) replace_dict['cipd2tipd'] = 'memcpy( cIPD, tIPD, %i * sizeof( fptype ) );'%len(params) replace_dict['cipddump'] = '\n //for ( i=0; i<%i; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl;'%len(params) param_str_hrd = '__device__ const fptype cIPD[%s] = { ' % len(params) @@ -1123,7 +1123,7 @@ def get_process_function_definitions(self, write=True): replace_dict['cipdassign'] = '//const fptype tIPD[0] = { ... 
}; // nparam=0' replace_dict['cipddevice'] = '//__device__ __constant__ fptype* cIPD = nullptr; // unused as nparam=0' replace_dict['cipdstatic'] = '//static fptype* cIPD = nullptr; // unused as nparam=0' - replace_dict['cipd2tipdSym'] = '//checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) ) ); // nparam=0'%len(params) + replace_dict['cipd2tipdSym'] = '//gpuMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) ); // nparam=0'%len(params) replace_dict['cipd2tipd'] = '//memcpy( cIPD, tIPD, %i * sizeof( fptype ) ); // nparam=0'%len(params) replace_dict['cipddump'] = '' replace_dict['cipdhrdcod'] = '//__device__ const fptype* cIPD = nullptr; // unused as nparam=0' @@ -1195,13 +1195,13 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1228,7 +1228,7 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( \"calculate_wavefunctions: ihel=%2d\\n\", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( \"calculate_wavefunctions: ievt00=%d\\n\", ievt00 ); #endif""") nwavefuncs = self.matrix_elements[0].get_number_of_wavefunctions() @@ -1265,7 +1265,7 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif""") ret_lines += helas_calls @@ -1665,8 +1665,10 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -1782,7 +1784,7 @@ def get_external(self, wf, argument): split_line2 = [ str.lstrip(' ').rstrip(' ') for str in split_line2] # AV split_line2.insert(2, '0') # add parameter fmass=0 line2 = ', '.join(split_line2) - text = '#if not( defined __CUDACC__ and defined MGONGPU_TEST_DIVERGENCE )\n %s\n#else\n if( ( blockDim.x * blockIdx.x + threadIdx.x ) %% 2 == 0 )\n %s\n else\n %s\n#endif\n' # AV + text = '#if not( defined MGONGPUCPP_GPUIMPL and defined MGONGPU_TEST_DIVERGENCE )\n %s\n#else\n if( ( blockDim.x * blockIdx.x + threadIdx.x ) %% 2 == 0 )\n %s\n else\n %s\n#endif\n' # AV return text % (line, line, line2) text = '%s\n' # AV return text % line diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc 
b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc index f5f08dc64e..ceed439cb0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc @@ -740,6 +740,8 @@ main( int argc, char** argv ) // -- CUDA or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -1040,7 +1042,7 @@ main( int argc, char** argv ) << "\"THRUST::COMPLEX\"," << std::endl #endif #else - << "\"STD::COMPLEX\"," << std::endl + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" From 71ff5e2e63d7ff1310f07604f32ea9980e424c46 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 19:06:32 +0200 Subject: [PATCH 470/509] [jthip] in CODEGEN, remove the copying to src of GpuRuntime.h and GpuAbstraction.h --- epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index a947f262b0..d97ab3b4de 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -86,7 +86,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): 'CMake': [s+'CMake/Compilers.txt', s+'CMake/Platforms.txt', s+'CMake/Macros.txt'], 'src': [s+'gpu/rambo.h', s+'read_slha.h', s+'read_slha.cc', s+'gpu/mgOnGpuFptypes.h', s+'gpu/mgOnGpuCxtypes.h', s+'gpu/mgOnGpuVectors.h', - s+'CMake/src/CMakeLists.txt', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h'], + s+'CMake/src/CMakeLists.txt' ], 'SubProcesses': [s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', s+'gpu/ompnumthreads.h', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h', s+'gpu/MemoryAccessHelpers.h', s+'gpu/MemoryAccessVectors.h', From a37fb41ac45b3d66c42436d68467521d4b1f6281 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 19:24:36 +0200 Subject: [PATCH 471/509] [jthip] In CODEGEN, acknowledge Joergen in each file and in COPYRIGHT/AUTHORS --- .../CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS | 2 +- .../madgraph/iolibs/template_files/COPYRIGHT | 1 + .../iolibs/template_files/cpp_model_parameters_cc.inc | 2 +- .../madgraph/iolibs/template_files/gpu/Bridge.h | 2 +- .../madgraph/iolibs/template_files/gpu/BridgeKernels.cc | 2 +- .../madgraph/iolibs/template_files/gpu/BridgeKernels.h | 2 +- .../iolibs/template_files/gpu/CommonRandomNumberKernel.cc | 2 +- .../iolibs/template_files/gpu/CrossSectionKernels.cc | 2 +- .../madgraph/iolibs/template_files/gpu/CrossSectionKernels.h | 2 +- .../iolibs/template_files/gpu/CurandRandomNumberKernel.cc | 2 +- .../madgraph/iolibs/template_files/gpu/EventStatistics.h | 2 +- .../madgraph/iolibs/template_files/gpu/GpuAbstraction.h | 5 +++++ .../madgraph/iolibs/template_files/gpu/GpuRuntime.h | 5 +++++ .../madgraph/iolibs/template_files/gpu/MadgraphTest.h | 2 +- .../iolibs/template_files/gpu/MatrixElementKernels.cc | 2 +- .../iolibs/template_files/gpu/MatrixElementKernels.h | 2 +- .../madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h | 2 +- .../madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h | 2 +- .../iolibs/template_files/gpu/MemoryAccessRandomNumbers.h | 2 +- .../madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h | 2 +- .../madgraph/iolibs/template_files/gpu/MemoryBuffers.h | 2 +- 
.../iolibs/template_files/gpu/RamboSamplingKernels.cc | 2 +- .../iolibs/template_files/gpu/RamboSamplingKernels.h | 2 +- .../madgraph/iolibs/template_files/gpu/RandomNumberKernels.h | 2 +- .../madgraph/iolibs/template_files/gpu/check_sa.cc | 2 +- .../madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc | 2 +- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 2 +- .../madgraph/iolibs/template_files/gpu/fbridge.cc | 2 +- .../madgraph/iolibs/template_files/gpu/fsampler.cc | 2 +- .../madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h | 2 +- .../madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h | 2 +- .../madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h | 2 +- .../madgraph/iolibs/template_files/gpu/process_cc.inc | 2 +- .../template_files/gpu/process_function_definitions.inc | 2 +- .../madgraph/iolibs/template_files/gpu/process_h.inc | 2 +- .../madgraph/iolibs/template_files/gpu/process_matrix.inc | 2 +- .../iolibs/template_files/gpu/process_sigmaKin_function.inc | 2 +- .../madgraph/iolibs/template_files/gpu/rambo.h | 2 +- .../madgraph/iolibs/template_files/gpu/runTest.cc | 2 +- .../madgraph/iolibs/template_files/gpu/testmisc.cc | 2 +- .../madgraph/iolibs/template_files/gpu/testxxx.cc | 2 +- .../CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py | 2 +- epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py | 2 +- 43 files changed, 51 insertions(+), 40 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS index 8541e954b9..0aeb2c8a87 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS @@ -10,6 +10,7 @@ generates includes the following authors: Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) @@ -28,5 +29,4 @@ acknowledged collaboration with the following collaborators: Taran Singhania (PES University Bangalore) David Smith (CERN) Carl Vuosalo (University of Wisconsin-Madison) - Joergen Teig (CERN) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc index 3c231bdbd6..05b664981d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h index bcdfe29154..89437b4c42 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc index 6034db93ec..eaf4037a24 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h index 7c7feb692a..3efef8ce97 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. 
Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc index f17b9c0ad7..010bc4cbd0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" #include "GpuAbstraction.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc index 36ca2a94d4..c15b39844d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h index ff2350a14d..4d9659e04e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc index 98ec214eaf..38c477c17a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "GpuRuntime.h" #include "MemoryBuffers.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h index e7d7f3b3c3..b425a5bade 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 427c82c05d..6a7d9c05c0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h index 895a662e52..93579ef08b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h index 3fa9f13a82..176338151a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. 
Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index a9e20e114f..d6d6c4f179 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h index 4477a385ed..72bd8f195b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h index 67306c3922..db73e4e064 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h index dc4bb2aa22..d3f5a15bd2 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h index 949a42066d..40cb089135 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h index a9ae26b6dc..08faccff0f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h index 522e6ce100..f29b8c5357 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc index 8745b084d3..79abbcc4f8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h index fe63a7bb77..7c214cd74b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h index 0c215f2583..21d63beeac 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index 491dfc02e1..734c2f83f8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. 
Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc index 594fb770c5..b9840f1374 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for %(output_name)s by %(info_lines)s diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index f024f15ce7..699ce2c4e0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc index 592a8c74bb..22ce3f5115 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "Bridge.h" #include "CPPProcess.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc index acffa7c19e..3743934f41 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 1811de4699..35ad042b75 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h index b5e1f1a495..687d449117 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h index d9a955c235..83a46c1d4e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc index 95400f42db..815fd8d5b7 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index aa8f899798..be10dba1de 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -4,7 +4,7 @@ ! Copyright (C) 2020-2023 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +! Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== //========================================================================== // Class member functions for calculating the matrix elements for diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc index 8a9de336f2..2c3adf57e2 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc index 3cfbf668ca..960f029d8d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -4,7 +4,7 @@ ! Copyright (C) 2020-2023 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +! Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== // *** COLOR CHOICE BELOW *** diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index 59c1623c5a..b84a96d6ec 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -4,7 +4,7 @@ ! Copyright (C) 2020-2023 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +! Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== #include "GpuAbstraction.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h index 3a331b979a..cd7e1008ea 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc index 6f20a7248a..7f8d6ffd12 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc index 5d00e2c06c..8f3480c45f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc index 6f8736c120..ffaf9ad005 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index abfd2428b6..1a38085af9 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. import os diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index d97ab3b4de..585d065c39 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2021-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, A. Valassi, J. Teig, Z. Wettersten (2021-2023) for the MG5aMC CUDACPP plugin. import os From 428aa50b5d76514535c638b6aad2dbe56756396b Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 20 Jul 2023 14:41:55 +0200 Subject: [PATCH 472/509] [CODEGEN] Added HIP runtime include in mgOnGpuConfig.h in codegen --- .../madgraph/iolibs/template_files/gpu/GpuAbstraction.h | 2 -- .../madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 35ad042b75..8da9429de8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif From 24fbbb6067e21cbbcc057ecb7d5b11f6b89a8922 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 25 Jul 2023 16:59:59 +0200 Subject: [PATCH 
473/509] [jthip/namespace] backport latest changes from ggttgg.mad to CODEGEN --- .../iolibs/template_files/cpp_model_parameters_cc.inc | 2 +- .../iolibs/template_files/cpp_model_parameters_h.inc | 6 +++--- .../iolibs/template_files/gpu/CurandRandomNumberKernel.cc | 4 ++-- .../iolibs/template_files/gpu/MemoryAccessAmplitudes.h | 2 +- .../iolibs/template_files/gpu/MemoryAccessCouplings.h | 2 +- .../template_files/gpu/MemoryAccessCouplingsFixed.h | 2 +- .../iolibs/template_files/gpu/MemoryAccessDenominators.h | 2 +- .../madgraph/iolibs/template_files/gpu/MemoryAccessGs.h | 2 +- .../template_files/gpu/MemoryAccessMatrixElements.h | 2 +- .../iolibs/template_files/gpu/MemoryAccessNumerators.h | 2 +- .../iolibs/template_files/gpu/MemoryAccessWavefunctions.h | 2 +- .../madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h | 8 ++++---- .../madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h | 2 +- .../madgraph/iolibs/template_files/gpu/testmisc.cc | 4 ++-- .../madgraph/iolibs/template_files/gpu/testxxx.cc | 6 +++--- 15 files changed, 24 insertions(+), 24 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc index 05b664981d..54ce4c64cf 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc @@ -15,7 +15,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc index ef3d99d07c..5ab7aa7abd 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc @@ -25,7 +25,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -85,7 +85,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -155,7 +155,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc index 38c477c17a..08a16f6f2c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc +++ 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h index 1afc589b11..b4b76f3842 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types 
which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h 
index 687d449117..6ae0c42ecb 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -209,7 +209,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -249,7 +249,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -627,7 +627,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h index fbfe68f6c1..cdae04326b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc index 8f3480c45f..ba9e59a8a3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc index ffaf9ad005..786cf10171 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc +++ 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; From 10df7037f3552bf533521cfb819907a07d584b57 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 25 Jul 2023 17:06:31 +0200 Subject: [PATCH 474/509] [jthip] in CODEGEN, backport also cudacpp_src.mk using GPUCC instead of NVCC --- .../iolibs/template_files/gpu/cudacpp_src.mk | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk index dac2e47d1d..f3a26552db 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk @@ -38,13 +38,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -246,20 +246,20 @@ $(BUILDDIR)/%%.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%%_cu.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_%(model)s.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_%(model)s_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
From 43e0c646fe37112ee40e0c6fac196f4d1414dce4 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Thu, 10 Aug 2023 15:10:26 +0200
Subject: [PATCH 475/509] [CODEGEN] Added changes from gg_ttgg.mad to code generator
---
 .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 2 ++
 .../madgraph/iolibs/template_files/gpu/cudacpp_src.mk | 11 +++++++++--
 .../iolibs/template_files/gpu/mgOnGpuCxtypes.h | 2 +-
 3 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
index 699ce2c4e0..9fb389be2c 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
@@ -216,6 +216,8 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc)
 CUBUILDRULEFLAGS = -fPIC -c
 CCBUILDRULEFLAGS = -fPIC -c
+ export HIPARCHFLAGS
+
 else ifneq ($(origin REQUIRE_HIP),undefined)
 # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443)
 $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk
index f3a26552db..d28c92ec13 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk
@@ -19,7 +19,7 @@ SHELL := /bin/bash
 #=== Configure common compiler flags for CUDA and C++
 INCFLAGS = -I.
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
 #-------------------------------------------------------------------------------
@@ -85,6 +85,13 @@ endif
 ###$(info OMPFLAGS=$(OMPFLAGS))
 CXXFLAGS += $(OMPFLAGS)
+# Add the correct -DHIP_PLATFORM flag when compiling for HIP
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c
+endif
+
 # Set the build flags appropriate to each AVX choice (example: "make AVX=none")
 # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
 # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -246,7 +253,7 @@ $(BUILDDIR)/%%.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG)
 # Generic target and build rules: objects from CUDA compilation
 $(BUILDDIR)/%%_cu.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG)
 	@if [ !
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h index 6ae0c42ecb..46d9f02733 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } From e99a2b87773025a9098e8ad3933cf408f01cedf2 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 10 Aug 2023 15:44:56 +0200 Subject: [PATCH 476/509] [CODEGEN] Added export of GPUCC and GPUFLAGS to codegen --- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 9fb389be2c..e1d691f1d6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -237,7 +237,9 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) endif - +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds From 4adb62fe5a480b7d5aec864e0fea21e466b1e76e Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 10 Aug 2023 17:20:16 +0200 Subject: [PATCH 477/509] Fixed warning and changed HIPARCHFLAGS export so it exports to cudacpp_src.mk --- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 10 +++++----- epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index e1d691f1d6..b6703137aa 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -179,11 +179,11 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) #=== Configure the HIP compiler - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled + $(warning HIP builds are not supported for multi-word CXX "$(CXX)") + override HIP_HOME=disabled 
endif # If HIP_HOME is not set, try to set it from the location of GPUCC @@ -216,8 +216,6 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c - export HIPARCHFLAGS - else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) @@ -240,6 +238,8 @@ endif export GPUCC export GPUFLAGS +export HIPARCHFLAGS + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc index d4a760a71b..904cb78a72 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; From e18c882d7612f24cf65aa1a63bf02628f55de2ab Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 10 Aug 2023 17:43:28 +0200 Subject: [PATCH 478/509] [CODEGEN] Fixed error in runTest.cc and reverted changes in cudacpp_src.mk and cudacpp.mk --- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 2 -- .../madgraph/iolibs/template_files/gpu/cudacpp_src.mk | 2 +- .../madgraph/iolibs/template_files/gpu/runTest.cc | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index b6703137aa..14ea0f52d6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -238,8 +238,6 @@ endif export GPUCC export GPUFLAGS -export HIPARCHFLAGS - #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk index d28c92ec13..7eda8524c0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk @@ -89,7 +89,7 @@ CXXFLAGS += $(OMPFLAGS) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c + GPUFLAGS += -fPIC -c endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc index 7f8d6ffd12..de327f2321 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc @@ -128,7 +128,7 @@ 
struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; From 597de7394d2ce496d43c43bb1fef7dd310582f6c Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 13 Jul 2023 15:15:41 +0200 Subject: [PATCH 479/509] [CODEGEN] Added GPU abstraction to CODEGEN --- .../iolibs/template_files/gpu/check_sa.cc | 20 +++++++++-------- .../iolibs/template_files/gpu/cudacpp.mk | 22 +++++++++---------- .../iolibs/template_files/gpu/mgOnGpuConfig.h | 4 +++- .../template_files/gpu/mgOnGpuVectors.h | 2 ++ .../template_files/gpu/process_matrix.inc | 2 ++ .../PLUGIN/CUDACPP_SA_OUTPUT/output.py | 2 +- 6 files changed, 30 insertions(+), 22 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index 734c2f83f8..611db19653 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -103,8 +103,8 @@ main( int argc, char** argv ) CurandHost = 1, CurandDevice = 2 }; -#ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU +#ifdef MGONGPUCPP_GPUIMPL + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else @@ -146,7 +146,7 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rndgen = RandomNumberMode::CurandDevice; #else throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); @@ -395,7 +395,7 @@ main( int argc, char** argv ) const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL else { const bool onDevice = true; @@ -730,7 +730,7 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ rndgentxt += " (HIP code)"; @@ -740,8 +740,8 @@ main( int argc, char** argv ) // Workflow description summary std::string wrkflwtxt; - // -- CUDA or HIP or C++? -#ifdef __CUDACC__ + // -- CUDA or C++? +#ifdef MGONGPUCPP_GPUIMPL wrkflwtxt += "CUD:"; #elif defined __HIPCC__ wrkflwtxt += "HIP:"; @@ -759,7 +759,7 @@ main( int argc, char** argv ) wrkflwtxt += "???+"; // no path to this statement #endif // -- CUCOMPLEX or THRUST or STD complex numbers? 
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; #elif defined MGONGPU_CUCXTYPE_THRUST @@ -875,7 +875,7 @@ main( int argc, char** argv ) #endif // Dump all configuration parameters and all results std::cout << std::string( SEP79, '*' ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" #elif defined __HIPCC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" @@ -905,6 +905,7 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST @@ -1046,6 +1047,7 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 401868a61c..965c0e36bf 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -262,11 +262,11 @@ endif # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) - CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3%% both for none and sse4 + CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3% both for none and sse4 # Throughput references without the extra flags below: none=1.41-1.42E6, sse4=2.15-2.19E6 ###CXXFLAGS+= -DNO_WARN_X86_INTRINSICS # no change ###CXXFLAGS+= -fpeel-loops # no change - ###CXXFLAGS+= -funroll-loops # gains ~1%% for none, loses ~1%% for sse4 + ###CXXFLAGS+= -funroll-loops # gains ~1% for none, loses ~1% for sse4 ###CXXFLAGS+= -ftree-vectorize # no change ###CXXFLAGS+= -flto # would increase to none=4.08-4.12E6, sse4=4.99-5.03E6! else @@ -547,7 +547,7 @@ $(BUILDDIR)/%%.o : %%.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ -$(BUILDDIR)/%%_cu.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif @@ -555,7 +555,7 @@ endif # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%%.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ @@ -644,7 +644,7 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): Fortran include files -###$(INCDIR)/%%.inc : ../%%.inc +###$(INCDIR)/%.inc : ../%.inc ### @if [ ! 
-d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### \cp $< $@ @@ -660,7 +660,7 @@ ifneq ($(shell $(CXX) --version | grep ^Intel),) $(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') $(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(cu_main): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o @@ -670,12 +670,12 @@ endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%%.o : %%.f *.inc +$(BUILDDIR)/%.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%%.o : %%.f *.inc +###$(BUILDDIR)/%.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. -I$(INCDIR) -c $< -o $@ @@ -744,7 +744,7 @@ ifneq ($(shell $(CXX) --version | grep ^Intel),) $(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') $(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(testmain): $(BUILDDIR)/runTest_cu.o $(testmain): cu_objects_exe += $(BUILDDIR)/runTest_cu.o @@ -928,14 +928,14 @@ cmpFcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: cmpFGcheck (compare ME results from the CUDA and Fortran with CUDA MEs standalone executables, with a small number of events) cmpFGcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) memcheck: all.$(TAG) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 8da9429de8..3fa1fff9a3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -6,6 +6,8 @@ #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 +#include "GpuRuntime.h" // Includes the GPU abstraction + // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) %(mgongpu_supports_multichannel)s @@ -27,7 +29,7 @@ // For CUDA, by default, it is supported // For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_CUDACC #undef MGONGPU_HAS_NO_CURAND #elif defined __HIPCC__ #define MGONGPU_HAS_NO_CURAND 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h index cdae04326b..dd8b83752d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h @@ -9,6 +9,8 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuFptypes.h" +#include "GpuAbstraction.h" + #include //========================================================================== diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc index 960f029d8d..84e324a679 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -7,6 +7,8 @@ ! Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== +#include "GpuAbstraction.h" + // *** COLOR CHOICE BELOW *** // Store the leading color flows for choice of color if( jamp2_sv ) // disable color choice if nullptr diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index ed7b1985dd..e08746a1b6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -88,7 +88,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): 'CMake': [s+'CMake/Compilers.txt', s+'CMake/Platforms.txt', s+'CMake/Macros.txt'], 'src': [s+'gpu/rambo.h', s+'read_slha.h', s+'read_slha.cc', s+'gpu/mgOnGpuFptypes.h', s+'gpu/mgOnGpuCxtypes.h', s+'gpu/mgOnGpuVectors.h', - s+'CMake/src/CMakeLists.txt' ], + s+'CMake/src/CMakeLists.txt', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h'], 'SubProcesses': [s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', s+'gpu/ompnumthreads.h', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h', s+'gpu/MemoryAccessHelpers.h', s+'gpu/MemoryAccessVectors.h', From 1a6496ab3b9bdb5bbc03b3790b2ed06f23749e30 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Fri, 29 Sep 2023 16:25:12 +0200 Subject: [PATCH 480/509] Updated first name in Author list --- .../CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS index 0aeb2c8a87..71519d1ad8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS @@ -10,7 +10,7 @@ generates includes the following authors: Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) 
From d2e2f47a303a9b1c25805d96d04beb4f07b57575 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 10:16:55 +0100 Subject: [PATCH 481/509] [jt774] (before merging upstream/master) improve logic of "if CUDA else HIP else neither" in CODEGEN cudacpp.mk --- .../iolibs/template_files/gpu/cudacpp.mk | 228 +++++++++--------- 1 file changed, 108 insertions(+), 120 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 965c0e36bf..2864673ead 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -30,7 +30,7 @@ UNAME_P := $(shell uname -p) include ../../Source/make_opts #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here @@ -104,69 +104,73 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If CUDA_HOME is not set, try to set it from the location of NVCC - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %%bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif - - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
- MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - - CUDATESTFLAGS = -lcuda - - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
+
+# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505)
+# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below
+ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
+  $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)")
+  override CUDA_HOME=disabled
+  override HIP_HOME=disabled
+endif
+
+# If CUDA_HOME is not set, try to set it from the path to nvcc
+ifndef CUDA_HOME
+  CUDA_HOME = $(patsubst %%bin/nvcc,%%,$(shell which nvcc 2>/dev/null))
+  $(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
+endif
+
+# If HIP_HOME is not set, try to set it from the path to hipcc
+ifndef HIP_HOME
+  HIP_HOME = $(patsubst %%bin/hipcc,%%,$(shell which hipcc 2>/dev/null))
+  $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+endif
+
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP),
+# builds are performed for HIP only if CUDA is not found in the path.
+# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME.
+# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+#--- Option 1: CUDA exists -> use CUDA
+
+# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists
+ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
+
+  GPUCC = $(CUDA_HOME)/bin/nvcc
+  USE_NVTX ?=-DUSE_NVTX
+  # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
+  # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+  # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster).
+  # Embed device code for 70, and PTX for 70+.
+  # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533).
+  # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity).
+  MADGRAPH_CUDA_ARCHITECTURE ?= 70
+  ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533
+  ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533
+  comma:=,
+  CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch))
+  CUINC = -I$(CUDA_HOME)/include/
+  CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
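+  # Illustrative example of the comma-separated list above: "MADGRAPH_CUDA_ARCHITECTURE=70,80 make"
+  # embeds device code for both V100 (sm_70) and A100 (sm_80), since the foreach in
+  # CUARCHFLAGS emits one -gencode pair per listed architecture.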
+ CUOPTFLAGS = -lineinfo + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -177,71 +181,55 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) GPUFLAGS += -allow-unsupported-compiler endif -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler +else ifneq ($(origin REQUIRE_CUDA),undefined) - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning HIP builds are not supported for multi-word CXX "$(CXX)") - override HIP_HOME=disabled - endif + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %%bin/hipcc,%%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif +#--- Option 2: CUDA does not exist, HIP exists -> use HIP - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. 
for CI tests on GPU #443)
+  $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
+
+#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP
+
+else
+
+  # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++
+  $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
+  $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
+  override GPUCC=
+  override USE_NVTX=
+  override CUINC=
+  override CURANDLIBFLAGS=
 
 endif
 
+# Export GPUCC (so that it can also be used in cudacpp_src.mk?)
 export GPUCC
 export GPUFLAGS
 
 #-------------------------------------------------------------------------------
 
-#=== Configure ccache for C++ and CUDA builds
+#=== Configure ccache for C++ and CUDA/HIP builds
 
 # Enable ccache if USECCACHE=1
 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -258,7 +246,7 @@ endif
 
 #-------------------------------------------------------------------------------
 
-#=== Configure PowerPC-specific compiler flags for C++ and CUDA
+#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP
 
 # PowerPC-specific CXX compiler flags (being reviewed)
 ifeq ($(UNAME_P),ppc64le)
@@ -274,7 +262,7 @@ else
   ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto)
 endif
 
-# PowerPC-specific CUDA compiler flags (to be reviewed!)
+# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
 ifeq ($(UNAME_P),ppc64le)
   GPUFLAGS+= -Xcompiler -mno-float128
 endif
@@ -360,7 +348,7 @@ export OMPFLAGS
 
 #-------------------------------------------------------------------------------
 
-#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN
+#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN
 
 # Set the build flags appropriate to OMPFLAGS
 $(info OMPFLAGS=$(OMPFLAGS))

From 8e9120cc2ac6f7bc12bf5a8b9a3caec6e0311f93 Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Wed, 24 Jan 2024 10:18:06 +0100
Subject: [PATCH 482/509] [jt774] (before merging upstream/master) remove CODEGEN #cudacpp.mk#

---
 .../iolibs/template_files/gpu/#cudacpp.mk# | 867 ------------------
 1 file changed, 867 deletions(-)
 delete mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/#cudacpp.mk#

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/#cudacpp.mk# b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/#cudacpp.mk#
deleted file mode 100644
index e238257ab6..0000000000
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/#cudacpp.mk#
+++ /dev/null
@@ -1,867 +0,0 @@
-# Copyright (C) 2020-2023 CERN and UCLouvain.
-# Licensed under the GNU Lesser General Public License (version 3 or later).
-# Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
-
-#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
-#=== NB: different names (e.g.
cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories - -CUDACPP_MAKEFILE = $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) -CUDACPP_SRC_MAKEFILE = cudacpp_src.mk - -#------------------------------------------------------------------------------- - -#=== Use bash in the Makefile (https://www.gnu.org/software/make/manual/html_node/Choosing-the-Shell.html) - -SHELL := /bin/bash - -#------------------------------------------------------------------------------- - -#=== Detect O/S and architecture (assuming uname is available, https://en.wikipedia.org/wiki/Uname) - -# Detect O/S kernel (Linux, Darwin...) -UNAME_S := $(shell uname -s) -###$(info UNAME_S='$(UNAME_S)') - -# Detect architecture (x86_64, ppc64le...) -UNAME_P := $(shell uname -p) -###$(info UNAME_P='$(UNAME_P)') - -include ../../Source/make_opts -#------------------------------------------------------------------------------- - -#=== Configure common compiler flags for C++ and CUDA - -INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) -override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) -override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else -override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) -TESTDIR = -else ifneq ($(LOCALGTEST),) -TESTDIR=$(TESTDIRLOCAL) -GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) -TESTDIR = $(TESTDIRCOMMON) -GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else -TESTDIR = -endif -ifneq ($(GTEST_ROOT),) -GTESTLIBDIR = $(GTEST_ROOT)/lib64/ -GTESTLIBS = $(GTESTLIBDIR)/libgtest.a $(GTESTLIBDIR)/libgtest_main.a -GTESTINC = -I$(GTEST_ROOT)/include -else -GTESTLIBDIR = -GTESTLIBS = -GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - -#=== Configure the C++ compiler - -CXXFLAGS = $(OPTFLAGS) -std=c++17 $(INCFLAGS) -Wall -Wshadow -Wextra -ifeq ($(shell $(CXX) --version | grep ^nvc++),) -CXXFLAGS += -ffast-math # see issue #117 -endif -###CXXFLAGS+= -Ofast # performance is not different from --fast-math -###CXXFLAGS+= -g # FOR DEBUGGING ONLY - -# Optionally add debug 
flags to display the full list of flags (eg on Darwin) -###CXXFLAGS+= -v - -# Note: AR, CXX and FC are implicitly defined if not set externally -# See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html - -#------------------------------------------------------------------------------- - -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %%bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. 
for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler -endif - -#------------------------------------------------------------------------------- - -#=== Configure ccache for C++ and CUDA builds - -# Enable ccache if USECCACHE=1 -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) -# override AR:=ccache $(AR) -#endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) - endif -endif - -#------------------------------------------------------------------------------- - -#=== Configure PowerPC-specific compiler flags for C++ and CUDA - -# PowerPC-specific CXX compiler flags (being reviewed) -ifeq ($(UNAME_P),ppc64le) - CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3%% both for none and sse4 - # Throughput references without the extra flags below: none=1.41-1.42E6, sse4=2.15-2.19E6 - ###CXXFLAGS+= -DNO_WARN_X86_INTRINSICS # no change - ###CXXFLAGS+= -fpeel-loops # no change - ###CXXFLAGS+= -funroll-loops # gains ~1%% for none, loses ~1%% for sse4 - ###CXXFLAGS+= -ftree-vectorize # no change - ###CXXFLAGS+= -flto # would increase to none=4.08-4.12E6, sse4=4.99-5.03E6! -else - ###CXXFLAGS+= -flto # also on Intel this would increase throughputs by a factor 2 to 4... - ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) -endif - -# PowerPC-specific CUDA compiler flags (to be reviewed!) 
-ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 -endif - -#------------------------------------------------------------------------------- - -#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN - -# Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) -else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) -override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) -else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) -override OMPFLAGS = -fopenmp # disable OpenMP MT on Apple clang (builds fail in the CI #578) -else -override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT (default before #575) -endif - -# Set the default AVX (vectorization) choice -ifeq ($(AVX),) - ifeq ($(UNAME_P),ppc64le) - ###override AVX = none - override AVX = sse4 - else ifeq ($(UNAME_P),arm) - ###override AVX = none - override AVX = sse4 - else ifeq ($(wildcard /proc/cpuinfo),) - override AVX = none - $(warning Using AVX='$(AVX)' because host SIMD features cannot be read from /proc/cpuinfo) - else ifeq ($(shell grep -m1 -c avx512vl /proc/cpuinfo)$(shell $(CXX) --version | grep ^clang),1) - override AVX = 512y - ###$(info Using AVX='$(AVX)' as no user input exists) - else - override AVX = avx2 - ifneq ($(shell grep -m1 -c avx512vl /proc/cpuinfo),1) - $(warning Using AVX='$(AVX)' because host does not support avx512vl) - else - $(warning Using AVX='$(AVX)' because this is faster than avx512vl for clang) - endif - endif -else - ###$(info Using AVX='$(AVX)' according to user input) -endif - -# Set the default FPTYPE (floating point type) choice -ifeq ($(FPTYPE),) - override FPTYPE = d -endif - -# Set the default HELINL (inline helicities?) choice -ifeq ($(HELINL),) - override HELINL = 0 -endif - -# Set the default HRDCOD (hardcode cIPD physics parameters?) 
choice -ifeq ($(HRDCOD),) - override HRDCOD = 0 -endif - -# Set the default RNDGEN (random number generator) choice -ifeq ($(RNDGEN),) - ifeq ($(NVCC),) - override RNDGEN = hasNoCurand - else ifeq ($(RNDGEN),) - override RNDGEN = hasCurand - endif -endif - -# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too -export AVX -export FPTYPE -export HELINL -export HRDCOD -export RNDGEN -export OMPFLAGS - -#------------------------------------------------------------------------------- - -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN - -# Set the build flags appropriate to OMPFLAGS -$(info OMPFLAGS=$(OMPFLAGS)) -CXXFLAGS += $(OMPFLAGS) - -# Set the build flags appropriate to each AVX choice (example: "make AVX=none") -# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] -# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] -$(info AVX=$(AVX)) -ifeq ($(UNAME_P),ppc64le) - ifeq ($(AVX),sse4) - override AVXFLAGS = -D__SSE4_2__ # Power9 VSX with 128 width (VSR registers) - else ifneq ($(AVX),none) - $(error Unknown AVX='$(AVX)': only 'none' and 'sse4' are supported on PowerPC for the moment) - endif -else ifeq ($(UNAME_P),arm) - ifeq ($(AVX),sse4) - override AVXFLAGS = -D__SSE4_2__ # ARM NEON with 128 width (Q/quadword registers) - else ifneq ($(AVX),none) - $(error Unknown AVX='$(AVX)': only 'none' and 'sse4' are supported on ARM for the moment) - endif -else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 - ifeq ($(AVX),none) - override AVXFLAGS = -mno-sse3 # no SIMD - else ifeq ($(AVX),sse4) - override AVXFLAGS = -mno-avx # SSE4.2 with 128 width (xmm registers) - else ifeq ($(AVX),avx2) - override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang] - else ifeq ($(AVX),512y) - override AVXFLAGS = -march=skylake -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] - else ifeq ($(AVX),512z) - override AVXFLAGS = -march=skylake -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else - $(error Unknown AVX='$(AVX)': only 'none', 'sse4', 'avx2', '512y' and '512z' are supported) - endif -else - ifeq ($(AVX),none) - override AVXFLAGS = -march=x86-64 # no SIMD (see #588) - else ifeq ($(AVX),sse4) - override AVXFLAGS = -march=nehalem # SSE4.2 with 128 width (xmm registers) - else ifeq ($(AVX),avx2) - override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang] - else ifeq ($(AVX),512y) - override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] - else ifeq ($(AVX),512z) - override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else - $(error Unknown AVX='$(AVX)': only 'none', 'sse4', 'avx2', '512y' and '512z' are supported) - endif -endif -# For the moment, use AVXFLAGS everywhere: eventually, use them only in encapsulated implementations? 
-CXXFLAGS+= $(AVXFLAGS) - -# Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") -$(info FPTYPE=$(FPTYPE)) -ifeq ($(FPTYPE),d) - CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE -else ifeq ($(FPTYPE),f) - CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT -else ifeq ($(FPTYPE),m) - CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT -else - $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) -endif - -# Set the build flags appropriate to each HELINL choice (example: "make HELINL=1") -$(info HELINL=$(HELINL)) -ifeq ($(HELINL),1) - CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS -else ifneq ($(HELINL),0) - $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) -endif - -# Set the build flags appropriate to each HRDCOD choice (example: "make HRDCOD=1") -$(info HRDCOD=$(HRDCOD)) -ifeq ($(HRDCOD),1) - CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM -else ifneq ($(HRDCOD),0) - $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) -endif - -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - override CXXFLAGSCURAND = -DMGONGPU_HAS_NO_CURAND -else ifeq ($(RNDGEN),hasCurand) - override CXXFLAGSCURAND = -else - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) -endif - -#------------------------------------------------------------------------------- - -#=== Configure build directories and build lockfiles === - -# Build directory "short" tag (defines target and path to the optional build directory) -# (Rationale: keep directory names shorter, e.g. do not include random number generator choice) -override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) - -# Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) -# (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) - -# Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 -ifeq ($(USEBUILDDIR),1) - override BUILDDIR = build.$(DIRTAG) - override LIBDIR = ../../lib/$(BUILDDIR) - override LIBDIRRPATH = '$$ORIGIN/../$(LIBDIR)' - $(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR is set = 1)) -else - override BUILDDIR = . 
- override LIBDIR = ../../lib - override LIBDIRRPATH = '$$ORIGIN/$(LIBDIR)' - $(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR is not set)) -endif -###override INCDIR = ../../include -###$(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG)) - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables or shared libraries ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override CXXLIBFLAGSRPATH = - override CULIBFLAGSRPATH = - override CXXLIBFLAGSRPATH2 = - override CULIBFLAGSRPATH2 = -else - # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) - # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' -endif - -# Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) -override RUNTIME = - -#=============================================================================== -#=== Makefile TARGETS and build rules below -#=============================================================================== - -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe - -ifneq ($(NVCC),) -cu_main=$(BUILDDIR)/gcheck.exe -fcu_main=$(BUILDDIR)/fgcheck.exe -else -cu_main= -fcu_main= -endif - -testmain=$(BUILDDIR)/runTest.exe - -ifneq ($(GTESTLIBS),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_main) $(cxx_main) $(fcu_main) $(fcxx_main) $(testmain) -else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_main) $(cxx_main) $(fcu_main) $(fcxx_main) -endif - -# Target (and build options): debug -MAKEDEBUG= -debug: OPTFLAGS = -g -O0 -debug: CUOPTFLAGS = -G -debug: MAKEDEBUG := debug -debug: all.$(TAG) - -# Target: tag-specific build lockfiles -override oldtagsb=`if [ -d $(BUILDDIR) ]; then find $(BUILDDIR) -maxdepth 1 -name '.build.*' ! -name '.build.$(TAG)' -exec echo $(shell pwd)/{} \; ; fi` -$(BUILDDIR)/.build.$(TAG): - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi - @touch $(BUILDDIR)/.build.$(TAG) - -# Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) -$(BUILDDIR)/%%.o : %%.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ - -$(BUILDDIR)/%%_cu.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ -endif - -# Generic target and build rules: objects from C++ compilation -# (NB do not include CUINC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%%.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ - -# Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) -ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math -endif -endif - -# Apply special build flags only to check_sa.o and gcheck_sa.o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) -$(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) - -# Apply special build flags only to check_sa and CurandRandomNumberKernel (curand headers, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) -ifeq ($(RNDGEN),hasCurand) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) -endif - -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) -ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins -endif -endif - -# Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) -# This patch does remove the warning, but I prefer to keep it disabled for the moment... -###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option -###endif -###endif - -#### Apply special build flags only to CPPProcess.cc (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto - -#### Apply special build flags only to CPPProcess.cc (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) - -#------------------------------------------------------------------------------- - -# Target (and build rules): common (src) library -commonlib : $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so - -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so: ../../src/*.h ../../src/*.cc $(BUILDDIR)/.build.$(TAG) - $(MAKE) -C ../../src $(MAKEDEBUG) -f $(CUDACPP_SRC_MAKEFILE) - -#------------------------------------------------------------------------------- - -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -###$(info processid_short=$(processid_short)) - -MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o - -ifneq ($(NVCC),) -MG5AMC_CULIB = mg5amc_$(processid_short)_cuda -cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o -cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o -endif - -# Target (and build rules): C++ and CUDA shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) - $(CXX) -shared -o $@ 
$(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) ) - -ifneq ($(NVCC),) -$(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o -$(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o -$(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -endif - -#------------------------------------------------------------------------------- - -# Target (and build rules): Fortran include files -###$(INCDIR)/%%.inc : ../%%.inc -### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi -### \cp $< $@ - -#------------------------------------------------------------------------------- - -# Target (and build rules): C++ and CUDA standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) - -ifneq ($(NVCC),) -ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') -else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(cu_main): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc -endif -$(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) -endif - -#------------------------------------------------------------------------------- - -# Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%%.o : %%.f *.inc - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(FC) -I. -c $< -o $@ - -# Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%%.o : %%.f *.inc -### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi -### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi -### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ - -# Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc - -ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 -endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) - -ifneq ($(NVCC),) -ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') -endif -ifeq ($(UNAME_S),Darwin) -$(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 -endif -$(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -endif - -#------------------------------------------------------------------------------- - -# Target (and build rules): test objects and test executable -$(BUILDDIR)/testxxx.o: $(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions - -ifneq ($(NVCC),) -$(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) -$(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_cu.o -$(testmain): cu_objects_exe += $(BUILDDIR)/testxxx_cu.o # Comment out this line to skip the CUDA test of xxx functions -endif - -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests - -ifneq ($(NVCC),) -$(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_cu.o -$(testmain): cu_objects_exe += $(BUILDDIR)/testmisc_cu.o # Comment out this line to skip the CUDA miscellaneous tests -endif - -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o - -ifneq ($(NVCC),) -$(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) -$(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) -ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') -else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc -endif -$(testmain): $(BUILDDIR)/runTest_cu.o 
-$(testmain): cu_objects_exe += $(BUILDDIR)/runTest_cu.o -endif - -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -lgtest_main - -ifneq ($(OMPFLAGS),) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) -else -$(testmain): LIBFLAGS += -lgomp -endif -endif - -ifeq ($(NVCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link both runTest.o and runTest_cu.o -$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda -endif - -# Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 -$(GTESTLIBS): -ifneq ($(shell which flock 2>/dev/null),) - flock $(BUILDDIR)/.make_test.lock $(MAKE) -C $(TESTDIR) -else - $(MAKE) -C $(TESTDIR) -endif - -#------------------------------------------------------------------------------- - -# Target: build all targets in all AVX modes (each AVX mode in a separate build directory) -# Split the avxall target into five separate targets to allow parallel 'make -j avxall' builds -# (Hack: add a fbridge.inc dependency to avxall, to ensure it is only copied once for all AVX modes) -avxnone: - @echo - $(MAKE) USEBUILDDIR=1 AVX=none -f $(CUDACPP_MAKEFILE) - -avxsse4: - @echo - $(MAKE) USEBUILDDIR=1 AVX=sse4 -f $(CUDACPP_MAKEFILE) - -avxavx2: - @echo - $(MAKE) USEBUILDDIR=1 AVX=avx2 -f $(CUDACPP_MAKEFILE) - -avx512y: - @echo - $(MAKE) USEBUILDDIR=1 AVX=512y -f $(CUDACPP_MAKEFILE) - -avx512z: - @echo - $(MAKE) USEBUILDDIR=1 AVX=512z -f $(CUDACPP_MAKEFILE) - -ifeq ($(UNAME_P),ppc64le) -###avxall: $(INCDIR)/fbridge.inc avxnone avxsse4 -avxall: avxnone avxsse4 -else ifeq ($(UNAME_P),arm) -###avxall: $(INCDIR)/fbridge.inc avxnone avxsse4 -avxall: avxnone avxsse4 -else -###avxall: $(INCDIR)/fbridge.inc avxnone avxsse4 avxavx2 avx512y avx512z -avxall: avxnone avxsse4 avxavx2 avx512y avx512z -endif - -#------------------------------------------------------------------------------- - -# Target: clean the builds -.PHONY: clean - -clean: -ifeq ($(USEBUILDDIR),1) - rm -rf $(BUILDDIR) -else - rm -f $(BUILDDIR)/.build.* $(BUILDDIR)/*.o $(BUILDDIR)/*.exe - rm -f $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(LIBDIR)/lib$(MG5AMC_CULIB).so -endif - $(MAKE) -C ../../src clean -f $(CUDACPP_SRC_MAKEFILE) -### rm -rf $(INCDIR) - -cleanall: - @echo - $(MAKE) USEBUILDDIR=0 clean -f $(CUDACPP_MAKEFILE) - @echo - $(MAKE) USEBUILDDIR=0 -C ../../src cleanall -f $(CUDACPP_SRC_MAKEFILE) - rm -rf build.* - -# Target: clean the builds as well as the gtest installation(s) -distclean: cleanall -ifneq ($(wildcard 
$(TESTDIRCOMMON)),) - $(MAKE) -C $(TESTDIRCOMMON) clean -endif - $(MAKE) -C $(TESTDIRLOCAL) clean - -#------------------------------------------------------------------------------- - -# Target: show system and compiler information -info: - @echo "" - @uname -spn # e.g. Linux nodename.cern.ch x86_64 -ifeq ($(UNAME_S),Darwin) - @sysctl -a | grep -i brand - @sysctl -a | grep machdep.cpu | grep features || true - @sysctl -a | grep hw.physicalcpu: - @sysctl -a | grep hw.logicalcpu: -else - @cat /proc/cpuinfo | grep "model name" | sort -u - @cat /proc/cpuinfo | grep "flags" | sort -u - @cat /proc/cpuinfo | grep "cpu cores" | sort -u - @cat /proc/cpuinfo | grep "physical id" | sort -u -endif - @echo "" -ifneq ($(shell which nvidia-smi 2>/dev/null),) - nvidia-smi -L - @echo "" -endif - @echo USECCACHE=$(USECCACHE) -ifeq ($(USECCACHE),1) - ccache --version | head -1 -endif - @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version -endif - @echo "" - @echo CXX=$(CXX) -ifneq ($(shell $(CXX) --version | grep ^clang),) - @echo $(CXX) -v - @$(CXX) -v |& egrep -v '(Found|multilib)' - @readelf -p .comment `$(CXX) -print-libgcc-file-name` |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print "GCC toolchain:",$$5}' -else - $(CXX) --version -endif - @echo "" - @echo FC=$(FC) - $(FC) --version - -#------------------------------------------------------------------------------- - -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) -check: runTest cmpFcheck cmpFGcheck -else -check: runTest cmpFcheck -endif - -# Target: runTest (run the C++ test executable runTest.exe) -runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe - -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) -runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 - -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) -runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone executable - with CUDA MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 - -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) -cmpFcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA and Fortran with CUDA MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) -memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 - -#------------------------------------------------------------------------------- From cf8875b648f9524f646242b4a03dfb1bbb2828a9 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 10:27:14 +0100 Subject: [PATCH 483/509] [jt774] (after merging upstream/master) fix CODEGEN cudacpp.mk: replace % by %% (code generation was failing) --- .../iolibs/template_files/gpu/cudacpp.mk | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 49a79b2674..011a5326ab 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -271,11 +271,11 @@ endif # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) - CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3% both for none and sse4 + CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3%% both for none and sse4 # Throughput references without the extra flags below: none=1.41-1.42E6, sse4=2.15-2.19E6 ###CXXFLAGS+= -DNO_WARN_X86_INTRINSICS # no change ###CXXFLAGS+= -fpeel-loops # no change - ###CXXFLAGS+= -funroll-loops # gains ~1% for none, loses ~1% for sse4 + ###CXXFLAGS+= -funroll-loops # gains ~1%% for none, loses ~1%% for sse4 ###CXXFLAGS+= -ftree-vectorize # no change ###CXXFLAGS+= -flto # would increase to none=4.08-4.12E6, sse4=4.99-5.03E6! else @@ -558,7 +558,7 @@ $(BUILDDIR)/%%.o : %%.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ -$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%%_cu.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif @@ -566,7 +566,7 @@ endif # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%%.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ @@ -656,7 +656,7 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): Fortran include files -###$(INCDIR)/%.inc : ../%.inc +###$(INCDIR)/%%.inc : ../%%.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### \cp $< $@ @@ -672,7 +672,7 @@ ifneq ($(shell $(CXX) --version | grep ^Intel),) $(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') $(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(cu_main): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o @@ -682,12 +682,12 @@ endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%%.o : %%.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%%.o : %%.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ @@ -756,7 +756,7 @@ ifneq ($(shell $(CXX) --version | grep ^Intel),) $(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') $(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(testmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(testmain): $(BUILDDIR)/runTest_cu.o $(testmain): cu_objects_exe += $(BUILDDIR)/runTest_cu.o @@ -946,14 +946,14 @@ cmpFcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: cmpFGcheck (compare ME results from the CUDA and Fortran with CUDA MEs standalone executables, with a small number of events) cmpFGcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! 
Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) memcheck: all.$(TAG) From e32bc4e6ea9ac0c3808c9644e5526c1b2bda3db2 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 10:36:44 +0100 Subject: [PATCH 484/509] [jt774] (after merging upstream/master) fix clang formatting in CODEGEN (code generation was failing clang formatting checks) --- .../template_files/cpp_model_parameters_h.inc | 40 +++++++++---------- .../template_files/gpu/MemoryAccessMomenta.h | 21 +++++----- .../iolibs/template_files/gpu/check_sa.cc | 8 ++-- .../CUDACPP_SA_OUTPUT/model_handling.py | 6 +-- 4 files changed, 37 insertions(+), 38 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc index 5ab7aa7abd..8b8797c04c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc @@ -194,8 +194,8 @@ namespace mg5amcCpu %(dcoupsetdcoup)s } %(eftspecial2)s - return out; - } + return out; + } #ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop @@ -213,33 +213,33 @@ namespace mg5amcCpu //========================================================================== #ifdef MGONGPUCPP_GPUIMPL -namespace mg5amcGpu + namespace mg5amcGpu #else -namespace mg5amcCpu + namespace mg5amcCpu #endif -{ + { #pragma GCC diagnostic push #ifndef __clang__ #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif - // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template - __device__ inline void - G2COUP( const fptype gs[], - fptype couplings[] ) - { - mgDebug( 0, __FUNCTION__ ); - using namespace Parameters_%(model_name)s_dependentCouplings; - const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs + template + __device__ inline void + G2COUP( const fptype gs[], + fptype couplings[] ) + { + mgDebug( 0, __FUNCTION__ ); + using namespace Parameters_%(model_name)s_dependentCouplings; + const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); %(dcoupaccessbuffer)s%(dcoupkernelaccess)s%(dcoupcompute)s - mgDebug( 1, __FUNCTION__ ); - return; - } + mgDebug( 1, __FUNCTION__ ); + return; + } #pragma GCC diagnostic pop -} // end namespace mg5amcGpu/mg5amcCpu + } // end namespace mg5amcGpu/mg5amcCpu -//========================================================================== + //========================================================================== #endif // Parameters_%(model_name)s_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h index d3f5a15bd2..86df5d5471 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h @@ -27,18 +27,17 @@ namespace mg5amcCpu class MemoryAccessMomentaBase //_AOSOAv1 { public: - - // Number of Events Per Page in the momenta AOSOA memory buffer layout - // (these are all best kept as a compile-time constants: see issue #23) + // Number of Events Per Page in the momenta AOSOA memory buffer layout + // (these are all best kept as a compile-time constants: see issue #23) #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - // ----------------------------------------------------------------------------------------------- - // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline - // --- This is relevant to ensure coalesced access to momenta in global memory - // --- Note that neppR is hardcoded and may differ from neppM and neppV on some platforms - // ----------------------------------------------------------------------------------------------- - //static constexpr int neppM = 64/sizeof(fptype); // 2x 32-byte GPU cache lines (512 bits): 8 (DOUBLE) or 16 (FLOAT) - static constexpr int neppM = 32/sizeof(fptype); // (DEFAULT) 32-byte GPU cache line (256 bits): 4 (DOUBLE) or 8 (FLOAT) - //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** (slower: 1.03E9 instead of 1.11E9 in eemumu) + // ----------------------------------------------------------------------------------------------- + // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline + // --- This is relevant to ensure coalesced access to momenta in global memory + // --- Note that neppR is hardcoded and may differ from neppM and neppV on some platforms + // ----------------------------------------------------------------------------------------------- + //static constexpr int neppM = 64/sizeof(fptype); // 2x 32-byte GPU cache lines (512 bits): 8 (DOUBLE) or 16 (FLOAT) + static constexpr int neppM = 32/sizeof(fptype); // (DEFAULT) 32-byte GPU cache line (256 bits): 4 (DOUBLE) or 8 (FLOAT) + //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** (slower: 1.03E9 instead of 1.11E9 in eemumu) #else // ----------------------------------------------------------------------------------------------- // --- CPUs: neppM is best set equal to 
the number of fptype's (neppV) in a vector register diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index 7c93c07a1a..b9a05dea46 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -152,7 +152,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -780,7 +780,7 @@ main( int argc, char** argv ) wrkflwtxt += "CPP:"; #endif // -- DOUBLE or FLOAT? -#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT /* clang-format off */ wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) #elif defined MGONGPU_FPTYPE_DOUBLE wrkflwtxt += "DBL+"; @@ -799,7 +799,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #elif defined __HIPCC__ #if defined MGONGPU_CUCXTYPE_CXSMPL wrkflwtxt += "CXS:"; @@ -1086,7 +1086,7 @@ main( int argc, char** argv ) #elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl #else - << "\"???\"," << std::endl // no path to this statement... + << "\"???\"," << std::endl // no path to this statement... 
#endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 3e0ebe545f..b585102292 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -859,11 +859,11 @@ def super_generate_parameters_class_files(self): replace_dict['dcoupsetdpar'] = ' ' + '\n'.join( dcoupsetdpar ) dcoupsetdcoup = [ ' ' + line.replace('constexpr cxsmpl ','out.').replace('mdl_complexi', 'cI') for line in self.write_hardcoded_parameters(list(self.coups_dep.values())).split('\n') if line != '' ] replace_dict['dcoupsetdcoup'] = ' ' + '\n'.join( dcoupsetdcoup ) - dcoupaccessbuffer = [ ' fptype* %ss = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_%s );'%( name, name ) for name in self.coups_dep ] + dcoupaccessbuffer = [ ' fptype* %ss = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_%s );'%( name, name ) for name in self.coups_dep ] replace_dict['dcoupaccessbuffer'] = '\n'.join( dcoupaccessbuffer ) + '\n' - dcoupkernelaccess = [ ' cxtype_sv_ref %ss_sv = C_ACCESS::kernelAccess( %ss );'%( name, name ) for name in self.coups_dep ] + dcoupkernelaccess = [ ' cxtype_sv_ref %ss_sv = C_ACCESS::kernelAccess( %ss );'%( name, name ) for name in self.coups_dep ] replace_dict['dcoupkernelaccess'] = '\n'.join( dcoupkernelaccess ) + '\n' - dcoupcompute = [ ' %ss_sv = couplings_sv.%s;'%( name, name ) for name in self.coups_dep ] + dcoupcompute = [ ' %ss_sv = couplings_sv.%s;'%( name, name ) for name in self.coups_dep ] replace_dict['dcoupcompute'] = '\n'.join( dcoupcompute ) # Special handling in EFT for fptype=float using SIMD dcoupoutfptypev2 = [ ' fptype_v %sr_v;\n fptype_v %si_v;'%(name,name) for name in self.coups_dep ] From def02b58e369c76e9f3b63b1991ca1ec8e148107 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 11:01:40 +0100 Subject: [PATCH 485/509] [jt774] regenerate gg_tt.mad - the build fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CUDA_HOME=none HIP_HOME=none make |& more ... ccache g++ -O3 -std=c++17 -I. 
-fPIC -Wall -Wshadow -Wextra -ffast-math -fopenmp -march=skylake-avx512 -mprefer-vector-width=256 -DMGONGPU_FPTYPE_DOU BLE -DMGONGPU_FPTYPE2_DOUBLE -DMGONGPU_HAS_NO_CURAND -fPIC -c Parameters_sm.cc -o Parameters_sm.o In file included from /usr/include/c++/11/locale:41, from /usr/include/c++/11/iomanip:43, from Parameters_sm.cc:17: /usr/include/c++/11/bits/locale_facets_nonio.h:59:39: error: ‘locale’ has not been declared 59 | struct __timepunct_cache : public locale::facet | ^~~~~~ --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 22 +- epochX/cudacpp/gg_tt.mad/COPYRIGHT | 1 + .../cudacpp/gg_tt.mad/SubProcesses/Bridge.h | 32 +-- .../gg_tt.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gg_tt.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_tt.mad/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gg_tt.mad/SubProcesses/EventStatistics.h | 4 +- .../gg_tt.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_tt.mad/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 7 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_tt.mad/SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 64 ++--- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttx/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttx/check_sa.cc | 117 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 233 +++++++++++------- .../cudacpp/gg_tt.mad/SubProcesses/fbridge.cc | 16 +- .../gg_tt.mad/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gg_tt.mad/SubProcesses/runTest.cc | 12 +- .../gg_tt.mad/SubProcesses/testmisc.cc | 8 +- .../cudacpp/gg_tt.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h | 58 +++-- epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h | 76 ++++-- epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h | 28 +-- epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h | 10 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h | 20 +- epochX/cudacpp/gg_tt.mad/src/rambo.h | 8 +- 49 files changed, 562 insertions(+), 516 deletions(-) delete mode 100644 epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h delete mode 120000 epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index a477013568..b56b36111b 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default 
text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005816459655761719  +DEBUG: model prefixing takes 0.005602598190307617  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.103 s +Wrote files for 10 helas calls in 0.106 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.155 s +ALOHA: aloha creates 2 routines in 0.152 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.135 s +ALOHA: aloha creates 4 routines in 0.148 s VVV1 FFV1 FFV1 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.729s -user 0m1.515s -sys 0m0.204s +real 0m1.778s +user 0m1.548s +sys 0m0.220s Code generation completed in 2 seconds ************************************************************ * * @@ -266,7 +266,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -296,7 +296,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". 
Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_tt.mad/COPYRIGHT b/epochX/cudacpp/gg_tt.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_tt.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL 
if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
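[A minimal sketch, not part of the patch: the gpuMemcpy call and the gpuMemcpyHostToDevice constant that replace checkCuda( cudaMemcpy( ... ) ) in Bridge.h above come from the new GPU abstraction layer; the macro bodies below are an assumption about how such aliases could be written, with the error check folded into the alias (which would explain why the new call sites drop the explicit checkCuda wrapper). The real GpuAbstraction.h may differ.]

#if defined __CUDACC__
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpy( dst, src, bytes, dir ) checkGpu( cudaMemcpy( dst, src, bytes, dir ) ) // CUDA backend, check folded in
#elif defined __HIPCC__
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpy( dst, src, bytes, dir ) checkGpu( hipMemcpy( dst, src, bytes, dir ) ) // HIP backend, check folded in
#endif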
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" #include //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
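[A plausible sketch, not confirmed by the patch: the gpuLaunchKernel / gpuLaunchKernelSharedMem calls in MatrixElementKernels.cc above hide the CUDA-only triple-chevron launch syntax so that the same source also compiles under HIP (hipcc accepts the same chevron syntax). Variadic macros along these lines would reproduce the call sites seen in the diff; the actual wrappers in GpuRuntime.h/GpuAbstraction.h may be implemented differently.]

#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ ) // default 0 bytes of dynamic shared memory
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  kernel<<<( blocks ), ( threads ), ( sharedMem )>>>( __VA_ARGS__ ) // explicit dynamic shared memory size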
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..86df5d5471 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
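[An assumption for illustration: every #ifdef __CUDACC__ in these headers becomes #ifdef MGONGPUCPP_GPUIMPL, decoupling the GPU code path from the NVIDIA compiler specifically. The diffstat shows large changes in src/mgOnGpuConfig.h, so presumably the new macro is defined centrally there for any supported GPU compiler, roughly as below; the exact definition is not shown in this patch.]

// sketch: one macro selects the GPU implementation for either the CUDA or the HIP compiler
#if defined( __CUDACC__ ) || defined( __HIPCC__ )
#define MGONGPUCPP_GPUIMPL 1
#endif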
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -27,10 +27,9 @@ namespace mg5amcCpu class MemoryAccessMomentaBase //_AOSOAv1 { public: - // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
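[A quick worked example of the neppM comments above: the AOSOA page size follows directly from the cache-line size and the floating-point type, and Bridge.h documents the resulting layout as momenta[npagM][npar][np4][neppM] with nevt = npagM * neppM. The static_asserts below just spell out the default 32-byte choice.]

#include <cstddef>
// default GPU choice: neppM = 32 bytes / sizeof(fptype) events per AOSOA page
static_assert( 32 / sizeof( double ) == 4, "DOUBLE: 4 events share one 32-byte cache line" );
static_assert( 32 / sizeof( float ) == 8, "FLOAT: 8 events share one 32-byte cache line" );
// the same momentum component of neppM consecutive events is contiguous, giving coalesced loads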
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer 
: public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
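[A sketch of the assumed mapping, not the actual GpuAbstraction.h contents: the gpuMallocHost / gpuFreeHost calls in PinnedHostBufferBase above apply the same aliasing pattern to page-locked host memory; on the HIP side the non-deprecated hipHostMalloc / hipHostFree names are used.]

#if defined __CUDACC__
#define gpuMallocHost( ptr, bytes ) checkGpu( cudaMallocHost( ptr, bytes ) ) // pinned host memory, CUDA
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#elif defined __HIPCC__
#define gpuMallocHost( ptr, bytes ) checkGpu( hipHostMalloc( ptr, bytes ) ) // pinned host memory, HIP
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
#endif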
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer<fptype, sizePerEventWeights, HostBufferALIGNED> HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer<fptype, sizePerEventMatrixElements, HostBufferALIGNED> HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase<bool> BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase<bool, HostBufferALIGNED> HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer<fptype, sizePerEventWavefunctions, HostBufferALIGNED> HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer<fptype, sizePerEventRndNumHelicity, HostBufferALIGNED> HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer<fptype, sizePerEventRndNumColor, HostBufferALIGNED> HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer<int, sizePerEventSelectedHelicity, HostBufferALIGNED> HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer<int, sizePerEventSelectedColor, HostBufferALIGNED> HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 18052b6676..e167c60e14 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; 
++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -283,6 +284,8 @@ namespace mg5amcCpu #endif jamp_sv[1] -= amp_sv[0]; +#include "GpuAbstraction.h" + // *** COLOR CHOICE BELOW *** // Store the leading color flows for choice of color if( jamp2_sv ) // disable color choice if nullptr @@ -302,7 +305,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -359,7 +362,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -418,7 +421,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -465,8 +468,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -506,9 +509,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -544,7 +547,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -609,12 +612,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -635,7 +638,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -761,9 +764,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -787,7 +790,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -807,7 +810,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -821,9 +824,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -851,7 +857,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1061,7 +1067,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 3ebd92c038..4a88a07226 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc index 3fbf0ffbee..b9a05dea46 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? 
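The "fail gently" SIMD guard described in the surrounding comments can be illustrated with a small standalone program. This is an illustration only, assuming the GCC/Clang __builtin_cpu_supports builtin on x86; the real check in check_sa.cc covers each SIMD level that the build actually enables.

#include <cstdio>
#include <cstdlib>
int main()
{
#if defined __AVX2__
  // Refuse to run if this binary was built for AVX2 but the host lacks it,
  // instead of dying later with "Illegal instruction (core dumped)".
  if( !__builtin_cpu_supports( "avx2" ) )
  {
    std::fprintf( stderr, "ERROR! This host does not support AVX2\n" );
    return EXIT_FAILURE;
  }
#endif
  std::printf( "SIMD check passed\n" );
  return EXIT_SUCCESS;
}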
// [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR!
CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -758,8 +761,10 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif @@ -767,13 +772,15 @@ main( int argc, char** argv ) // Workflow description summary std::string wrkflwtxt; // -- CUDA or C++? -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif // -- DOUBLE or FLOAT? -#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT /* clang-format off */ wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) #elif defined MGONGPU_FPTYPE_DOUBLE wrkflwtxt += "DBL+"; @@ -783,7 +790,7 @@ main( int argc, char** argv ) wrkflwtxt += "???+"; // no path to this statement #endif // -- CUCOMPLEX or THRUST or STD complex numbers?
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; #elif defined MGONGPU_CUCXTYPE_THRUST @@ -792,6 +799,12 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement +#endif /* clang-format on */ +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement #endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -893,8 +906,10 @@ main( int argc, char** argv ) #endif // Dump all configuration parameters and all results std::cout << std::string( SEP79, '*' ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,22 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +982,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1078,15 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1094,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 509307506b..24f2d49d80 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
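Stepping back to the gpuLaunchKernel calls introduced in RamboSamplingKernels.cc above: they replace the CUDA-only triple-chevron launch syntax, and a single portability macro is enough to cover both compilers. The following is a plausible sketch, not necessarily the plugin's actual definition in its GPU abstraction header.

// Sketch of a kernel-launch portability macro (assumption: one translation
// unit is compiled either by nvcc or by hipcc when this is used).
#ifdef __CUDACC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif

With a macro of this shape, the same call site, e.g. gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ), compiles unchanged under both nvcc and hipcc, which is exactly what the RamboSamplingKernels changes rely on.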
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,77 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) 
+ CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +259,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +283,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +295,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +351,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +371,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +429,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +444,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +453,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +505,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +522,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +553,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +571,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +595,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +606,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +634,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +646,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +667,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
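On the -fno-fast-math special case for CrossSectionKernels above (#117 and #516): under -ffast-math the compiler may assume that NaNs and infinities never occur, so NaN tests can be folded away at compile time, silently disabling the NaN/abnormal-ME accounting. A small illustration of the hazard (not plugin code):

#include <cmath>
#include <cstdio>
int main()
{
  const double me = std::nan( "" ); // stand-in for an abnormal matrix element
  // Built with -ffast-math this test may be constant-folded to 'false';
  // built with -fno-fast-math it reliably flags the NaN.
  if( std::isnan( me ) )
    std::printf( "abnormal ME detected\n" );
  else
    std::printf( "NaN check was optimized away\n" );
  return 0;
}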
#------------------------------------------------------------------------------- @@ -637,17 +702,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +724,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +737,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +749,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +778,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
 $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
 	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
 else # link both runTest.o and runTest_cu.o
 $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
 endif

 # Use target gtestlibs to build only googletest
@@ -829,9 +894,9 @@ ifeq ($(USECCACHE),1)
 	ccache --version | head -1
 endif
 	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
 endif
 	@echo ""
 	@echo CXX=$(CXX)
@@ -850,7 +915,7 @@ endif

 # Target: check (run the C++ test executable)
 # [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 check: runTest cmpFcheck cmpFGcheck
 else
 check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

 extern "C"
 {
@@ -22,7 +22,7 @@ extern "C"
  * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
  * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h index 55f43bb43a..add8fce575 100644 --- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc index a9bc93ff98..c5dd6e7e4c 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index 932f123fea..06fc44c44c 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -279,33 +279,39 @@ namespace mg5amcCpu //========================================================================== +#ifdef MGONGPUCPP_GPUIMPL + namespace mg5amcGpu +#else + namespace mg5amcCpu +#endif + { #pragma GCC diagnostic push #ifndef __clang__ #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif - // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template - __device__ inline void - G2COUP( const fptype gs[], - fptype couplings[] ) - { - mgDebug( 0, __FUNCTION__ ); - using namespace Parameters_sm_dependentCouplings; - const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - GC_10s_sv = couplings_sv.GC_10; - GC_11s_sv = couplings_sv.GC_11; - mgDebug( 1, __FUNCTION__ ); - return; - } + // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs
+  template<class G_ACCESS, class C_ACCESS>
+  __device__ inline void
+  G2COUP( const fptype gs[],
+          fptype couplings[] )
+  {
+    mgDebug( 0, __FUNCTION__ );
+    using namespace Parameters_sm_dependentCouplings;
+    const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs );
+    DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv );
+    fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 );
+    fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 );
+    cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s );
+    cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s );
+    GC_10s_sv = couplings_sv.GC_10;
+    GC_11s_sv = couplings_sv.GC_11;
+    mgDebug( 1, __FUNCTION__ );
+    return;
+  }
 #pragma GCC diagnostic pop
-} // end namespace mg5amcGpu/mg5amcCpu
+  } // end namespace mg5amcGpu/mg5amcCpu

-//==========================================================================
+  //==========================================================================

 #endif // Parameters_sm_H
diff --git a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk
index d4cc628aec..159e19a46d 100644
--- a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk
+++ b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk
@@ -19,7 +19,7 @@ SHELL := /bin/bash
 #=== Configure common compiler flags for CUDA and C++

 INCFLAGS = -I.
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

 #-------------------------------------------------------------------------------

@@ -45,13 +45,13 @@ endif

 #-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)

-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))

 #-------------------------------------------------------------------------------

-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)

 # Enable ccache if USECCACHE=1
 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
 ###$(info OMPFLAGS=$(OMPFLAGS))
 CXXFLAGS += $(OMPFLAGS)

+# Add the correct GPU compile flags depending on whether GPUCC is nvcc or hipcc
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
 # Set the build flags appropriate to each AVX choice (example: "make AVX=none")
 # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
 # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)

 # Generic target and build rules: objects from CUDA compilation
 $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
 	@if [ !
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index 80032e528b..d9af210552 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -1,21 +1,37 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 +#include "GpuRuntime.h" // Includes the GPU abstraction + // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +39,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. 
-DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +71,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +108,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +160,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non 
aligned memory may lead to segmentation faults!)
 // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
 #endif

@@ -145,7 +171,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;

 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -175,9 +201,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
 #if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -189,8 +215,8 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */

-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
 #define __global__
 #define __host__
 #define __device__
diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h
index ca9a9f00c0..5532e22fa1 100644
--- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
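For reference, the whole backend abstraction in the mgOnGpuConfig.h hunks above reduces to defining MGONGPUCPP_GPUIMPL once from the compiler's own macros and branching on its defined-ness everywhere else. A minimal self-contained sketch of the pattern (the backend() probe is illustrative only, not part of the patch):

    #include <iostream>

    // Define the backend macro once, as mgOnGpuConfig.h does above
    #ifdef __CUDACC__
    #define MGONGPUCPP_GPUIMPL cuda
    #elif defined __HIPCC__
    #define MGONGPUCPP_GPUIMPL hip
    #endif

    // Branch on defined-ness only: the same source lands in namespace mg5amcGpu
    // for any GPU build (CUDA or HIP) and in mg5amcCpu otherwise, so CPU and GPU
    // objects can be linked into one binary without symbol clashes (#318, #725)
    #ifdef MGONGPUCPP_GPUIMPL
    namespace mg5amcGpu
    #else
    namespace mg5amcCpu
    #endif
    {
      inline const char* backend()
      {
    #ifdef MGONGPUCPP_GPUIMPL
        return "gpu (cuda or hip)";
    #else
        return "cpp";
    #endif
      }
    }

    int main()
    {
    #ifdef MGONGPUCPP_GPUIMPL
      std::cout << mg5amcGpu::backend() << std::endl;
    #else
      std::cout << mg5amcCpu::backend() << std::endl;
    #endif
      return 0;
    }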
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h index 905c97d700..83a46c1d4e 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h index e1299ba81e..dd8b83752d 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h @@ -9,6 +9,8 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuFptypes.h" +#include "GpuAbstraction.h" + #include //========================================================================== @@ -32,7 +34,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +133,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +155,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +807,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +855,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL 
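The same macro also drives the SIMD configuration touched by the mgOnGpuVectors.h hunks above: any GPU build undefines MGONGPU_CPPSIMD, so neppV collapses to one event per thread, while C++ builds derive the vector width from the AVX flags. A self-contained sketch, assuming a double-precision build (fptype is a local stand-in here; the "512z" width follows the comment in mgOnGpuConfig.h, the avx2 width is the usual 256-bit value):

    #include <cstdio>

    typedef double fptype; // stand-in, assuming MGONGPU_FPTYPE_DOUBLE

    #ifdef MGONGPUCPP_GPUIMPL
    constexpr int neppV = 1; // CUDA and HIP implementations have no SIMD
    #elif defined __AVX512VL__ && defined MGONGPU_PVW512
    constexpr int neppV = 64 / sizeof( fptype ); // "512z": 8 doubles or 16 floats
    #elif defined __AVX2__
    constexpr int neppV = 32 / sizeof( fptype ); // "avx2": 4 doubles or 8 floats
    #else
    constexpr int neppV = 1; // scalar fallback (AVX=none)
    #endif

    int main()
    {
      printf( "events per vector (neppV) = %d\n", neppV );
      return 0;
    }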
//========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +881,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt.mad/src/rambo.h b/epochX/cudacpp/gg_tt.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt.mad/src/rambo.h +++ b/epochX/cudacpp/gg_tt.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) From d4200cf4eaa9b21ba9ec7df1cccf24a03efbd5f8 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 11:21:06 +0100 Subject: [PATCH 486/509] Revert "[jt774] regenerate gg_tt.mad - the build fails" This reverts commit def02b58e369c76e9f3b63b1991ca1ec8e148107. 
--- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 22 +- epochX/cudacpp/gg_tt.mad/COPYRIGHT | 1 - .../cudacpp/gg_tt.mad/SubProcesses/Bridge.h | 32 +-- .../gg_tt.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gg_tt.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_tt.mad/SubProcesses/CudaRuntime.h | 85 +++++++ .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gg_tt.mad/SubProcesses/EventStatistics.h | 4 +- .../gg_tt.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_tt.mad/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 7 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_tt.mad/SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 64 +++-- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttx/CudaRuntime.h | 1 + .../SubProcesses/P1_gg_ttx/check_sa.cc | 117 ++++----- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 233 +++++++----------- .../cudacpp/gg_tt.mad/SubProcesses/fbridge.cc | 16 +- .../gg_tt.mad/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gg_tt.mad/SubProcesses/runTest.cc | 12 +- .../gg_tt.mad/SubProcesses/testmisc.cc | 8 +- .../cudacpp/gg_tt.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h | 58 ++--- epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h | 76 ++---- epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h | 28 +-- epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h | 10 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h | 20 +- epochX/cudacpp/gg_tt.mad/src/rambo.h | 8 +- 49 files changed, 516 insertions(+), 562 deletions(-) create mode 100644 epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index b56b36111b..a477013568 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +No valid web browser found. 
Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005602598190307617  +DEBUG: model prefixing takes 0.005816459655761719  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.106 s +Wrote files for 10 helas calls in 0.103 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.152 s +ALOHA: aloha creates 2 routines in 0.155 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.148 s +ALOHA: aloha creates 4 routines in 0.135 s VVV1 FFV1 FFV1 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.778s -user 0m1.548s -sys 0m0.220s +real 0m1.729s +user 0m1.515s +sys 0m0.204s Code generation completed in 2 seconds ************************************************************ * * @@ -266,7 +266,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -296,7 +296,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +No valid web browser found. 
Please set in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_tt.mad/COPYRIGHT b/epochX/cudacpp/gg_tt.mad/COPYRIGHT index 84a883fbb0..a134b5fef9 100644 --- a/epochX/cudacpp/gg_tt.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.mad/COPYRIGHT @@ -15,7 +15,6 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index 89437b4c42..bf8b5e024d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 
+249,7 @@ namespace mg5amcCpu
 #else
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
     m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPUCPP_GPUIMPL
+#endif // __CUDACC__
 // Create a process object, read param card and set parameters
 // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
 // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads?
@@ -262,7 +262,7 @@ namespace mg5amcCpu
 process.initProc( paramCard );
 }

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 template<typename FORTRANFPTYPE>
 void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads )
 {
@@ -276,7 +276,7 @@ namespace mg5amcCpu
 }
 #endif

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 template<typename FORTRANFPTYPE>
 void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta,
 const FORTRANFPTYPE* gs,
@@ -291,14 +291,14 @@ namespace mg5amcCpu
 constexpr int neppM = MemoryAccessMomenta::neppM;
 if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
 {
-  gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
+  checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
 }
 else
 {
-  gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
+  checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
 const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
 //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
-  gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+  dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
 }
 if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
 {
@@ -341,7 +341,7 @@ namespace mg5amcCpu
 }
 #endif

-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 template<typename FORTRANFPTYPE>
 void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta,
 const FORTRANFPTYPE* gs,
@@ -396,7 +396,7 @@ namespace mg5amcCpu
 // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
 //

-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 template<typename Tin, typename Tout>
 __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
 {
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc
index eaf4037a24..d58066c9c1 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc
@@ -1,18 +1,17 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
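The gpu_sequence hunk above is the core of this revert: the backend-neutral gpuMemcpy/gpuLaunchKernel wrappers go back to raw CUDA runtime calls and triple-chevron launches. A side-by-side sketch of the two launch idioms (dummyTranspose and the wrapper macro are illustrative stand-ins; the real GpuAbstraction.h definitions may differ in detail):

    // Stand-in for the abstraction layer: expand a wrapped launch into the
    // CUDA triple-chevron syntax (a HIP build would expand it differently)
    #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )

    __global__ void dummyTranspose( const double* in, double* out, unsigned int nevt )
    {
      const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
      if( i < nevt ) out[i] = in[i]; // illustrative body only
    }

    void launchBothWays( const double* devIn, double* devOut, unsigned int nevt, int blocks, int threads )
    {
      // Abstraction-layer style (what this revert removes):
      gpuLaunchKernel( dummyTranspose, blocks, threads, devIn, devOut, nevt );
      // Raw CUDA style (what this revert restores):
      dummyTranspose<<<blocks, threads>>>( devIn, devOut, nevt );
    }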
#include "BridgeKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -46,7 +45,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -97,7 +96,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h index 3efef8ce97..15eb4bff4d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc index 010bc4cbd0..985b39f576 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,16 +1,15 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" -#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc index c15b39844d..0b355a3c8d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -78,7 +77,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -186,7 +185,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h index 4d9659e04e..7933ca4bbf 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h new file mode 100644 index 0000000000..64ce52f4b3 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_CUDARUNTIME_H +#define MG5AMC_CUDARUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef __CUDACC__ /* clang-format off */ +#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } +inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +{ + if( code != cudaSuccess ) + { + printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); + if( abort ) assert( code == cudaSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ +namespace mg5amcGpu +{ + // Instantiate a CudaRuntime at the beginning of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct CudaRuntime final + { + CudaRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~CudaRuntime() { tearDown( m_debug ); } + CudaRuntime( const CudaRuntime& ) = delete; + CudaRuntime( CudaRuntime&& ) = delete; + CudaRuntime& operator=( const CudaRuntime& ) = delete; + CudaRuntime& operator=( CudaRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; + checkCuda( cudaSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; + checkCuda( cudaDeviceReset() ); + } + }; + +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc index 08a16f6f2c..eb56333b03 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h index b425a5bade..48b51e0a49 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h index a64c05c26a..ef40624c88 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index 81699dfea9..74b5239ebf 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkGpu( gpuPeekAtLastError() ); + checkCuda( cudaPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkCuda( cudaPeekAtLastError() ); + checkCuda( cudaDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h index 72bd8f195b..23e84757a2 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
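// [Editorial note, not part of the patch] The hunks above revert the portable
// gpuLaunchKernel/gpuLaunchKernelSharedMem wrappers to CUDA's native triple-chevron
// launches. As a hedged sketch (assuming, from the removed lines, that the wrappers
// were thin variadic macros), the two forms map onto each other as follows:
//
//   #define gpuLaunchKernel( kernel, blocks, threads, ... ) \
//     kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
//   #define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
//     kernel<<<( blocks ), ( threads ), ( sharedMem )>>>( __VA_ARGS__ )
//
// so gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, ... )
// and sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( ... ) launch identically.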
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h index ffb76e93de..573b3bbbc9 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h index 3afdf3e554..35a3af42e0 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h index ffcdf4dbef..dc0d93afff 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h index 66f2d32a6b..3bce635718 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h index 4c726b30f3..31311aa375 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h index db73e4e064..c82a6c7635 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h index 3741011971..f32e6fea5b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h index 86df5d5471..29266de32c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -27,9 +27,10 @@ namespace mg5amcCpu class MemoryAccessMomentaBase //_AOSOAv1 { public: + // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h index 18991f4fa6..b152183b28 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h index 40cb089135..e2988d39f3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h index 08faccff0f..e9b197368e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
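// [Editorial note, not part of the patch] The neppM discussion above refers to the
// AOSOA ("array of structs of arrays") momenta layout. A minimal sketch of the
// indexing it implies, assuming a buffer shaped momenta[npagM][npar][np4][neppM]
// (momentaIndex is an illustrative helper, not part of the code):
//
//   inline int momentaIndex( int ievt, int ip, int i4, int npar, int np4, int neppM )
//   {
//     const int ipagM = ievt / neppM; // page of neppM events containing this event
//     const int ieppM = ievt % neppM; // position of this event within its page
//     return ( ( ipagM * npar + ip ) * np4 + i4 ) * neppM + ieppM;
//   }
//
// With neppM a power of two spanning a 32-byte cacheline, consecutive GPU threads
// (consecutive ieppM) read contiguous fptype's, which is what makes the access coalesced.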
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h index 33bef4559e..5428aaf933 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h index 7756a71621..3093e6ed18 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "Parameters_sm.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - gpuMallocHost( &( this->m_data ), this->bytes() ); + checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); } virtual ~PinnedHostBufferBase() { - gpuFreeHost( this->m_data ); + checkCuda( cudaFreeHost( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - gpuMalloc( &( this->m_data ), this->bytes() ); + checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); } virtual ~DeviceBufferBase() { - gpuFree( this->m_data ); + checkCuda( cudaFree( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer 
: public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
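// [Editorial note, not part of the patch] The buffer classes above tie each CUDA
// allocation to an object lifetime (RAII), so the revert from the gpuMalloc*/gpuFree*
// wrappers to checkCuda( cudaMalloc*/cudaFree* ) changes the call, not the pattern.
// A standalone sketch of the same idea (ScopedDeviceBuffer is illustrative only):
//
//   template<typename T>
//   struct ScopedDeviceBuffer
//   {
//     T* data = nullptr;
//     explicit ScopedDeviceBuffer( size_t n ) { checkCuda( cudaMalloc( &data, n * sizeof( T ) ) ); }
//     ~ScopedDeviceBuffer() { cudaFree( data ); } // freed automatically at end of scope
//     ScopedDeviceBuffer( const ScopedDeviceBuffer& ) = delete; // non-copyable, like the classes above
//   };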
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index e167c60e14..18052b6676 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -45,7 +46,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -79,7 +80,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -89,7 +90,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -117,13 +118,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -150,7 +151,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -186,7 +187,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; 
++iParity ) { // START LOOP ON IPARITY -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -199,10 +200,8 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop -#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -284,8 +283,6 @@ namespace mg5amcCpu #endif jamp_sv[1] -= amp_sv[0]; -#include "GpuAbstraction.h" - // *** COLOR CHOICE BELOW *** // Store the leading color flows for choice of color if( jamp2_sv ) // disable color choice if nullptr @@ -305,7 +302,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -362,7 +359,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -421,7 +418,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -468,8 +465,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -509,9 +506,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); - //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -547,7 +544,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] + // [Use __NVCC__ instead of __CUDACC__ here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -612,12 +609,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -638,7 +635,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -764,9 +761,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); - gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); + checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -790,7 +787,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -810,7 +807,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -824,12 +821,9 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - -#include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -857,7 +851,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ +#ifdef __CUDACC__ // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1067,7 +1061,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 4a88a07226..3ebd92c038 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h new file mode 120000 index 0000000000..ce9e1a487a --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h @@ -0,0 +1 @@ +../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc index b9a05dea46..3fbf0ffbee 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
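// [Editorial note, not part of the patch] In the sigmaKin declarations above, the same
// source builds both as a CUDA kernel (one event per GPU thread) and as a C++ loop over
// nevt events. A hedged minimal sketch of that dual structure, using the
// helcolDenominators[0] = 256 seen earlier (for a gg initial state this is the average
// over 2 helicities x 8 colours per gluon, i.e. (2*8)^2); scaleMEs is an illustrative
// name, not part of the code:
//
//   #ifdef __CUDACC__
//   __global__ void scaleMEs( fptype* allMEs )
//   {
//     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // event index in grid
//     allMEs[ievt] /= 256;
//   }
//   #else
//   void scaleMEs( fptype* allMEs, const int nevt )
//   {
//     for( int ievt = 0; ievt < nevt; ievt++ ) allMEs[ievt] /= 256; // loop over events
//   }
//   #endif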
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,7 +12,6 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" -#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -66,7 +65,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -97,7 +96,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -135,11 +134,9 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -149,10 +146,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -180,7 +177,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -201,7 +198,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -275,13 +272,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? 
// [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -299,14 +296,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ - // --- 00. Initialise GPU - // Instantiate a GpuRuntime at the beginnining of the application's main. - // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. - const std::string cdinKey = "00 GpuInit"; + // --- 00. Initialise cuda + // Instantiate a CudaRuntime at the beginning of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + const std::string cdinKey = "00 CudaInit"; timermap.start( cdinKey ); - GpuRuntime GpuRuntime( debug ); + CudaRuntime cudaRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -328,7 +325,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -336,7 +333,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -344,7 +341,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -352,7 +349,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -369,7 +366,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -378,7 +375,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -387,7 +384,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -395,7 +392,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -403,7 +400,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -441,7 +438,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -453,7 +450,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -464,7 +461,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -472,7 +469,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -514,7 +511,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -546,7 +543,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -591,7 +588,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -620,7 +617,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( !bridge ) { // --- 3b. CopyDToH MEs @@ -761,10 +758,8 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rndgentxt += " (CUDA code)"; -#elif defined __HIPCC__ - rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif @@ -772,15 +767,13 @@ main( int argc, char** argv ) // Workflow description summary std::string wrkflwtxt; // -- CUDA or C++? -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ wrkflwtxt += "CUD:"; -#elif defined __HIPCC__ - wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif // -- DOUBLE or FLOAT? -#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT /* clang-format off */ +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) #elif defined MGONGPU_FPTYPE_DOUBLE wrkflwtxt += "DBL+"; @@ -790,7 +783,7 @@ main( int argc, char** argv ) wrkflwtxt += "???+"; // no path to this statement #endif // -- CUCOMPLEX or THRUST or STD complex numbers? 
-#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; #elif defined MGONGPU_CUCXTYPE_THRUST @@ -799,12 +792,6 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif /* clang-format on */ -#elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL - wrkflwtxt += "CXS:"; -#else - wrkflwtxt += "???:"; // no path to this statement #endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX @@ -831,7 +818,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -887,7 +874,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -906,10 +893,8 @@ main( int argc, char** argv ) #endif // Dump all configuration parameters and all results std::cout << std::string( SEP79, '*' ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" -#elif defined __HIPCC__ - << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -936,22 +921,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "Complex type = STD::COMPLEX" << std::endl +#endif #else - << "Complex type = ???" << std::endl // no path to this statement... + << "Complex type = STD::COMPLEX" << std::endl #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -982,7 +966,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1078,15 +1062,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "\"STD::COMPLEX\"," << std::endl +#endif #else - << "\"???\"," << std::endl // no path to this statement... + << "\"STD::COMPLEX\"," << std::endl #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1094,7 +1077,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc index 79abbcc4f8..da68aa9255 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaInitial() { - gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); + getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaFinal() { - gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h index 7c214cd74b..184089efd7 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h index 21d63beeac..188a72c2c9 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 24f2d49d80..509307506b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP +#=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,46 +121,24 @@ endif #------------------------------------------------------------------------------- -#=== Configure the GPU compiler (CUDA or HIP) +#=== Configure the CUDA compiler -# FIXME! 
(AV 24.01.2024) -# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. -# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. -# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. -# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) -# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) +# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled - override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the path to nvcc +# If CUDA_HOME is not set, try to set it from the location of nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# If HIP_HOME is not set, try to set it from the path to hipcc -ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") -endif - -# FIXME! (AV 24.01.2024) -# In the current implementation (without separate builds for C++ and CUDA/HIP), -# builds are performed for HIP only if CUDA is not found in the path. -# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. -# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). - -#--- Option 1: CUDA exists -> use CUDA - -# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists +# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - - GPUCC = $(CUDA_HOME)/bin/nvcc + NVCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -180,77 +158,41 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - CUDATESTFLAGS = -lcuda - - # Set the host C++ compiler for GPUCC via "-ccbin " - # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) - GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif - + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - -#--- Option 2: CUDA does not exist, HIP exists -> use HIP - -# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists -else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - - GPUCC = $(HIP_HOME)/bin/hipcc - #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP - # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) 
- CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - -else ifneq ($(origin REQUIRE_HIP),undefined) - - # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - -#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP - + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) else - - # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ + # No cuda. Switch cuda compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= + override NVCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= - endif +export NVCC +export CUFLAGS -# Export GPUCC (so that it can also be used in cudacpp_src.mk?) -export GPUCC -export GPUFLAGS +# Set the host C++ compiler for nvcc via "-ccbin " +# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) +CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + +# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) +ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) +CUFLAGS += -allow-unsupported-compiler +endif #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA/HIP builds +#=== Configure ccache for C++ and CUDA builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -259,15 +201,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) - override GPUCC:=ccache $(GPUCC) +ifneq ($(NVCC),) + ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) + override NVCC:=ccache $(NVCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP +#=== Configure PowerPC-specific compiler flags for C++ and CUDA # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -283,9 +225,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) +# PowerPC-specific CUDA compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - GPUFLAGS+= -Xcompiler -mno-float128 + CUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -295,7 +237,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -351,10 +293,7 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + ifeq ($(NVCC),) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -371,7 +310,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -429,13 +368,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -444,7 +383,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + CUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -453,7 +392,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + CUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -505,11 +444,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -522,7 +461,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) +ifneq ($(NVCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -553,16 +492,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -571,14 +509,11 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) -# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) +$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifeq ($(findstring nvcc,$(GPUCC)),nvcc) - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math -else - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math +ifneq ($(NVCC),) +$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math endif endif @@ -595,10 +530,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(GPUCC),) -GPUFLAGS += -Wno-deprecated-builtins +ifneq ($(NVCC),) +CUFLAGS += -Xcompiler -Wno-deprecated-builtins endif endif @@ -606,8 +541,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(NVCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -634,7 +569,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -646,11 +581,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -667,16 +602,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
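NB: in the build rules above, each .cc source is compiled twice: once with $(CXX) into a %.o object, and once with $(NVCC) plus "-x cu" (which tells nvcc to treat the .cc file as CUDA source, so that CUDA-only constructs such as the triple-chevron kernel launches restored by this patch can parse) into a %_cu.o object. The two builds of the same translation unit do not clash because the code lands in different namespaces, following the pattern used throughout these sources; a minimal sketch of that pattern:

    // One translation unit, two objects (sketch of the pattern in these files):
    #ifdef __CUDACC__
    namespace mg5amcGpu // compiled by nvcc with -x cu (e.g. into fbridge_cu.o)
    #else
    namespace mg5amcCpu // compiled by the host C++ compiler (e.g. into fbridge.o)
    #endif
    {
      // ... identical source code, built once for the GPU and once for the CPU ...
    }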
#------------------------------------------------------------------------------- @@ -702,17 +637,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -724,7 +659,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -737,7 +672,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -749,12 +684,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -778,14 +713,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(GPUCC),) # link only runTest.o +ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif # Use target gtestlibs to build only googletest @@ -894,9 +829,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo GPUCC=$(GPUCC) -ifneq ($(GPUCC),) - $(GPUCC) --version + @echo NVCC=$(NVCC) +ifneq ($(NVCC),) + $(NVCC) --version endif @echo "" @echo CXX=$(CXX) @@ -915,7 +850,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(GPUCC),) +ifneq ($(NVCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc index 22ce3f5115..2d2b36d560 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::setUp(); +#ifdef __CUDACC__ + CudaRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::tearDown(); +#ifdef __CUDACC__ + CudaRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc index 3743934f41..2fb445372d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc index de327f2321..d4a760a71b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
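NB: the DeviceReset helper in the runTest.cc hunk below calls checkCuda( cudaDeviceReset() ) so that cuda-memcheck can verify that all device allocations were released. A typical shape for such a guard is sketched here for illustration only (the actual macro is defined in CudaRuntime.h and may differ in detail):

    // Illustrative sketch of a CUDA error-checking guard (assumed form, not
    // the verbatim CudaRuntime.h implementation).
    #include <cassert>
    #include <cstdio>
    #include <cuda_runtime.h>
    #define checkCuda( code ) assertCuda( code, __FILE__, __LINE__ )
    inline void assertCuda( cudaError_t code, const char* file, int line )
    {
      if( code != cudaSuccess )
      {
        std::printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n",
                     cudaGetErrorString( code ), code, file, line );
        assert( code == cudaSuccess );
      }
    }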
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc index ba9e59a8a3..895d6eeb56 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc index e5167de00c..3361fe5aa9 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h index add8fce575..55f43bb43a 100644 --- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc index c5dd6e7e4c..a9bc93ff98 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index 06fc44c44c..932f123fea 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -279,39 +279,33 @@ namespace mg5amcCpu //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL - namespace mg5amcGpu -#else - namespace mg5amcCpu -#endif - { #pragma GCC diagnostic push #ifndef __clang__ #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif - // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template - __device__ inline void - G2COUP( const fptype gs[], - fptype couplings[] ) - { - mgDebug( 0, __FUNCTION__ ); - using namespace Parameters_sm_dependentCouplings; - const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - GC_10s_sv = couplings_sv.GC_10; - GC_11s_sv = couplings_sv.GC_11; - mgDebug( 1, __FUNCTION__ ); - return; - } + // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs + template + __device__ inline void + G2COUP( const fptype gs[], + fptype couplings[] ) + { + mgDebug( 0, __FUNCTION__ ); + using namespace Parameters_sm_dependentCouplings; + const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); + GC_10s_sv = couplings_sv.GC_10; + GC_11s_sv = couplings_sv.GC_11; + mgDebug( 1, __FUNCTION__ ); + return; + } #pragma GCC diagnostic pop - } // end namespace mg5amcGpu/mg5amcCpu +} // end namespace mg5amcGpu/mg5amcCpu - //========================================================================== +//========================================================================== #endif // Parameters_sm_H diff --git a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk index 159e19a46d..d4cc628aec 100644 --- a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) +#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) -###$(info GPUCC=$(GPUCC)) +###$(info NVCC=$(NVCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,13 +92,6 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) -# Add correct -DHIP_LATFORM when compiling for HIP -ifeq ($(findstring nvcc,$(GPUCC)),nvcc) - GPUFLAGS += -Xcompiler -fPIC -c -x cu -else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += -fPIC -c -endif - # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -260,20 +253,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index d9af210552..80032e528b 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -1,37 +1,21 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 -#include "GpuRuntime.h" // Includes the GPU abstraction - // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 -// Is this a GPU (CUDA, HIP) or CPU implementation? -#ifdef __CUDACC__ -#define MGONGPUCPP_GPUIMPL cuda -#elif defined __HIPCC__ -#define MGONGPUCPP_GPUIMPL hip -#include "hip/hip_runtime.h" -#else -#undef MGONGPUCPP_GPUIMPL -#endif - // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) -#if defined __HIPCC__ -#define MGONGPU_HAS_NO_CURAND 1 -#else +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -39,7 +23,6 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif -#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. 
-DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -71,28 +54,23 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#ifndef __CUDACC__ +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) +#endif + +// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) -#elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#else -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default in CUDA +#undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 -#else -#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -108,21 +86,17 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (CUDA complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA -#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA -#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA +// SANITY CHECKS (c++ complex number implementation) +#ifndef __CUDACC__ +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL #endif #endif -// SANITY CHECKS (C++ complex number implementation) -#ifndef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ +// SANITY CHECKS (cuda complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif #endif @@ -160,7 +134,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non 
aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -171,7 +145,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD +#ifdef __CUDACC__ // CUDA implementation has no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -201,9 +175,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -215,8 +189,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA/HIP declaration specifiers for C++ -#ifndef MGONGPUCPP_GPUIMPL +// Define empty CUDA declaration specifiers for C++ +#ifndef __CUDACC__ #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h index 5532e22fa1..ca9a9f00c0 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
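NB: the mgOnGpuCxtypes.h hunks below map the MGONGPU_*CXTYPE macros chosen in mgOnGpuConfig.h onto a single cxtype alias. A sketch of that selection with the template arguments written out, assuming fptype is the double/float alias set by MGONGPU_FPTYPE (the generated header may differ in detail, e.g. in its cuComplex branch, which is omitted here):

    // Sketch: one complex type per backend/macro choice.
    #ifdef __CUDACC__ // CUDA
    #if defined MGONGPU_CUCXTYPE_THRUST
    typedef thrust::complex<fptype> cxtype;
    #elif defined MGONGPU_CUCXTYPE_CXSMPL
    typedef mgOnGpu::cxsmpl<fptype> cxtype;
    #endif
    #else // C++
    #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX
    typedef std::complex<fptype> cxtype;
    #else
    typedef mgOnGpu::cxsmpl<fptype> cxtype;
    #endif
    #endif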
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h index 83a46c1d4e..905c97d700 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h index dd8b83752d..e1299ba81e 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h @@ -9,8 +9,6 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuFptypes.h" -#include "GpuAbstraction.h" - #include //========================================================================== @@ -34,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -133,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) const int neppV = 1; @@ -155,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Printout to stream for user defined types @@ -807,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //------------------------------ // Vector types - CUDA @@ -855,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ 
//==========================================================================

 // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 typedef bool bool_sv;
 typedef fptype fptype_sv;
 typedef fptype2 fptype2_sv;
@@ -881,7 +879,7 @@ namespace mg5amcCpu
 #endif

 // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++
-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifdef __CUDACC__ /* clang-format off */
 inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); }
 #elif defined MGONGPU_CPPSIMD
 inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000
diff --git a/epochX/cudacpp/gg_tt.mad/src/rambo.h b/epochX/cudacpp/gg_tt.mad/src/rambo.h
index cd7e1008ea..e02ea52496 100644
--- a/epochX/cudacpp/gg_tt.mad/src/rambo.h
+++ b/epochX/cudacpp/gg_tt.mad/src/rambo.h
@@ -4,7 +4,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================

 #include "mgOnGpuConfig.h"
@@ -18,7 +18,7 @@
 #include

 // Simplified rambo version for 2 to N (with N>=2) processes with massless particles
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -83,7 +83,7 @@ namespace mg5amcCpu
 static bool first = true;
 if( first )
 {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 if constexpr( M_ACCESS::isOnDevice() ) // avoid
 {
 const int ievt0 = 0;
@@ -166,7 +166,7 @@ namespace mg5amcCpu
 wt = po2log;
 if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1];
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
 // issue warnings if weight is too small or too large
 static int iwarn[5] = { 0, 0, 0, 0, 0 };
 if( wt < -180. )

From e3ca5d90cc8cd077629975e3f5da3e6df9fb7c35 Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Wed, 24 Jan 2024 15:46:29 +0100
Subject: [PATCH 487/509] [jthip24] regenerate 5 processes sa and 5 processes
 mad

I am working now in branch valassi/jthip24.

This is my first commit over the joorgen/gpu_abstraction branch (also known as valassi/jthip), as of commit 229ffeb0cda3b224a5d721b01b9bf8dd1a7fd8b5 (Tue Aug 15 11:33:01 2023 +0200).

This branch contains fewer features than joorgen/master (PR #718), but it is more advanced than joorgen/gpu_abstraction_only (PR #774). I will probably need some of the commits here to fix PR #774 in branch valassi/jt774.

I regenerate the ten processes as follows:
./CODEGEN/generateAndCompare.sh ee_mumu --mad
./CODEGEN/generateAndCompare.sh gg_tt --mad
./CODEGEN/generateAndCompare.sh gg_ttg --mad
./CODEGEN/generateAndCompare.sh gg_ttgg --mad
./CODEGEN/generateAndCompare.sh gg_ttggg --mad
./CODEGEN/generateAndCompare.sh ee_mumu
./CODEGEN/generateAndCompare.sh gg_tt
./CODEGEN/generateAndCompare.sh gg_ttg
./CODEGEN/generateAndCompare.sh gg_ttgg
./CODEGEN/generateAndCompare.sh gg_ttggg

These are the five processes that would get conflicts when I merge upstream/master here, so I guess that these ten output directories (the five .mad plus the five .sa) are the only ones touched in this branch.
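For the record, the ten commands above are equivalent to the following loop (a sketch only, assuming a POSIX shell run from the directory containing CODEGEN; the --mad runs produce the <proc>.mad madevent outputs, the plain runs produce the <proc>.sa standalone_cudacpp outputs):

for proc in ee_mumu gg_tt gg_ttg gg_ttgg gg_ttggg; do
  ./CODEGEN/generateAndCompare.sh ${proc} --mad   # madevent output (<proc>.mad)
  ./CODEGEN/generateAndCompare.sh ${proc}         # standalone_cudacpp output (<proc>.sa)
done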
The fact that I can regenerate them and there are no real differences (except for irrelevant stuff like me5_configuration.txt, aloha_file.inc, py3_model.pkl) shows that ALL IMPORTANT CHANGES BY JORGEN HERE ARE IN THE CODEGEN. I can therefore merge upstream/master, fix conflicts in CODEGEN and regenerate. --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 48 ++++++++-------- .../ee_mumu.mad/Cards/me5_configuration.txt | 4 +- .../ee_mumu.mad/Source/DHELAS/aloha_file.inc | 2 +- .../bin/internal/ufomodel/py3_model.pkl | Bin 42813 -> 42822 bytes .../CODEGEN_cudacpp_ee_mumu_log.txt | 40 +++++++------- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 51 ++++++++--------- .../gg_tt.mad/Cards/me5_configuration.txt | 4 +- .../gg_tt.mad/Source/DHELAS/aloha_file.inc | 2 +- .../bin/internal/ufomodel/py3_model.pkl | Bin 42813 -> 42822 bytes .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 40 +++++++------- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 50 ++++++++--------- .../gg_ttg.mad/Cards/me5_configuration.txt | 4 +- .../gg_ttg.mad/Source/DHELAS/aloha_file.inc | 2 +- .../bin/internal/ufomodel/py3_model.pkl | Bin 42813 -> 42822 bytes .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 40 +++++++------- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 50 ++++++++--------- .../gg_ttgg.mad/Cards/me5_configuration.txt | 4 +- .../gg_ttgg.mad/Source/DHELAS/aloha_file.inc | 2 +- .../bin/internal/ufomodel/py3_model.pkl | Bin 42813 -> 42822 bytes .../CODEGEN_cudacpp_gg_ttgg_log.txt | 40 +++++++------- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 52 +++++++++--------- .../gg_ttggg.mad/Cards/me5_configuration.txt | 4 +- .../gg_ttggg.mad/Source/DHELAS/aloha_file.inc | 2 +- .../bin/internal/ufomodel/py3_model.pkl | Bin 42813 -> 42822 bytes .../CODEGEN_cudacpp_gg_ttggg_log.txt | 40 +++++++------- 25 files changed, 241 insertions(+), 240 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index 4cc74544a8..92778f7ec9 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -52,8 +52,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt -import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu.mg +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0033915042877197266  +DEBUG: model prefixing takes 0.005407810211181641  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,7 +154,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.003 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output madevent CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -165,10 +165,10 @@ Load PLUGIN.CUDACPP_SA_OUTPUT INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu  -INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu -WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/Cards  -WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/SubProcesses  +WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu  +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu +WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/Cards  +WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -176,7 +176,7 @@ INFO: Creating files in directory P1_epem_mupmum DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6179]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -201,7 +201,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_epem_mupmum.txt [model_handling.py at line 1335]  +DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_epem_mupmum.txt [model_handling.py at line 1335]  DEBUG: proc_id =  1 [export_cpp.py at line 710]  DEBUG: config_map =  [1, 2] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  @@ -210,19 +210,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.221 s +Wrote files for 8 helas calls in 0.098 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.138 s +ALOHA: aloha creates 3 routines in 0.202 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.183 s +ALOHA: aloha creates 7 routines in 0.256 s FFV1 FFV1 FFV2 @@ -231,29 +231,29 @@ ALOHA: aloha creates 7 routines in 0.183 s FFV4 FFV2_4 FFV2_4 -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/src/. +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/src/. and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/src/. +INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -Output to directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu done. +Output to directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu done. 
Type "launch" to generate events from this process, or see -/afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_ee_mumu/README +/data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. quit -real 0m4.967s -user 0m1.195s -sys 0m1.770s +real 0m1.873s +user 0m1.636s +sys 0m0.197s diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt index 9e9ed9d752..00d7c6f8d6 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt @@ -234,7 +234,7 @@ # pineappl = pineappl -#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo +#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo +#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo diff --git a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc index e58e08d7bd..738db319fd 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV1_0.o FFV1P0_3.o FFV2_0.o FFV2_3.o FFV4_0.o FFV4_3.o +ALOHARoutine = FFV1_0.o FFV4_3.o FFV1P0_3.o FFV2_0.o FFV4_0.o FFV2_3.o diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model.pkl b/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model.pkl index b6989c1453094d7f45cf2ee4b2124efa29e9064b..f71ba45bbc6d4acc8d32bb06662fe900a694009f 100644 GIT binary patch delta 68 zcmdmcj_KGrrVZZ9)Uy~E81z#TOA_@H%Mx=Ei;FY$-2+0642+EReceqHeVz5wGZM>m YCuc5GV>FyRVVM+*g|30czi?a2z^s`D*Gt>1a7cEnhH;B*4 kk1tD2E>YJ}P)|uLNmaK}P&YC#GS)RP(>1W1ynR_e0E4X?GXMYp diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 97a40cef65..bc2b0f8499 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -52,8 +52,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt -import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu.mg +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.003403902053833008  +DEBUG: model prefixing takes 0.0053980350494384766  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,7 +154,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.003 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output standalone_cudacpp CODEGEN_cudacpp_ee_mumu Load PLUGIN.CUDACPP_SA_OUTPUT @@ -162,7 +162,7 @@ Load PLUGIN.CUDACPP_SA_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -172,12 +172,12 @@ INFO: Processing color information for process: e+ e- > mu+ mu- @1 DEBUG: type(me)= me=0 [output.py at line 189]  DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: proc_id =  0 [model_handling.py at line 1045]  -INFO: Creating files in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum +INFO: Creating files in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is not yet defined: this is standalone_cudacpp mode [model_handling.py at line 1301]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]  DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]  DEBUG: self.include_multi_channel =  False [model_handling.py at line 1144]  @@ -186,7 +186,7 @@ FileWriter fo DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1163]  DEBUG: multi_channel_map =  None [model_handling.py at line 1654]  DEBUG: diag_to_config =  {} [model_handling.py at line 1711]  -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]  @@ -194,15 +194,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/wor DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_epem_mupmum.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s +DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_epem_mupmum.txt [model_handling.py at line 1335]  +Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.194 s +ALOHA: aloha creates 4 routines in 0.265 s FFV1 FFV1 FFV2 @@ -211,20 +211,20 @@ ALOHA: aloha creates 4 routines in 0.194 s FFV4 FFV2_4 FFV2_4 -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. 
+INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.880s -user 0m0.446s -sys 0m0.188s +real 0m0.698s +user 0m0.628s +sys 0m0.058s diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index a7569ca528..5d9cc8971f 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -2,6 +2,7 @@ This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) Running MG5 in debug mode +('WARNING: loading of madgraph too slow!!!', 0.6495707035064697) ************************************************************ * * * W E L C O M E to * @@ -52,8 +53,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt -import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt.mg +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +63,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.003465414047241211  +DEBUG: model prefixing takes 0.005490541458129883  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +156,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.006 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams output madevent CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -166,10 +167,10 @@ Load PLUGIN.CUDACPP_SA_OUTPUT INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt  -INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt -WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/Cards  -WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/SubProcesses  +WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt  +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt +WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/Cards  +WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -177,7 +178,7 @@ INFO: Creating files in directory P1_gg_ttx DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6179]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -206,30 +207,30 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttx.txt [model_handling.py at line 1335]  +DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttx.txt [model_handling.py at line 1335]  DEBUG: proc_id =  1 [export_cpp.py at line 710]  DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s -Wrote files for 10 helas calls in 0.254 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +Wrote files for 10 helas calls in 0.119 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.103 s +ALOHA: aloha creates 2 routines in 0.422 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.096 s +ALOHA: aloha creates 4 routines in 0.130 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/src/. +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) @@ -238,22 +239,22 @@ super_write_set_parameters_onlyfixMajorana (hardcoded=True) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , keys size = 2 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , keys size = 2 [model_handling.py at line 729]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/src/./Parameters_sm.h -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/src/. and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/src/. +INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -Output to directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt done. +Output to directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt done. Type "launch" to generate events from this process, or see -/afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_tt/README +/data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. 
quit -real 0m4.770s -user 0m1.113s -sys 0m1.622s +real 0m3.274s +user 0m1.816s +sys 0m0.258s diff --git a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt index 9e9ed9d752..00d7c6f8d6 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt @@ -234,7 +234,7 @@ # pineappl = pineappl -#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo +#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo +#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo diff --git a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc index eaf8cc8601..5597c614b0 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = VVV1P0_1.o FFV1_0.o FFV1_1.o FFV1_2.o +ALOHARoutine = FFV1_1.o FFV1_0.o FFV1_2.o VVV1P0_1.o diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model.pkl b/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model.pkl index b6989c1453094d7f45cf2ee4b2124efa29e9064b..f71ba45bbc6d4acc8d32bb06662fe900a694009f 100644 GIT binary patch delta 68 zcmdmcj_KGrrVZZ9)Uy~E81z#TOA_@H%Mx=Ei;FY$-2+0642+EReceqHeVz5wGZM>m YCuc5GV>FyRVVM+*g|30czi?a2z^s`D*Gt>1a7cEnhH;B*4 kk1tD2E>YJ}P)|uLNmaK}P&YC#GS)RP(>1W1ynR_e0E4X?GXMYp diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 6ed61780de..ed49bfc186 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -52,8 +52,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt -import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt.mg +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0033271312713623047  +DEBUG: model prefixing takes 0.0055599212646484375  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.006 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_SA_OUTPUT @@ -163,7 +163,7 @@ Load PLUGIN.CUDACPP_SA_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -173,12 +173,12 @@ INFO: Processing color information for process: g g > t t~ @1 DEBUG: type(me)= me=0 [output.py at line 189]  DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: proc_id =  0 [model_handling.py at line 1045]  -INFO: Creating files in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx +INFO: Creating files in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is not yet defined: this is standalone_cudacpp mode [model_handling.py at line 1301]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]  DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]  DEBUG: self.include_multi_channel =  False [model_handling.py at line 1144]  @@ -191,7 +191,7 @@ FileWriter fo DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]  DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]  -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]  @@ -199,19 +199,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/wor DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttx.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s +DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttx.txt [model_handling.py at line 1335]  +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.107 s +ALOHA: aloha creates 2 routines in 0.142 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) @@ -220,13 +220,13 @@ super_write_set_parameters_onlyfixMajorana (hardcoded=True) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , keys size = 2 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , keys size = 2 [model_handling.py at line 729]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. 
and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. +INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.819s -user 0m0.408s -sys 0m0.189s +real 0m0.594s +user 0m0.537s +sys 0m0.048s diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 95b0d8df3c..d22ecfb1e2 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -52,8 +52,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt -import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg.mg +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.003412008285522461  +DEBUG: model prefixing takes 0.005263328552246094  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.015 s +1 processes with 16 diagrams generated in 0.022 s Total: 1 processes with 16 diagrams output madevent CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -166,10 +166,10 @@ Load PLUGIN.CUDACPP_SA_OUTPUT INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg  -INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg -WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/Cards  -WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/SubProcesses  +WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg  +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg +WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/Cards  +WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6179]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -208,22 +208,22 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxg.txt [model_handling.py at line 1335]  +DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxg.txt [model_handling.py at line 1335]  DEBUG: proc_id =  1 [export_cpp.py at line 710]  DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.028 s -Wrote files for 36 helas calls in 0.286 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s +Wrote files for 36 helas calls in 0.162 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.227 s +ALOHA: aloha creates 5 routines in 0.324 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -231,7 +231,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.214 s +ALOHA: aloha creates 10 routines in 0.310 s VVV1 VVV1 FFV1 @@ -241,8 +241,8 @@ ALOHA: aloha creates 10 routines in 0.214 s VVVV1 VVVV3 VVVV4 -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/src/. +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) @@ -251,22 +251,22 @@ super_write_set_parameters_onlyfixMajorana (hardcoded=True) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/src/. and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/src/. +INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -Output to directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg done. +Output to directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see -/afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttg/README +/data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. 
quit -real 0m5.078s -user 0m1.397s -sys 0m1.624s +real 0m2.202s +user 0m1.971s +sys 0m0.182s diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt index 9e9ed9d752..00d7c6f8d6 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt @@ -234,7 +234,7 @@ # pineappl = pineappl -#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo +#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo +#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo diff --git a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc index 9d01a65cec..50c12b0804 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = VVV1_0.o VVV1P0_1.o FFV1_0.o FFV1_1.o FFV1_2.o FFV1P0_3.o VVVV1P0_1.o VVVV3P0_1.o VVVV4P0_1.o +ALOHARoutine = FFV1_1.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model.pkl b/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model.pkl index b6989c1453094d7f45cf2ee4b2124efa29e9064b..f71ba45bbc6d4acc8d32bb06662fe900a694009f 100644 GIT binary patch delta 68 zcmdmcj_KGrrVZZ9)Uy~E81z#TOA_@H%Mx=Ei;FY$-2+0642+EReceqHeVz5wGZM>m YCuc5GV>FyRVVM+*g|30czi?a2z^s`D*Gt>1a7cEnhH;B*4 kk1tD2E>YJ}P)|uLNmaK}P&YC#GS)RP(>1W1ynR_e0E4X?GXMYp diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 83c867432e..d710d27afd 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -52,8 +52,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt -import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg.mg +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0035054683685302734  +DEBUG: model prefixing takes 0.005392789840698242  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.016 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -163,7 +163,7 @@ Load PLUGIN.CUDACPP_SA_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -173,12 +173,12 @@ INFO: Processing color information for process: g g > t t~ g @1 DEBUG: type(me)= me=0 [output.py at line 189]  DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: proc_id =  0 [model_handling.py at line 1045]  -INFO: Creating files in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg +INFO: Creating files in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is not yet defined: this is standalone_cudacpp mode [model_handling.py at line 1301]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]  DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]  DEBUG: self.include_multi_channel =  False [model_handling.py at line 1144]  @@ -193,7 +193,7 @@ FileWriter fo DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]  DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1824]  -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]  @@ -201,8 +201,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/wor DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxg.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (16 diagrams) in 0.026 s +DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxg.txt [model_handling.py at line 1335]  +Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -210,7 +210,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.224 s +ALOHA: aloha creates 5 routines in 0.324 s VVV1 VVV1 FFV1 @@ -220,8 +220,8 @@ ALOHA: aloha creates 5 routines in 0.224 s VVVV1 VVVV3 VVVV4 -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) @@ -230,13 +230,13 @@ super_write_set_parameters_onlyfixMajorana (hardcoded=True) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. +INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.919s -user 0m0.552s -sys 0m0.150s +real 0m0.944s +user 0m0.766s +sys 0m0.068s diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 6e1e63fd2d..24d824ed19 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -52,8 +52,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt -import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg.mg +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0033049583435058594  +DEBUG: model prefixing takes 0.005574703216552734  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.111 s +1 processes with 123 diagrams generated in 0.157 s Total: 1 processes with 123 diagrams output madevent CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -166,10 +166,10 @@ Load PLUGIN.CUDACPP_SA_OUTPUT INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg  -INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg -WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/Cards  -WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/SubProcesses  +WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg  +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg +WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/Cards  +WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxgg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6179]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -210,22 +210,22 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxgg.txt [model_handling.py at line 1335]  +DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxgg.txt [model_handling.py at line 1335]  DEBUG: proc_id =  1 [export_cpp.py at line 710]  DEBUG: config_map =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.288 s -Wrote files for 222 helas calls in 0.668 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.424 s +Wrote files for 222 helas calls in 0.735 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.228 s +ALOHA: aloha creates 5 routines in 0.332 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.213 s +ALOHA: aloha creates 10 routines in 0.315 s VVV1 VVV1 FFV1 @@ -246,8 +246,8 @@ ALOHA: aloha creates 10 routines in 0.213 s VVVV3 VVVV4 VVVV4 -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/src/. +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) @@ -256,22 +256,22 @@ super_write_set_parameters_onlyfixMajorana (hardcoded=True) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/src/. and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/src/. +INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -Output to directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg done. +Output to directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg done. Type "launch" to generate events from this process, or see -/afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttgg/README +/data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. 
quit -real 0m5.769s -user 0m2.121s -sys 0m1.610s +real 0m3.284s +user 0m3.067s +sys 0m0.205s diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt index 9e9ed9d752..00d7c6f8d6 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt @@ -234,7 +234,7 @@ # pineappl = pineappl -#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo +#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo +#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc index 1b5bf6ec54..ec923afd6d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = VVV1_0.o VVV1P0_1.o FFV1_0.o FFV1_1.o FFV1_2.o FFV1P0_3.o VVVV1_0.o VVVV1P0_1.o VVVV3_0.o VVVV3P0_1.o VVVV4_0.o VVVV4P0_1.o +ALOHARoutine = FFV1_1.o VVVV4_0.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3_0.o VVVV1_0.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model.pkl b/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model.pkl index b6989c1453094d7f45cf2ee4b2124efa29e9064b..f71ba45bbc6d4acc8d32bb06662fe900a694009f 100644 GIT binary patch delta 68 zcmdmcj_KGrrVZZ9)Uy~E81z#TOA_@H%Mx=Ei;FY$-2+0642+EReceqHeVz5wGZM>m YCuc5GV>FyRVVM+*g|30czi?a2z^s`D*Gt>1a7cEnhH;B*4 kk1tD2E>YJ}P)|uLNmaK}P&YC#GS)RP(>1W1ynR_e0E4X?GXMYp diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index d385556495..b06fbec52e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -52,8 +52,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt -import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg.mg +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.003401517868041992  +DEBUG: model prefixing takes 0.005280971527099609  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.112 s +1 processes with 123 diagrams generated in 0.156 s Total: 1 processes with 123 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -163,7 +163,7 @@ Load PLUGIN.CUDACPP_SA_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -173,12 +173,12 @@ INFO: Processing color information for process: g g > t t~ g g @1 DEBUG: type(me)= me=0 [output.py at line 189]  DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: proc_id =  0 [model_handling.py at line 1045]  -INFO: Creating files in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg +INFO: Creating files in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is not yet defined: this is standalone_cudacpp mode [model_handling.py at line 1301]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]  DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]  DEBUG: self.include_multi_channel =  False [model_handling.py at line 1144]  @@ -195,7 +195,7 @@ FileWriter fo DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1824]  DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1824]  -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]  @@ -203,8 +203,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/wor DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxgg.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.292 s +DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxgg.txt [model_handling.py at line 1335]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.436 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -212,7 +212,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.221 s +ALOHA: aloha creates 5 routines in 0.313 s VVV1 VVV1 FFV1 @@ -225,8 +225,8 @@ ALOHA: aloha creates 5 routines in 0.221 s VVVV3 VVVV4 VVVV4 -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) @@ -235,13 +235,13 @@ super_write_set_parameters_onlyfixMajorana (hardcoded=True) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. +INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m1.403s -user 0m0.994s -sys 0m0.177s +real 0m1.504s +user 0m1.444s +sys 0m0.051s diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 6e08f68b96..71cf69851d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -52,8 +52,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt -import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg.mg +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004005908966064453  +DEBUG: model prefixing takes 0.0058345794677734375  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.307 s +1 processes with 1240 diagrams generated in 1.894 s Total: 1 processes with 1240 diagrams output madevent CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -166,20 +166,20 @@ Load PLUGIN.CUDACPP_SA_OUTPUT INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg  -INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg -WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/Cards  -WARNING: File exists /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/SubProcesses  +WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg  +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg +WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/Cards  +WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1592 term in 21s. Introduce 2768 contraction +INFO: Color-Flow passed to 1592 term in 36s. Introduce 2768 contraction DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6179]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -214,22 +214,22 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxggg.txt [model_handling.py at line 1335]  +DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxggg.txt [model_handling.py at line 1335]  DEBUG: proc_id =  1 [export_cpp.py at line 710]  DEBUG: config_map =  [1, 2, 0, 3, 4, 0, 5, 6, 0, 0, 0, 0, 0, 7, 8, 9, 0, 10, 11, 12, 0, 13, 14, 15, 0, 16, 17, 18, 19, 20, 21, 0, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 0, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 0, 67, 68, 69, 70, 71, 72, 73, 74, 75, 0, 76, 77, 78, 79, 80, 81, 82, 83, 84, 0, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 0, 0, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 0, 121, 122, 0, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 0, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 0, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 0, 197, 198, 199, 200, 201, 202, 0, 203, 204, 205, 206, 207, 208, 0, 209, 210, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 0, 226, 227, 0, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 0, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 0, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 0, 302, 303, 304, 305, 306, 307, 0, 308, 309, 310, 311, 312, 313, 0, 314, 315, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 316, 317, 318, 319, 320, 321, 0, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 0, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 0, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 0, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 0, 378, 379, 0, 380, 381, 0, 0, 0, 0, 0, 382, 383, 384, 385, 386, 387, 388, 389, 390, 0, 391, 392, 393, 394, 395, 396, 397, 398, 399, 0, 400, 401, 402, 403, 404, 405, 406, 407, 408, 0, 409, 410, 411, 412, 413, 414, 0, 415, 416, 417, 418, 419, 420, 0, 0, 0, 421, 422, 423, 424, 425, 426, 0, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 0, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 0, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 0, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 0, 483, 484, 0, 485, 486, 0, 0, 0, 0, 0, 487, 488, 489, 490, 491, 492, 493, 494, 495, 0, 496, 497, 498, 499, 500, 501, 502, 
503, 504, 0, 505, 506, 507, 508, 509, 510, 511, 512, 513, 0, 514, 515, 516, 517, 518, 519, 0, 520, 521, 522, 523, 524, 525, 0, 0, 0, 526, 527, 528, 529, 530, 531, 0, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 0, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 0, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 0, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 0, 588, 589, 0, 590, 591, 0, 0, 0, 0, 0, 592, 593, 594, 595, 596, 597, 598, 599, 600, 0, 601, 602, 603, 604, 605, 606, 607, 608, 609, 0, 610, 611, 612, 613, 614, 615, 616, 617, 618, 0, 619, 620, 621, 622, 623, 624, 0, 625, 626, 627, 628, 629, 630, 0, 0, 0, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 0, 664, 665, 666, 667, 668, 669, 0, 670, 671, 672, 673, 674, 675, 0, 0, 0, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 0, 709, 710, 711, 712, 713, 714, 0, 715, 716, 717, 718, 719, 720, 0, 0, 0, 721, 722, 0, 723, 724, 0, 725, 726, 0, 0, 0, 0, 0, 727, 728, 729, 730, 731, 732, 733, 734, 735, 0, 736, 737, 738, 739, 740, 741, 742, 743, 744, 0, 745, 746, 747, 748, 749, 750, 751, 752, 753, 0, 754, 755, 756, 757, 758, 759, 0, 760, 761, 762, 763, 764, 765, 766, 767, 0, 768, 769, 0, 770, 771, 0, 0, 0, 0, 0, 772, 773, 774, 775, 776, 777, 778, 779, 780, 0, 781, 782, 783, 784, 785, 786, 787, 788, 789, 0, 790, 791, 792, 793, 794, 795, 796, 797, 798, 0, 799, 800, 801, 802, 803, 804, 0, 805, 806, 807, 808, 809, 810, 811, 812, 0, 813, 814, 0, 815, 816, 0, 0, 0, 0, 0, 817, 818, 819, 820, 821, 822, 823, 824, 825, 0, 826, 827, 828, 829, 830, 831, 832, 833, 834, 0, 835, 836, 837, 838, 839, 840, 841, 842, 843, 0, 844, 845, 846, 847, 848, 849, 0, 850, 851, 852, 853, 854, 855, 856, 857, 0, 858, 859, 0, 860, 861, 0, 0, 0, 0, 862, 863, 0, 864, 865, 0, 866, 867, 0, 0, 0, 0, 868, 869, 0, 870, 871, 0, 872, 873, 0, 0, 0, 0, 0, 0, 0, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 0, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 0, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 0, 928, 929, 930, 931, 932, 933, 0, 934, 935, 936, 937, 938, 939, 0, 940, 941, 942, 943, 944, 945, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 4.314 s -Wrote files for 2281 helas calls in 28.906 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.556 s +Wrote files for 2281 helas calls in 46.775 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha 
creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.218 s +ALOHA: aloha creates 5 routines in 0.315 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -237,7 +237,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.213 s +ALOHA: aloha creates 10 routines in 0.312 s VVV1 VVV1 FFV1 @@ -250,8 +250,8 @@ ALOHA: aloha creates 10 routines in 0.213 s VVVV3 VVVV4 VVVV4 -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/src/. +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) @@ -260,22 +260,22 @@ super_write_set_parameters_onlyfixMajorana (hardcoded=True) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/src/. and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/src/. +INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -Output to directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg done. +Output to directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg done. 
Type "launch" to generate events from this process, or see -/afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_mad_gg_ttggg/README +/data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m39.678s -user 0m34.707s -sys 0m2.723s +real 0m57.415s +user 0m56.319s +sys 0m0.892s diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt index 9e9ed9d752..00d7c6f8d6 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt @@ -234,7 +234,7 @@ # pineappl = pineappl -#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo +#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /afs/cern.ch/work/j/jteig/mg5amcnlo +#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc index 1b5bf6ec54..ec923afd6d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = VVV1_0.o VVV1P0_1.o FFV1_0.o FFV1_1.o FFV1_2.o FFV1P0_3.o VVVV1_0.o VVVV1P0_1.o VVVV3_0.o VVVV3P0_1.o VVVV4_0.o VVVV4P0_1.o +ALOHARoutine = FFV1_1.o VVVV4_0.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3_0.o VVVV1_0.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model.pkl b/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model.pkl index b6989c1453094d7f45cf2ee4b2124efa29e9064b..f71ba45bbc6d4acc8d32bb06662fe900a694009f 100644 GIT binary patch delta 68 zcmdmcj_KGrrVZZ9)Uy~E81z#TOA_@H%Mx=Ei;FY$-2+0642+EReceqHeVz5wGZM>m YCuc5GV>FyRVVM+*g|30czi?a2z^s`D*Gt>1a7cEnhH;B*4 kk1tD2E>YJ}P)|uLNmaK}P&YC#GS)RP(>1W1ynR_e0E4X?GXMYp diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index e089834b6e..8047308e05 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -52,8 +52,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt -import /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg.mg +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0033860206604003906  +DEBUG: model prefixing takes 0.005494117736816406  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.326 s +1 processes with 1240 diagrams generated in 1.886 s Total: 1 processes with 1240 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -163,7 +163,7 @@ Load PLUGIN.CUDACPP_SA_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -INFO: Creating subdirectories in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 @@ -173,12 +173,12 @@ INFO: Processing color information for process: g g > t t~ g g g @1 DEBUG: type(me)= me=0 [output.py at line 189]  DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: proc_id =  0 [model_handling.py at line 1045]  -INFO: Creating files in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg +INFO: Creating files in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is not yet defined: this is standalone_cudacpp mode [model_handling.py at line 1301]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]  DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]  DEBUG: self.include_multi_channel =  False [model_handling.py at line 1144]  @@ -197,7 +197,7 @@ FileWriter fo DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1824]  DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  DEBUG: ('ZERO', 6, 1, 6, 6) [model_handling.py at line 1824]  -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]  @@ -205,8 +205,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /afs/cern.ch/wor DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /afs/cern.ch/work/j/jteig/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxggg.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 4.400 s +DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxggg.txt [model_handling.py at line 1335]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.533 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -214,7 +214,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.302 s +ALOHA: aloha creates 5 routines in 0.342 s VVV1 VVV1 FFV1 @@ -227,8 +227,8 @@ ALOHA: aloha creates 5 routines in 0.302 s VVVV3 VVVV4 VVVV4 -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/. +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) @@ -237,13 +237,13 @@ super_write_set_parameters_onlyfixMajorana (hardcoded=True) DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h -FileWriter for /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/. and /afs/cern.ch/work/j/jteig/mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/. +INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m9.573s -user 0m8.590s -sys 0m0.331s +real 0m13.003s +user 0m12.866s +sys 0m0.085s From 7d0d43886ae84b9c44440999a722f2a733db700e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 16:06:11 +0100 Subject: [PATCH 488/509] [jthip24] (before merging upstream/master) remove CODEGEN #mgOnGpuFptypes.h# --- .../template_files/gpu/#mgOnGpuFptypes.h# | 101 ------------------ 1 file changed, 101 deletions(-) delete mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/#mgOnGpuFptypes.h# diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/#mgOnGpuFptypes.h# b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/#mgOnGpuFptypes.h# deleted file mode 100644 index e71fa26aec..0000000000 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/#mgOnGpuFptypes.h# +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
- -#ifndef MGONGPUFPTYPES_H -#define MGONGPUFPTYPES_H 1 - -#include "mgOnGpuConfig.h" - -#include -#include - -// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL // cuda -namespace mg5amcGpu -#else -namespace mg5amcCpu -#endif -{ - //========================================================================== - -#ifdef __CUDACC__ // cuda - - //------------------------------ - // Floating point types - Cuda - //------------------------------ - - /* - inline __host__ __device__ fptype - fpmax( const fptype& a, const fptype& b ) - { - return max( a, b ); - } - - inline __host__ __device__ fptype - fpmin( const fptype& a, const fptype& b ) - { - return min( a, b ); - } - */ - - inline __host__ __device__ const fptype& - fpmax( const fptype& a, const fptype& b ) - { - return ( ( b < a ) ? a : b ); - } - - inline __host__ __device__ const fptype& - fpmin( const fptype& a, const fptype& b ) - { - return ( ( a < b ) ? a : b ); - } - - inline __host__ __device__ fptype - fpsqrt( const fptype& f ) - { -#if defined MGONGPU_FPTYPE_FLOAT - // See https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html - return sqrtf( f ); -#else - // See https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__DOUBLE.html - return sqrt( f ); -#endif - } - -#endif // #ifdef MGONGPUCPP_GPUIMPL - - //========================================================================== - -#ifndef MGONGPUCPP_GPUIMPL - - //------------------------------ - // Floating point types - C++ - //------------------------------ - - inline const fptype& - fpmax( const fptype& a, const fptype& b ) - { - return std::max( a, b ); - } - - inline const fptype& - fpmin( const fptype& a, const fptype& b ) - { - return std::min( a, b ); - } - - inline fptype - fpsqrt( const fptype& f ) - { - return std::sqrt( f ); - } - -#endif // #ifndef MGONGPUCPP_GPUIMPL - - //========================================================================== - -} // end namespace mg5amcGpu/mg5amcCpu - -#endif // MGONGPUFPTYPES_H From 0089b29c91276b7ef8df3c98913404c319517e0b Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 16:16:03 +0100 Subject: [PATCH 489/509] [jthip24] (before merging upstream/master) improve logic of "if CUDA else HIP else neither" in CODEGEN cudacpp.mk (This is derived from this morning's commit d2e2f47a303a9b1c25805d96d04beb4f07b57575 in branch jt774) --- .../iolibs/template_files/gpu/cudacpp.mk | 226 +++++++++--------- 1 file changed, 109 insertions(+), 117 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 563f17c303..298d6e94b3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -29,7 +29,7 @@ UNAME_P := $(shell uname -p) #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here @@ -103,66 +103,74 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If CUDA_HOME is not set, try to set it from the location of nvcc - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %%bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif - - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - CUDATESTFLAGS = -lcuda - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + override HIP_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the path to nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %%bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %%bin/hipcc,%%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
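[Editor's note, not part of the patch] The FIXME block above fixes the detection precedence that the three options below implement: use CUDA if nvcc is found under CUDA_HOME, else HIP if hipcc is found under HIP_HOME, else fall back to a C++-only build; exporting an invalid CUDA_HOME is the documented way to force the HIP path. As a cross-check, the same three-way selection can be sketched in standalone C++ (purely illustrative: the getenv-based lookup and the selectGpuCompiler name are editorial assumptions, the makefile itself works on make variables):

// Editorial sketch of the three-way GPU compiler selection in cudacpp.mk,
// mirrored in standalone C++ for clarity. Assumption: the CUDA_HOME/HIP_HOME
// environment variables stand in for the make variables of the same name;
// an empty result means "GPUCC is unset".
#include <cstdlib>
#include <filesystem>
#include <iostream>
#include <string>

static std::string selectGpuCompiler()
{
  namespace fs = std::filesystem;
  const char* cudaHome = std::getenv( "CUDA_HOME" );
  const char* hipHome = std::getenv( "HIP_HOME" );
  // Option 1: CUDA exists -> use CUDA (an invalid CUDA_HOME falls through)
  if( cudaHome != nullptr && fs::exists( fs::path( cudaHome ) / "bin" / "nvcc" ) )
    return std::string( cudaHome ) + "/bin/nvcc";
  // Option 2: CUDA does not exist, HIP exists -> use HIP
  if( hipHome != nullptr && fs::exists( fs::path( hipHome ) / "bin" / "hipcc" ) )
    return std::string( hipHome ) + "/bin/hipcc";
  // Option 3: neither exists -> switch off both CUDA and HIP
  return "";
}

int main()
{
  const std::string gpucc = selectGpuCompiler();
  if( gpucc.empty() )
    std::cout << "no nvcc or hipcc found: C++-only build" << std::endl;
  else
    std::cout << "GPUCC=" << gpucc << std::endl;
  return 0;
}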
+ +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! + CUOPTFLAGS = -lineinfo + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) @@ -173,71 +181,55 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) GPUFLAGS += -allow-unsupported-compiler endif -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler +else ifneq ($(origin REQUIRE_CUDA),undefined) - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning HIP builds are not supported for multi-word CXX "$(CXX)") - override HIP_HOME=disabled - endif + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %%bin/hipcc,%%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif +#--- Option 2: CUDA does not exist, HIP exists -> use HIP - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + +else + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) export GPUCC export GPUFLAGS #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -254,7 +246,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -270,7 +262,7 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) GPUFLAGS+= -Xcompiler -mno-float128 endif @@ -356,7 +348,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) From f44a9c77344c1dd2f18c08e48715fe723a32e588 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 16:35:03 +0100 Subject: [PATCH 490/509] [jthip24] (after merging upstream/master) fix clang formatting in CODEGEN (code generation was failing clang formatting checks) --- .../madgraph/iolibs/template_files/gpu/check_sa.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index bf91f6da38..7cac5ab47b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -778,7 +778,7 @@ main( int argc, char** argv ) wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -788,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
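[Editor's note, not part of the patch] The two check_sa.cc hunks just above move the clang-format markers onto the #endif lines, so that the off/on pair brackets the whole preprocessor chain that builds wrkflwtxt and clang-format stops re-wrapping it. A minimal standalone illustration of the idiom (USE_DOUBLE and USE_FLOAT are hypothetical macros, not from the patch):

// Editorial illustration of the /* clang-format off */ ... /* clang-format on */
// idiom used in check_sa.cc: the trailing comments disable formatting for the
// bracketed region, so the string concatenations keep their hand alignment.
#include <iostream>
#include <string>

std::string precisionTag()
{
  std::string tag;
#if defined USE_DOUBLE /* clang-format off */
  tag += "DBL+";
#elif defined USE_FLOAT
  tag += "FLT+";
#else
  tag += "???+"; // fallback when neither hypothetical macro is defined
#endif /* clang-format on */
  return tag;
}

int main()
{
  std::cout << precisionTag() << std::endl;
  return 0;
}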
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX From c54c3a89ba352ac36a85e5b4750008c663a092ba Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 16:35:30 +0100 Subject: [PATCH 491/509] [jthip24] regenerate gg_tt.mad - the code builds for C++ and CUDA (NB: this code does not build instead in branch jt774 based on PR #774) --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 175 ++++++---- .../gg_tt.mad/Cards/me5_configuration.txt | 4 +- .../cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat | 6 +- epochX/cudacpp/gg_tt.mad/Cards/run_card.dat | 32 +- .../gg_tt.mad/Cards/run_card_default.dat | 26 +- epochX/cudacpp/gg_tt.mad/MGMEVersion.txt | 2 +- .../gg_tt.mad/Source/DHELAS/aloha_file.inc | 2 +- .../gg_tt.mad/Source/PDF/pdfwrap_lhapdf.f | 1 + epochX/cudacpp/gg_tt.mad/Source/make_opts | 17 +- epochX/cudacpp/gg_tt.mad/Source/makefile | 4 +- .../cudacpp/gg_tt.mad/Source/param_card.inc | 14 +- epochX/cudacpp/gg_tt.mad/Source/vector.inc | 3 +- .../cudacpp/gg_tt.mad/SubProcesses/Bridge.h | 14 +- .../gg_tt.mad/SubProcesses/MGVersion.txt | 2 +- .../gg_tt.mad/SubProcesses/MadgraphTest.h | 12 +- .../SubProcesses/MatrixElementKernels.cc | 9 +- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 170 ++++----- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 2 +- .../SubProcesses/P1_gg_ttx/auto_dsig.f | 2 +- .../SubProcesses/P1_gg_ttx/auto_dsig1.f | 23 +- .../SubProcesses/P1_gg_ttx/check_sa.cc | 87 +++-- .../SubProcesses/P1_gg_ttx/counters.cc | 18 +- .../SubProcesses/P1_gg_ttx/matrix1.f | 6 +- .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 273 ++++++++------- .../gg_tt.mad/SubProcesses/dummy_fct.f | 10 +- .../cudacpp/gg_tt.mad/SubProcesses/fbridge.cc | 6 +- epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f | 4 +- .../cudacpp/gg_tt.mad/SubProcesses/makefile | 19 +- .../cudacpp/gg_tt.mad/SubProcesses/runTest.cc | 4 + .../cudacpp/gg_tt.mad/SubProcesses/testxxx.cc | 3 +- epochX/cudacpp/gg_tt.mad/bin/generate_events | 22 +- .../gg_tt.mad/bin/internal/__init__.py | 1 + .../cudacpp/gg_tt.mad/bin/internal/banner.py | 326 ++++++++++++++---- .../bin/internal/check_param_card.py | 2 +- .../bin/internal/common_run_interface.py | 28 +- .../gg_tt.mad/bin/internal/extended_cmd.py | 8 +- .../gg_tt.mad/bin/internal/gen_ximprove.py | 15 +- .../gg_tt.mad/bin/internal/lhe_parser.py | 77 +++-- .../bin/internal/madevent_interface.py | 29 +- epochX/cudacpp/gg_tt.mad/bin/internal/misc.py | 2 +- .../gg_tt.mad/bin/internal/shower_card.py | 10 +- .../bin/internal/ufomodel/py3_model.pkl | Bin 42822 -> 0 bytes epochX/cudacpp/gg_tt.mad/bin/madevent | 26 +- epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h | 10 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h | 2 +- epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk | 11 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h | 15 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h | 6 + 49 files changed, 1005 insertions(+), 537 deletions(-) delete mode 100644 epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model.pkl diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 5d9cc8971f..5473d52ae9 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -2,7 +2,6 @@ This version is intended for development/beta testing and NOT for production. 
This version has not been fully tested (if at all) and might have limited user support (if at all) Running MG5 in debug mode -('WARNING: loading of madgraph too slow!!!', 0.6495707035064697) ************************************************************ * * * W E L C O M E to * @@ -15,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -54,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt.mg +import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -63,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005490541458129883  +DEBUG: model prefixing takes 0.005357503890991211  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -158,103 +157,147 @@ INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams 1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams -output madevent CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp -Load PLUGIN.CUDACPP_SA_OUTPUT -Addition matrix-element will be done with PLUGIN: CUDACPP_SA_OUTPUT -Output will be done with PLUGIN: CUDACPP_SA_OUTPUT +output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp +Load PLUGIN.CUDACPP_OUTPUT +Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT +Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/Cards  -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt +WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  +WARNING: File exists 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx -DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  -DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6179]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . -DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  -DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  -FileWriter for ././CPPProcess.h -DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]  -FileWriter for ././CPPProcess.cc -DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]  -DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]  -DEBUG: self.include_multi_channel =  [1, 2, 3] [model_handling.py at line 1144]  -DEBUG: self.support_multichannel =  True [model_handling.py at line 1145]  -DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1162]  -DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 3] [model_handling.py at line 1163]  -DEBUG: multi_channel =  {1: [0], 2: [1], 3: [2]} [model_handling.py at line 1169]  -DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [2]} [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1711]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]  +FileWriter for ././CPPProcess.h +FileWriter for ././CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
-DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_processidfile [model_handling.py at line 1389]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_coloramps [model_handling.py at line 1401]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttx.txt [model_handling.py at line 1335]  DEBUG: proc_id =  1 [export_cpp.py at line 710]  DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.119 s +Wrote files for 10 helas calls in 0.101 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.422 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  +ALOHA: aloha creates 2 routines in 0.144 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.130 s +ALOHA: aloha creates 4 routines in 0.133 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/src/. +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , keys size = 2 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , keys size = 2 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , keys size = 2 [model_handling.py at line 729]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/src/. +INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -Output to directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt done. +DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +patching file Source/genps.inc +patching file Source/makefile +patching file SubProcesses/makefile +patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). +patching file bin/internal/madevent_interface.py +DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +patching file auto_dsig1.f +patching file driver.f +patching file matrix1.f +DEBUG: p.returncode =  0 [output.py at line 237]  +Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. 
Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt/README +/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m3.274s -user 0m1.816s -sys 0m0.258s +real 0m1.689s +user 0m1.475s +sys 0m0.213s +Code generation completed in 2 seconds +************************************************************ +* * +* W E L C O M E to * +* M A D G R A P H 5 _ a M C @ N L O * +* M A D E V E N T * +* * +* * * * +* * * * * * +* * * * * 5 * * * * * +* * * * * * +* * * * +* * +* VERSION 3.5.2_lo_vect * +* * +* The MadGraph5_aMC@NLO Development Team - Find us at * +* https://server06.fynu.ucl.ac.be/projects/madgraph * +* * +* Type 'help' for in-line help. * +* * +************************************************************ +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +treatcards run +quit +INFO: +launch in debug mode +************************************************************ +* * +* W E L C O M E to * +* M A D G R A P H 5 _ a M C @ N L O * +* M A D E V E N T * +* * +* * * * +* * * * * * +* * * * * 5 * * * * * +* * * * * * +* * * * +* * +* VERSION 3.5.2_lo_vect * +* * +* The MadGraph5_aMC@NLO Development Team - Find us at * +* https://server06.fynu.ucl.ac.be/projects/madgraph * +* * +* Type 'help' for in-line help. * +* * +************************************************************ +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt +treatcards param +quit +INFO: +launch in debug mode diff --git a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt index 00d7c6f8d6..cdeedc7863 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt @@ -234,7 +234,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo +#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo +#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat index 7f331650ba..cf111e2e6d 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * @@ -45,5 +45,5 @@ define l+ = e+ mu+ define l- = e- mu- define vl = ve vm vt define vl~ = ve~ vm~ vt~ -output madevent CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=\ -16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --ve\ +ctor_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat index 80d07f317e..a6463dc262 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat @@ -80,7 +80,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! 
size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -140,12 +156,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat index 48d84c73f0..27e990a016 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat @@ -80,7 +80,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! 
size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -139,3 +155,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt b/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc index 5597c614b0..3a21194b00 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV1_1.o FFV1_0.o FFV1_2.o VVV1P0_1.o +ALOHARoutine = FFV1_0.o FFV1_1.o FFV1_2.o VVV1P0_1.o diff --git a/epochX/cudacpp/gg_tt.mad/Source/PDF/pdfwrap_lhapdf.f b/epochX/cudacpp/gg_tt.mad/Source/PDF/pdfwrap_lhapdf.f index 0be926e6cd..3f36905346 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/PDF/pdfwrap_lhapdf.f +++ b/epochX/cudacpp/gg_tt.mad/Source/PDF/pdfwrap_lhapdf.f @@ -5,6 +5,7 @@ SUBROUTINE PDFWRAP C INCLUDE 'pdf.inc' INCLUDE '../alfas.inc' + INCLUDE '../vector.inc' INCLUDE '../coupl.inc' REAL*8 ZMASS DATA ZMASS/91.188D0/ diff --git a/epochX/cudacpp/gg_tt.mad/Source/make_opts b/epochX/cudacpp/gg_tt.mad/Source/make_opts index bd3c24228d..e4b87ee6ad 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt.mad/Source/make_opts @@ -1,17 +1,12 @@ -pdlabel1= -pdlabel2= -lhapdf= -PYTHIA8_PATH=NotInstalled -MG5AMC_VERSION=3.5.0_lo_vect -GLOBAL_FLAG=-O3 -ffast-math -fbounds-check -ALOHA_FLAG= -MATRIX_FLAG= DEFAULT_CPP_COMPILER=g++ +DEFAULT_F2PY_COMPILER=f2py3 +DEFAULT_F_COMPILER=gfortran +GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= -STDLIB=-lstdc++ +MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime +PYTHIA8_PATH=NotInstalled STDLIB_FLAG= -DEFAULT_F_COMPILER=gfortran -DEFAULT_F2PY_COMPILER=f2py3 +STDLIB=-lstdc++ #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gg_tt.mad/Source/makefile b/epochX/cudacpp/gg_tt.mad/Source/makefile index dbe08b846e..00c73099a0 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/makefile +++ b/epochX/cudacpp/gg_tt.mad/Source/makefile @@ -136,5 +136,7 @@ cleanSource: clean: cleanSource for i in `ls -d ../SubProcesses/P*`; do cd $$i; make clean; cd -; done; -cleanall: cleanSource +cleanavx: + for i in `ls -d ../SubProcesses/P*`; do cd $$i; make cleanavxs; cd -; done; +cleanall: cleanSource # THIS IS THE ONE for i in `ls -d ../SubProcesses/P*`; do cd $$i; make cleanavxs; cd -; done; diff --git 
a/epochX/cudacpp/gg_tt.mad/Source/param_card.inc b/epochX/cudacpp/gg_tt.mad/Source/param_card.inc index 1fcfce55bb..081365c16b 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/param_card.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/param_card.inc @@ -1,15 +1,15 @@ - MDL_WZ = 2.441404D+00 - MDL_WW = 2.047600D+00 - MDL_WH = 6.382339D-03 - MDL_WT = 1.491500D+00 + MDL_MB = 4.700000D+00 + MDL_MT = 1.730000D+02 MDL_MTA = 1.777000D+00 MDL_MZ = 9.118800D+01 MDL_MH = 1.250000D+02 - MDL_MB = 4.700000D+00 - MDL_MT = 1.730000D+02 AEWM1 = 1.325070D+02 MDL_GF = 1.166390D-05 AS = 1.180000D-01 - MDL_YMTAU = 1.777000D+00 MDL_YMB = 4.700000D+00 MDL_YMT = 1.730000D+02 + MDL_YMTAU = 1.777000D+00 + MDL_WT = 1.491500D+00 + MDL_WZ = 2.441404D+00 + MDL_WW = 2.047600D+00 + MDL_WH = 6.382339D-03 diff --git a/epochX/cudacpp/gg_tt.mad/Source/vector.inc b/epochX/cudacpp/gg_tt.mad/Source/vector.inc index 92254c0f2a..863eebbc70 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/vector.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/vector.inc @@ -28,5 +28,4 @@ C BECAUSE IT DOES NOT GO THROUGH THE CPP PREPROCESSOR C (see https://github.com/madgraph5/madgraph4gpu/issues/458). C INTEGER VECSIZE_MEMMAX - PARAMETER (VECSIZE_MEMMAX=16384) ! NB: 16k events per GPU grid is the minimum required to fill a V100 GPU -c PARAMETER (VECSIZE_MEMMAX=32) ! NB: workaround for out-of-memory on Juwels: 32 is enough for no-CUDA builds (issue #498) + PARAMETER (VECSIZE_MEMMAX=16384) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index f37c972b24..89437b4c42 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -244,14 +245,21 @@ namespace mg5amcCpu } std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL - process.initProc( "../../Cards/param_card.dat" ); + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
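[Editor's note, not part of the patch] The Bridge constructor change that continues just below prepends "../" when ../../Cards/param_card.dat is not found relative to the working directory. A generalized standalone sketch of that upward search (resolveUpwards and maxLevels are illustrative names; the patch itself climbs exactly one level):

// Editorial sketch of the param_card.dat lookup added in Bridge.h, generalized
// to climb a bounded number of directory levels; the patch prepends "../" once.
#include <filesystem>
#include <iostream>
#include <optional>
#include <string>

std::optional<std::string> resolveUpwards( std::string path, const int maxLevels = 1 )
{
  for( int level = 0; level <= maxLevels; ++level )
  {
    if( std::filesystem::exists( path ) ) return path; // found at this level
    path = "../" + path; // retry one directory higher
  }
  return std::nullopt; // not found: let the caller report the error
}

int main()
{
  const auto card = resolveUpwards( "../../Cards/param_card.dat" );
  std::cout << ( card ? *card : std::string( "param_card.dat not found" ) ) << std::endl;
  return 0;
}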
+ CPPProcess process( /*verbose=*/false ); + std::string paramCard = "../../Cards/param_card.dat"; + if( !std::filesystem::exists( paramCard ) ) + { + paramCard = "../" + paramCard; + } + process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h index 176338151a..a64c05c26a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -215,19 +216,16 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) #endif constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) // Dump events to a new reference file? - constexpr bool dumpEvents = false; - std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; - while( dumpFileName.find( '/' ) != std::string::npos ) - { - dumpFileName.replace( dumpFileName.find( '/' ), 1, "_" ); - } + const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); + const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); + const std::string refFileName = testDriver->getRefFileName(); + const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); std::ofstream dumpFile; if( dumpEvents ) { dumpFile.open( dumpFileName, std::ios::trunc ); } // Read reference data - const std::string refFileName = testDriver->getRefFileName(); std::map referenceData; if( !dumpEvents ) { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..81699dfea9 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -112,10 +112,17 @@ namespace mg5amcCpu // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu bool ok = true; // this is just an assumption! const std::string tag = "arm neon (128bit as in SSE4.2)"; -#else +#elif defined( __x86_64__ ) || defined( __i386__ ) bool known = true; bool ok = __builtin_cpu_supports( "sse4.2" ); const std::string tag = "nehalem (SSE4.2)"; +#else // AV FIXME! Added by OM for Mac, should identify the correct __xxx__ flag that should be targeted + bool known = false; // __builtin_cpu_supports is not supported + // See https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions-Available-on-all-Configurations.html + // See https://stackoverflow.com/q/62783908 + // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu + bool ok = true; // this is just an assumption! 
+ const std::string tag = "arm neon (128bit as in SSE4.2)"; #endif #else bool known = true; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index deedd7ecbe..f20c229897 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -247,10 +247,10 @@ namespace mg5amcCpu ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 0., 0., w_fp[4] ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -261,10 +261,10 @@ namespace mg5amcCpu // *** DIAGRAM 2 OF 3 *** // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[4] ); + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -274,10 +274,10 @@ namespace mg5amcCpu // *** DIAGRAM 3 OF 3 *** // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[4] ); + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -646,12 +646,12 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - fptype allMEsLast = 0; + { /* clang-format on */ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid - allMEs[ievt] = 0; for( int ihel = 0; ihel < ncomb; ihel++ ) { + // NEW IMPLEMENTATION OF GETGOODHEL (#630): RESET THE RUNNING SUM OVER HELICITIES TO 0 BEFORE ADDING A NEW HELICITY + allMEs[ievt] = 0; // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -660,12 +660,11 @@ namespace 
mg5amcCpu #else calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); #endif - if( allMEs[ievt] != allMEsLast ) + if( allMEs[ievt] != 0 ) // NEW IMPLEMENTATION OF GETGOODHEL (#630): COMPARE EACH HELICITY CONTRIBUTION TO 0 { //if ( !isGoodHel[ihel] ) std::cout << "sigmaKin_getGoodHel ihel=" << ihel << " TRUE" << std::endl; isGoodHel[ihel] = true; } - allMEsLast = allMEs[ievt]; // running sum up to helicity ihel for event ievt } } #else @@ -684,19 +683,11 @@ namespace mg5amcCpu //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] // Allocate arrays at build time to contain at least 16 events (or at least neppV events if neppV>16, e.g. in future VPUs) constexpr int maxtry0 = std::max( 16, neppV ); // 16, but at least neppV (otherwise the npagV loop does not even start) - fptype allMEsLast[maxtry0] = { 0 }; // allocated at build time: maxtry0 must be a constexpr // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt1 was last observed for "mirror processes" in uux_ttx in the 270 branch (see issue #343 and PRs #360 and #396) + // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; - static_assert( nprocesses == 1, "Assume nprocesses == 1" ); - // process_id corresponds to the index of DSIG1 Fortran functions (must be 1 because cudacpp is unable to handle DSIG2) + static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter static_assert( process_id == 1, "Assume process_id == 1" ); } @@ -890,23 +886,26 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 - fptype targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + const unsigned int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + fptype targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } } #endif @@ -1001,57 +1000,60 @@ namespace mg5amcCpu #endif } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - const int channelIdC = 
channelId - 1; // coloramps.h uses the C array indexing starting at 0 // Event-by-event random choice of color #402 - fptype_sv targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } + const unsigned int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + fptype_sv targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = fptype_sv{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + fptype_sv targetamp2[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp2[icolC] = fptype_sv{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + } +#endif + for( int ieppV = 0; ieppV < neppV; ++ieppV ) + { + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { #if defined MGONGPU_CPPSIMD - const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); #endif - if( okcol ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( okcol ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + const int ievt2 = ievt00 + ieppV + neppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + { + allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #endif + } } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h 
b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 26a8ecb9f5..4a88a07226 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index fe184caddf..d80d770784 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index 3b24a9924c..9346ee4c6a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -39,6 +39,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C LOCAL VARIABLES C INTEGER I,ITYPE,LP,IPROC + DOUBLE PRECISION QSCALE DOUBLE PRECISION G1 DOUBLE PRECISION G2 DOUBLE PRECISION XPQ(-7:7),PD(0:MAXPROC) @@ -126,11 +127,24 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(1))).GE.1) THEN !LP=SIGN(1,LPP(IB(1))) - G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)),DSQRT(Q2FACT(IB(1)))) + IF (DSQRT(Q2FACT(IB(1))).EQ.0D0) THEN + QSCALE=0D0 + DO I=3,NEXTERNAL + QSCALE=QSCALE+DSQRT(MAX(0D0,(PP(0,I)+PP(3,I))*(PP(0,I) + $ -PP(3,I)))) + ENDDO + QSCALE=QSCALE/2D0 + ELSE + QSCALE=DSQRT(Q2FACT(IB(1))) + ENDIF + G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN !LP=SIGN(1,LPP(IB(2))) - G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)),DSQRT(Q2FACT(IB(2)))) + IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN + QSCALE=DSQRT(Q2FACT(IB(2))) + ENDIF + G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF PD(0) = 0D0 IPROC = 0 @@ -202,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -249,6 +263,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C C LOCAL VARIABLES C + DOUBLE PRECISION QSCALE INTEGER I,ITYPE,LP,IPROC DOUBLE PRECISION G1(VECSIZE_MEMMAX) DOUBLE PRECISION G2(VECSIZE_MEMMAX) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc index 1bad694d1c..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -29,7 +29,9 @@ #include #include +#include <cfenv> // for feenableexcept #include +#include <csignal> // for signal and SIGFPE #include #include #include @@ -74,6 +76,23 @@ usage( char* argv0, int ret = 1 ) return ret; } +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + inline void FPEhandler( int sig ) + { +#ifdef __CUDACC__ + std::cerr << "Floating Point Exception (GPU)" << std::endl; +#else + std::cerr << "Floating Point Exception (CPU)" << std::endl; +#endif + exit( 0 ); + } +} + int main( int argc, char** argv ) { @@ -84,6 +103,18 @@ main( int argc, char** argv ) using namespace mg5amcCpu; #endif + // Enable FPEs (test #701 and #733 - except on MacOS where feenableexcept is not defined #730) +#ifndef __APPLE__ + const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" ); + const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" ); + if( enableFPE ) + { + std::cout << "WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions" << std::endl; + feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 + signal( SIGFPE, FPEhandler ); + } +#endif + // DEFAULTS FOR COMMAND LINE ARGUMENTS bool verbose = false; bool debug = false; @@ -103,12 +134,14 @@ main( int argc, char** argv ) CurandHost = 1, CurandDevice = 2 }; -#ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU -#elif not defined MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#ifdef MGONGPU_HAS_NO_CURAND + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random +#elif defined __CUDACC__ + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
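
The CUDACPP_RUNTIME_ENABLEFPE logic above turns the old opt-out (CUDACPP_RUNTIME_DISABLEFPE, still visible in testxxx.cc below) into an opt-in: traps fire only if the variable is set to a non-empty value. Condensed into a self-contained program (an editor's sketch, not the patch itself; it assumes glibc, since feenableexcept is a GNU extension that does not exist on macOS):

#include <cfenv>   // for feenableexcept (GNU extension)
#include <csignal> // for signal and SIGFPE
#include <cstdlib>
#include <iostream>
#include <string>

static void FPEhandler( int /*sig*/ )
{
  std::cerr << "Floating Point Exception" << std::endl;
  std::exit( 0 );
}

int main()
{
  const char* enableFPEc = std::getenv( "CUDACPP_RUNTIME_ENABLEFPE" );
  const bool enableFPE = ( enableFPEc != nullptr ) && ( std::string( enableFPEc ) != "" );
  if( enableFPE )
  {
    feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW );
    std::signal( SIGFPE, FPEhandler );
  }
  volatile double zero = 0;
  std::cout << 1. / zero << std::endl; // prints 'inf' normally, traps into FPEhandler if enabled
  return 0;
}

Run as CUDACPP_RUNTIME_ENABLEFPE=1 ./a.out and the silent 'inf' becomes a trapped SIGFPE, which is the behaviour the debug tests rely on.
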
enum class RamboSamplingMode @@ -146,18 +179,20 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef __CUDACC__ - rndgen = RandomNumberMode::CurandDevice; +#ifndef __CUDACC__ + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); +#elif defined MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + rndgen = RandomNumberMode::CurandDevice; #endif } else if( arg == "--curhst" ) { -#ifndef MGONGPU_HAS_NO_CURAND - rndgen = RandomNumberMode::CurandHost; -#else +#ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); +#else + rndgen = RandomNumberMode::CurandHost; #endif } else if( arg == "--common" ) @@ -278,10 +313,10 @@ main( int argc, char** argv ) const std::string procKey = "0a ProcInit"; timermap.start( procKey ); - // Create a process object + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? (for instance, in bridge mode this will be called twice here?) CPPProcess process( verbose ); - - // Read param_card and set parameters process.initProc( "../../Cards/param_card.dat" ); const fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) //const fptype energy = 91.2; // Ecms = 91.2 GeV (Z peak) @@ -389,30 +424,26 @@ main( int argc, char** argv ) { prnk.reset( new CommonRandomNumberKernel( hstRndmom ) ); } -#ifndef MGONGPU_HAS_NO_CURAND else if( rndgen == RandomNumberMode::CurandHost ) { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! CurandHost is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#else const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif } -#ifdef __CUDACC__ else { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __CUDACC__ const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); - } #else - else - { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) - } + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif -#else - else - { - throw std::logic_error( "This application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) } -#endif // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -747,7 +778,7 @@ main( int argc, char** argv ) wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? 
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -757,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/counters.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/counters.cc index 71fa817036..3bbdec9387 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/counters.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/counters.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "timer.h" #define TIMERTYPE std::chrono::high_resolution_clock @@ -36,13 +36,10 @@ extern "C" static mgOnGpu::Timer program_timer; static float program_totaltime = 0; - static mgOnGpu::Timer matrix1_timer; - static float matrix1_totaltime = 0; static mgOnGpu::Timer smatrix1_timer; static float smatrix1_totaltime = 0; static mgOnGpu::Timer smatrix1multi_timer[nimplC]; static float smatrix1multi_totaltime[nimplC] = { 0 }; - static int matrix1_counter = 0; static int smatrix1_counter = 0; static int smatrix1multi_counter[nimplC] = { 0 }; @@ -52,19 +49,6 @@ extern "C" return; } - void counters_matrix1_start_() - { - matrix1_counter++; - matrix1_timer.Start(); - return; - } - - void counters_matrix1_stop_() - { - matrix1_totaltime += matrix1_timer.GetDuration(); - return; - } - void counters_smatrix1_start_() { smatrix1_counter++; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f index ef18aff221..0c2ce6ec40 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -301,7 +301,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -396,7 +396,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C ---------- C BEGIN CODE C ---------- - call counters_matrix1_start() IF (FIRST) THEN FIRST=.FALSE. 
IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO @@ -470,7 +469,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO - call counters_matrix1_stop() END SUBROUTINE PRINT_ZERO_AMP_1() diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 59a2c906eb..f2cfa349da 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -4,10 +4,13 @@ # Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) -#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories +#=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts +#=== NB: use 'override' to ensure that the value can not be modified from the outside +override CUDACPP_MAKEFILE := $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) +###$(info CUDACPP_MAKEFILE='$(CUDACPP_MAKEFILE)') -CUDACPP_MAKEFILE = $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) -CUDACPP_SRC_MAKEFILE = cudacpp_src.mk +#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories +override CUDACPP_SRC_MAKEFILE = cudacpp_src.mk #------------------------------------------------------------------------------- @@ -29,7 +32,17 @@ UNAME_P := $(shell uname -p) #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Include the common MG5aMC Makefile options + +# OM: this is crucial for MG5aMC flag consistency/documentation +# AV: temporarely comment this out because it breaks cudacpp builds +ifneq ($(wildcard ../../Source/make_opts),) +include ../../Source/make_opts +endif + +#------------------------------------------------------------------------------- + +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
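
For context on the counters.cc cleanup above (the fine-grained matrix1 timers are removed, presumably to keep timing overhead out of the innermost loop, leaving only the coarser smatrix1 counters): every surviving counter follows one idiom, a static timer/accumulator pair behind extern "C" start/stop entry points that Fortran can call. A stripped-down sketch of that idiom (an editor's illustration; this Timer is a stand-in for the real mgOnGpu::Timer from timer.h):

#include <chrono>

// Minimal stand-in for mgOnGpu::Timer: seconds elapsed between Start() and GetDuration().
class Timer
{
public:
  void Start() { m_start = std::chrono::high_resolution_clock::now(); }
  float GetDuration() const
  {
    return std::chrono::duration<float>( std::chrono::high_resolution_clock::now() - m_start ).count();
  }
private:
  std::chrono::high_resolution_clock::time_point m_start;
};

static Timer smatrix1_timer;
static float smatrix1_totaltime = 0;
static int smatrix1_counter = 0;

extern "C"
{
  // The trailing underscore matches gfortran's default name mangling, so the Fortran
  // side can simply 'call counters_smatrix1_start()' as matrix1.f does.
  void counters_smatrix1_start_()
  {
    smatrix1_counter++;
    smatrix1_timer.Start();
  }
  void counters_smatrix1_stop_()
  {
    smatrix1_totaltime += smatrix1_timer.GetDuration();
  }
}
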
OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here @@ -101,68 +114,85 @@ endif # Note: AR, CXX and FC are implicitly defined if not set externally # See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html -#------------------------------------------------------------------------------- - -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +CXXFLAGS += -mmacosx-version-min=11.3 +endif - # If CUDA_HOME is not set, try to set it from the location of nvcc - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif +#------------------------------------------------------------------------------- - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - CUDATESTFLAGS = -lcuda - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + override HIP_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the path to nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). 
+ # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + ifeq ($(RNDGEN),hasNoCurand) + CURANDLIBFLAGS= else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! endif + CUOPTFLAGS = -lineinfo + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -173,71 +203,55 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) GPUFLAGS += -allow-unsupported-compiler endif -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler +else ifneq ($(origin REQUIRE_CUDA),undefined) - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning HIP builds are not supported for multi-word CXX "$(CXX)") - override HIP_HOME=disabled - endif + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif +#--- Option 2: CUDA does not exist, HIP exists -> use HIP - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + +else + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) export GPUCC export GPUFLAGS #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -254,7 +268,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -270,7 +284,7 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) GPUFLAGS+= -Xcompiler -mno-float128 endif @@ -285,12 +299,14 @@ override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) -else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) -override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) # AV for Mac (Apple clang compiler) +else ifeq ($(UNAME_S),Darwin) # OM for Mac (any compiler) +override OMPFLAGS = # AV disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = -fopenmp # OM reenable OpenMP MT on Apple clang? 
(AV Oct 2023: this still fails in the CI) else -override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT (default before #575) +override OMPFLAGS = -fopenmp # enable OpenMP MT by default on all other platforms +###override OMPFLAGS = # disable OpenMP MT on all other platforms (default before #575) endif # Set the default AVX (vectorization) choice @@ -356,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -573,8 +589,9 @@ $(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) # Apply special build flags only to check_sa and CurandRandomNumberKernel (curand headers, #679) $(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gcheck_sa.o: CUFLAGS += $(CXXFLAGSCURAND) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gCurandRandomNumberKernel.o: CUFLAGS += $(CXXFLAGSCURAND) ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif @@ -772,12 +789,18 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif +# Use target gtestlibs to build only googletest +ifneq ($(GTESTLIBS),) +gtestlibs: $(GTESTLIBS) +endif + # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 $(GTESTLIBS): ifneq ($(shell which flock 2>/dev/null),) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi flock $(BUILDDIR)/.make_test.lock $(MAKE) -C $(TESTDIR) else - $(MAKE) -C $(TESTDIR) + if [ -d $(TESTDIR) ]; then $(MAKE) -C $(TESTDIR); fi endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/dummy_fct.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/dummy_fct.f index 076cf29d67..4f7a204b8f 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/dummy_fct.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/dummy_fct.f @@ -32,7 +32,7 @@ logical FUNCTION dummy_cuts(P) LOGICAL IS_A_NU(NEXTERNAL),IS_HEAVY(NEXTERNAL) logical do_cuts(nexternal) COMMON /TO_SPECISA/IS_A_J,IS_A_A,IS_A_L,IS_A_B,IS_A_NU,IS_HEAVY, - . IS_A_ONIUM, do_cuts + & IS_A_ONIUM, do_cuts dummy_cuts=.true. 
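
Looking back at the event-by-event colour choice reworked in CPPProcess.cc earlier in this patch: both the scalar and SIMD variants implement the same inverse-CDF draw over the per-colour weights jamp2, restricted to the colours allowed for the selected channel, and the new channelId != 0 guard exists precisely to avoid dividing by a zero cumulative sum (FPE #783). Stripped of the SIMD and multichannel machinery, the algorithm reduces to this sketch (an editor's illustration; chooseColor and allowed[] are invented names, the latter standing in for mgOnGpu::icolamp[channelIdC]):

#include <cassert>

// Pick a colour index in [1,ncolor] (Fortran convention) from per-colour weights,
// by building a running sum and drawing with a uniform random number rndcol in [0,1).
template <int ncolor>
int chooseColor( const double jamp2[ncolor], const bool allowed[ncolor], double rndcol )
{
  double targetamp[ncolor] = { 0 };
  for( int icolC = 0; icolC < ncolor; icolC++ )
  {
    targetamp[icolC] = ( icolC == 0 ? 0 : targetamp[icolC - 1] ); // cumulative sum
    if( allowed[icolC] ) targetamp[icolC] += jamp2[icolC];
  }
  assert( targetamp[ncolor - 1] > 0 ); // at least one allowed colour (else division by zero, cf #783)
  for( int icolC = 0; icolC < ncolor; icolC++ )
    if( rndcol < targetamp[icolC] / targetamp[ncolor - 1] ) return icolC + 1;
  return ncolor; // rndcol ~ 1 edge case
}

For instance, with jamp2 = {1., 1.} and both colours allowed, rndcol < 0.5 selects colour 1 and rndcol >= 0.5 selects colour 2, matching the Fortran [1,ncolor] convention noted in the patch comments.
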
@@ -118,15 +118,16 @@ double precision function user_dynamical_scale(P) C ************************************************************ -C default for the library implementing a dummt bias function +C default for the library implementing a dummy bias function C ************************************************************ subroutine bias_wgt_custom(p, original_weight, bias_weight) - implicit none + implicit none C C Parameters C include 'nexternal.inc' -C + +C C Arguments C double precision p(0:3, nexternal) @@ -161,3 +162,4 @@ subroutine bias_wgt_custom(p, original_weight, bias_weight) return end subroutine bias_wgt_custom + diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc index 2b956730d4..22ce3f5115 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc @@ -49,11 +49,7 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL GpuRuntime::setUp(); #endif - // Create a process object, read parm card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? - CPPProcess process( /*verbose=*/false ); - process.initProc( "../../Cards/param_card.dat" ); + // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran *ppbridge = new Bridge( *pnevtF, *pnparF, *pnp4F ); } diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile index 74db44d848..d572486c2e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile @@ -9,6 +9,12 @@ FFLAGS+= -cpp # Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) CXXFLAGS = -O3 -Wall -Wshadow -Wextra +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) override CXX:=ccache $(CXX) @@ -51,7 +57,7 @@ CUDACPP_MAKEFILE=cudacpp.mk CUDACPP_MAKEENV:=$(shell echo '$(.VARIABLES)' | tr " " "\n" | egrep "(USEBUILDDIR|AVX|FPTYPE|HELINL|HRDCOD)") ###$(info CUDACPP_MAKEENV=$(CUDACPP_MAKEENV)) ###$(info $(foreach v,$(CUDACPP_MAKEENV),$(v)="$($(v))")) -CUDACPP_BUILDDIR:=$(shell $(MAKE) $(foreach v,$(CUDACPP_MAKEENV),$(v)="$($(v))") -f $(CUDACPP_MAKEFILE) -pn |& awk '/Building/{print $$3}' | sed s/BUILDDIR=//) 
+CUDACPP_BUILDDIR:=$(shell $(MAKE) $(foreach v,$(CUDACPP_MAKEENV),$(v)="$($(v))") -f $(CUDACPP_MAKEFILE) -pn 2>&1 | awk '/Building/{print $$3}' | sed s/BUILDDIR=//) ifeq ($(CUDACPP_BUILDDIR),) $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) else @@ -89,7 +95,12 @@ SYMMETRY = symmetry.o idenparts.o # Binaries -LDFLAGS+=-Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 +ifeq ($(UNAME),Darwin) +LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) +LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" +else +LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) +endif all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp # also builds $(PROG)_cuda if $(CUDACPP_CULIB) exists (#503) @@ -100,8 +111,8 @@ LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' else ifneq ($(shell $(CXX) --version | egrep '^clang'),) override OMPFLAGS = -fopenmp $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###override OMPFLAGS = -fopenmp # OMP is not supported yet by cudacpp for Apple clang +else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang else override OMPFLAGS = -fopenmp endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc index 0ed26180ca..de327f2321 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc @@ -71,6 +71,8 @@ struct CPUTest : public CUDA_CPU_TestBase , hstSelCol( nevt ) , hstIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? process.initProc( "../../Cards/param_card.dat" ); } @@ -183,6 +185,8 @@ struct CUDATest : public CUDA_CPU_TestBase , devSelCol( nevt ) , devIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? 
process.initProc( "../../Cards/param_card.dat" ); } diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc index 016bc0f472..e5167de00c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc @@ -59,7 +59,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) using namespace mg5amcCpu; #endif #ifndef __APPLE__ // test #701 (except on MacOS where feenableexcept is not defined #730) - const bool enableFPE = !getenv( "CUDACPP_RUNTIME_DISABLEFPE" ); + const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" ); + const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" ); if( enableFPE ) { feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 diff --git a/epochX/cudacpp/gg_tt.mad/bin/generate_events b/epochX/cudacpp/gg_tt.mad/bin/generate_events index 107313b25d..5577cc66a0 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/generate_events +++ b/epochX/cudacpp/gg_tt.mad/bin/generate_events @@ -46,7 +46,7 @@ if __debug__ and (not os.path.exists(pjoin(root_path,'../..', 'bin','create_rele sys.path.append(pjoin(root_path,'bin','internal')) import madevent_interface as ME - +import misc as misc import logging import logging.config @@ -160,17 +160,31 @@ if '__main__' == __name__: # Check that python version is valid set_configuration() - argument = sys.argv + argument = sys.argv + + # check for plugin customization of the launch command + launch_interface = ME.MadEventCmdShell + if os.path.exists(pjoin(root_path, 'bin','internal', 'launch_plugin.py')): + with misc.TMP_variable(sys, 'path', sys.path + [pjoin(root_path, 'bin', 'internal')]): + from importlib import reload + try: + reload('launch_plugin') + except Exception as error: + import launch_plugin + launch_interface = launch_plugin.MEINTERFACE + + + try: if '-h' in argument or '--help' in argument: - launch = ME.MadEventCmdShell(me_dir=root_path, force_run=True) + launch = launch_interface(me_dir=root_path, force_run=True) launch.exec_cmd('help generate_events') sys.exit() elif len(argument) > 1 and argument[1] in ['0', '1', '2']: argument = treat_old_argument(argument) with ME.MadEventCmdShell.RunWebHandling(root_path, ): - launch = ME.MadEventCmdShell(me_dir=root_path, force_run=True) + launch = launch_interface(me_dir=root_path, force_run=True) launch.run_cmd('generate_events %s' % ' '.join(argument[1:])) launch.run_cmd('quit') except ME.MadEventAlreadyRunning as message: diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/__init__.py b/epochX/cudacpp/gg_tt.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py index c1e54d3cb9..bd1517985f 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py @@ -537,7 +537,7 @@ def charge_card(self, tag): self.param_card = param_card_reader.ParamCard(param_card) return self.param_card elif tag == 'mgruncard': - self.run_card = RunCard(self[tag]) + self.run_card = RunCard(self[tag], unknown_warning=False) return self.run_card elif 
tag == 'mg5proccard': proc_card = self[tag].split('\n') @@ -1002,14 +1002,18 @@ def __init__(self, finput=None, **opt): self.allowed_value = {} self.default_setup() + self.plugin_input(finput) # if input is define read that input if isinstance(finput, (file, str, StringIO.StringIO)): self.read(finput, **opt) + + def plugin_input(self, finput=None): + pass def default_setup(self): @@ -2621,7 +2625,28 @@ class RunCard(ConfigFile): default_include_file = 'run_card.inc' default_autodef_file = 'run.inc' donewarning = [] + include_as_parameter = [] + + def plugin_input(self, finput): + if not finput and not MADEVENT: + return + curr_dir = None + if isinstance(finput, file): + # expected path to be like "XXXX/Cards/run_card.dat" + curr_dir = os.path.dirname(os.path.dirname(finput.name)) + elif isinstance(finput, str): + curr_dir = os.path.dirname(os.path.dirname(finput)) + + if curr_dir: + if os.path.exists(pjoin(curr_dir, 'bin', 'internal', 'plugin_run_card')): + # expected format {} passing everything as optional argument + for line in open(pjoin(curr_dir, 'bin', 'internal', 'plugin_run_card')): + if line.startswith('#'): + continue + opts = dict(eval(line)) + self.add_param(**opts) + @classmethod def fill_post_set_from_blocks(cls): """set the post_set function for any parameter defined in a run_block""" @@ -2647,18 +2672,48 @@ def __new__(cls, finput=None, **opt): elif isinstance(finput, cls): target_class = finput.__class__ elif isinstance(finput, str): + path = finput if '\n' not in finput: finput = open(finput).read() if 'req_acc_FO' in finput: target_class = RunCardNLO else: target_class = RunCardLO + if MADEVENT and os.path.exists(pjoin(MEDIR, 'bin','internal', 'launch_plugin.py')): + with misc.TMP_variable(sys, 'path', sys.path + [pjoin(MEDIR, 'bin', 'internal')]): + from importlib import reload + try: + reload('launch_plugin') + except Exception as error: + import launch_plugin + target_class = launch_plugin.RunCard + elif not MADEVENT: + if 'run_card.dat' in path: + launch_plugin_path = path.replace('run_card.dat', '../bin/internal/launch_plugin.py') + elif 'run_card_default.dat' in path: + launch_plugin_path = path.replace('run_card_default.dat', '../bin/internal/launch_plugin.py') + else: + launch_plugin_path = None + if launch_plugin_path and os.path.exists(launch_plugin_path): + misc.sprint('try to use plugin class', path.replace('run_card.dat', '../bin/internal/launch_plugin.py')) + pydir = os.path.dirname(launch_plugin_path) + with misc.TMP_variable(sys, 'path', sys.path + [pydir]): + from importlib import reload + try: + reload('launch_plugin') + except Exception as error: + import launch_plugin + target_class = launch_plugin.RunCard + elif issubclass(finput, RunCard): + target_class = finput else: return None target_class.fill_post_set_from_blocks() - - return super(RunCard, cls).__new__(target_class, finput, **opt) + out = super(RunCard, cls).__new__(target_class, finput, **opt) + if not isinstance(out, RunCard): #should not happen but in presence of missmatch of library loaded. 
+ out.__init__(finput, **opt) + return out else: return super(RunCard, cls).__new__(cls, finput, **opt) @@ -2686,7 +2741,7 @@ def __init__(self, *args, **opts): self.system_default = {} self.display_block = [] # set some block to be displayed - + self.fct_mod = {} # {param: (fct_pointer, *argument, **opts)} self.cut_class = {} self.warned=False @@ -2723,7 +2778,7 @@ def get_lepton_densities(cls): def add_param(self, name, value, fortran_name=None, include=True, hidden=False, legacy=False, cut=False, system=False, sys_default=None, - autodef=False, + autodef=False, fct_mod=None, **opts): """ add a parameter to the card. value is the default value and defines the type (int/float/bool/str) of the input. @@ -2737,6 +2792,7 @@ def add_param(self, name, value, fortran_name=None, include=True, If a path (Source/PDF/pdf.inc) the definition will be added within that file Default is False (does not add the definition) entry added in the run_card will automatically have this on True. + fct_mod: defines a function to run if the parameter is modify in the include file options of **opts: - allowed: list of valid options. '*' means anything else should be allowed. empty list means anything possible as well. @@ -2761,15 +2817,22 @@ def add_param(self, name, value, fortran_name=None, include=True, if autodef: self.definition_path[autodef].append(name) self.user_set.add(name) + # function to trigger if a value is modified in the include file + # main target is action to force correct recompilation (like for compilation flag/...) + if fct_mod: + self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): + self.path = finput finput = open(finput) else: raise Exception("No such file %s" % finput) @@ -2784,7 +2847,7 @@ def read(self, finput, consistency=True): name = name.lower().strip() if name not in self: #looks like an entry added by a user -> add it nicely - self.add_unknown_entry(name, value) + self.add_unknown_entry(name, value, unknown_warning) else: self.set( name, value, user=True) # parameter not set in the run_card can be set to compatiblity value @@ -2796,7 +2859,7 @@ def read(self, finput, consistency=True): logger.warning(str(error)) else: raise - def add_unknown_entry(self, name, value): + def add_unknown_entry(self, name, value, unknow_warning): """function to add an entry to the run_card when the associated parameter does not exists. This is based on the guess_entry_fromname for the various syntax providing input. This then call add_param accordingly. @@ -2835,7 +2898,7 @@ def add_unknown_entry(self, name, value): raise Exception("dictionary need to have at least one entry") default['dict']['__type__'] = default[self.guess_type_from_value(default_value[0])] - if name not in RunCard.donewarning: + if name not in RunCard.donewarning and unknow_warning: logger.warning("Found unexpected entry in run_card: \"%s\" with value \"%s\".\n"+\ " The type was assigned to %s. 
\n"+\ " The definition of that variable will %sbe automatically added to fortran file %s\n"+\ @@ -2873,7 +2936,17 @@ def valid_line(self, line, tmp): return False else: return True - + + + def reset_simd(self, old_value, new_value, name, *args, **opts): + #return + raise Exception('pass in reset simd') + + def make_clean(self,old_value, new_value, name, dir): + raise Exception('pass make clean for ', dir) + + def make_Ptouch(self,old_value, new_value, name, reset): + raise Exception('pass Ptouch for ', reset) def write(self, output_file, template=None, python_template=False, write_hidden=False, template_options=None, **opt): @@ -2898,11 +2971,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self @@ -3048,6 +3122,77 @@ def write(self, output_file, template=None, python_template=False, else: output_file.write(text) + def get_last_value_include(self, output_dir): + """For paraeter in self.fct_mod + parse the associate inc file to get the value of the previous run. + We return a dictionary {name: old_value} + if inc file does not exist we will return the current value (i.e. set has no change) + """ + + #remember that + # default_include_file is a class variable + # self.includepath is on the form include_path : [list of param ] + out = {} + + # setup inc_to_parse to be like self.includepath (include_path : [list of param ]) + # BUT only containing the parameter that need to be tracked for the fct_mod option + inc_to_parse = {} + for inc_file, params in self.includepath.items(): + if not inc_file: + continue + if any(p in params for p in self.fct_mod): + inc_to_parse[inc_file] = [name for name in self.includepath[inc_file] if name in self.fct_mod] + + # now loop over the files and ask the associate function + for inc_file, params in inc_to_parse.items(): + if inc_file is True: + inc_file = self.default_include_file + out.update(self.get_value_from_include(inc_file, params, output_dir)) + + return out + + def get_value_from_include(self, path, list_of_params, output_dir): + """for a given include file return the current value of the requested parameter + return a dictionary {name: value} + if path does not exists return the current value in self for all parameter""" + + #WARNING DOES NOT HANDLE LIST/DICT so far + + # handle case where file is missing + if not os.path.exists(pjoin(output_dir,path)): + misc.sprint("include file not existing", pjoin(output_dir,path)) + out = {name: self[name] for name in list_of_params} + + with open(pjoin(output_dir,path), 'r') as fsock: + text = fsock.read() + + for name in list_of_params: + misc.sprint(name, name in self.fortran_name) + misc.sprint(self.fortran_name[name] if name in self.fortran_name[name] else name) + to_track = [self.fortran_name[name] if name in self.fortran_name else name for name in list_of_params] + pattern = re.compile(r"\(?(%(names)s)\s?=\s?([^)]*)\)?" 
% {'names':'|'.join(to_track)}, re.I) + out = dict(pattern.findall(text)) + misc.sprint(out) + for name in list_of_params: + if name in self.fortran_name: + value = out[self.fortran_name[name]] + del out[self.fortran_name[name]] + out[name] = value + + for name, value in out.items(): + try: + out[name] = self.format_variable(value, type(self[name])) + except Exception: + continue + + if len(out) != len(list_of_params): + misc.sprint(list_of_params) + misc.sprint(to_track) + misc.sprint(self.fortran_name) + misc.sprint(text) + raise Exception + return out + def get_default(self, name, default=None, log_level=None): """return self[name] if exist otherwise default. log control if we @@ -3338,71 +3483,93 @@ def write_include_file(self, output_dir, output_file=None): #ensusre that system only parameter are correctly set self.update_system_parameter_for_include() + value_in_old_include = self.get_last_value_include(output_dir) + + if output_dir: self.write_autodef(output_dir, output_file=None) # check/fix status of customised functions self.edit_dummy_fct_from_file(self["custom_fcts"], os.path.dirname(output_dir)) for incname in self.includepath: - if incname is True: - pathinc = self.default_include_file - elif incname is False: - continue - else: - pathinc = incname + self.write_one_include_file(output_dir, incname, output_file) + + for name,value in value_in_old_include.items(): + if value != self[name]: + self.fct_mod[name][0](value, self[name], name, *self.fct_mod[name][1],**self.fct_mod[name][2]) - if output_file: - fsock = output_file + def write_one_include_file(self, output_dir, incname, output_file=None): + """write one include file at the time""" + + misc.sprint(incname) + if incname is True: + pathinc = self.default_include_file + elif incname is False: + return + else: + pathinc = incname + + if output_file: + fsock = output_file + else: + fsock = file_writers.FortranWriter(pjoin(output_dir,pathinc+'.tmp')) + + + for key in self.includepath[incname]: + #define the fortran name + if key in self.fortran_name: + fortran_name = self.fortran_name[key] else: - fsock = file_writers.FortranWriter(pjoin(output_dir,pathinc+'.tmp')) - for key in self.includepath[incname]: - #define the fortran name - if key in self.fortran_name: - fortran_name = self.fortran_name[key] + fortran_name = key + + if incname in self.include_as_parameter: + fsock.writelines('INTEGER %s\n' % fortran_name) + #get the value with warning if the user didn't set it + value = self.get_default(key) + if hasattr(self, 'mod_inc_%s' % key): + value = getattr(self, 'mod_inc_%s' % key)(value) + # Special treatment for strings containing a list of + # strings. Convert it to a list of strings + if isinstance(value, list): + # in case of a list, add the length of the list as 0th + # element in fortran. 
Only in case of integer or float + # list (not for bool nor string) + targettype = self.list_parameter[key] + if targettype is bool: + pass + elif targettype is int: + line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(len(value))) + fsock.writelines(line) + elif targettype is float: + line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(float(len(value)))) + fsock.writelines(line) + # output the rest of the list in fortran + for i,v in enumerate(value): + line = '%s(%s) = %s \n' % (fortran_name, i+1, self.f77_formatting(v)) + fsock.writelines(line) + elif isinstance(value, dict): + for fortran_name, onevalue in value.items(): + line = '%s = %s \n' % (fortran_name, self.f77_formatting(onevalue)) + fsock.writelines(line) + elif isinstance(incname,str) and 'compile' in incname: + if incname in self.include_as_parameter: + line = 'PARAMETER (%s=%s)' %( fortran_name, value) else: - fortran_name = key - - #get the value with warning if the user didn't set it - value = self.get_default(key) - if hasattr(self, 'mod_inc_%s' % key): - value = getattr(self, 'mod_inc_%s' % key)(value) - # Special treatment for strings containing a list of - # strings. Convert it to a list of strings - if isinstance(value, list): - # in case of a list, add the length of the list as 0th - # element in fortran. Only in case of integer or float - # list (not for bool nor string) - targettype = self.list_parameter[key] - if targettype is bool: - pass - elif targettype is int: - line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(len(value))) - fsock.writelines(line) - elif targettype is float: - line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(float(len(value)))) - fsock.writelines(line) - # output the rest of the list in fortran - for i,v in enumerate(value): - line = '%s(%s) = %s \n' % (fortran_name, i+1, self.f77_formatting(v)) - fsock.writelines(line) - elif isinstance(value, dict): - for fortran_name, onevalue in value.items(): - line = '%s = %s \n' % (fortran_name, self.f77_formatting(onevalue)) - fsock.writelines(line) - elif isinstance(incname,str) and 'compile' in incname: line = '%s = %s \n' % (fortran_name, value) - fsock.write(line) + fsock.write(line) + else: + if incname in self.include_as_parameter: + line = 'PARAMETER (%s=%s)' %( fortran_name, self.f77_formatting(value)) else: line = '%s = %s \n' % (fortran_name, self.f77_formatting(value)) - fsock.writelines(line) - if not output_file: - fsock.close() - path = pjoin(output_dir,pathinc) - if not os.path.exists(path) or not filecmp.cmp(path, path+'.tmp'): - files.mv(path+'.tmp', path) - else: - os.remove(path+'.tmp') - + fsock.writelines(line) + if not output_file: + fsock.close() + path = pjoin(output_dir,pathinc) + if not os.path.exists(path) or not filecmp.cmp(path, path+'.tmp'): + files.mv(path+'.tmp', path) + else: + os.remove(path+'.tmp') def write_autodef(self, output_dir, output_file=None): """ Add the definition of variable to run.inc if the variable is set with autodef. @@ -3741,13 +3908,14 @@ def remove_all_cut(self): %(tmin_for_channel)s = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact %(survey_splitting)s = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. %(survey_nchannel_per_job)s = survey_nchannel_per_job ! 
control how many Channel are integrated inside a single job on cluster/multicore - %(refine_evt_by_job)s = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) + %(refine_evt_by_job)s = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) #********************************************************************* -# Compilation flag. No automatic re-compilation (need manual "make clean" in Source) +# Compilation flag. #********************************************************************* %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' @@ -3903,9 +4071,12 @@ class RunCardLO(RunCard): "get_dummy_x1_x2": pjoin("SubProcesses","dummy_fct.f"), "dummy_boostframe": pjoin("SubProcesses","dummy_fct.f"), "user_dynamical_scale": pjoin("SubProcesses","dummy_fct.f"), + "bias_wgt_custom": pjoin("SubProcesses","dummy_fct.f"), "user_": pjoin("SubProcesses","dummy_fct.f") # all function starting by user will be added to that file } + include_as_parameter = ['vector.inc'] + if MG5DIR: default_run_card = pjoin(MG5DIR, "internal", "default_run_card_lo.dat") @@ -4139,10 +4310,15 @@ def default_setup(self): self.add_param('hel_splitamp', True, hidden=True, include=False, comment='decide if amplitude aloha call can be splitted in two or not when doing helicity per helicity optimization.') self.add_param('hel_zeroamp', True, hidden=True, include=False, comment='decide if zero amplitude can be removed from the computation when doing helicity per helicity optimization.') self.add_param('SDE_strategy', 1, allowed=[1,2], fortran_name="sde_strat", comment="decide how Multi-channel should behaves \"1\" means full single diagram enhanced (hep-ph/0208156), \"2\" use the product of the denominator") - self.add_param('global_flag', '-O', include=False, hidden=True, comment='global fortran compilation flag, suggestion -fbound-check') - self.add_param('aloha_flag', '', include=False, hidden=True, comment='global fortran compilation flag, suggestion: -ffast-math') - self.add_param('matrix_flag', '', include=False, hidden=True, comment='fortran compilation flag for the matrix-element files, suggestion -O3') - + self.add_param('global_flag', '-O', include=False, hidden=True, comment='global fortran compilation flag, suggestion -fbound-check', + fct_mod=(self.make_clean, ('Source'),{})) + self.add_param('aloha_flag', '', include=False, hidden=True, comment='global fortran compilation flag, suggestion: -ffast-math', + fct_mod=(self.make_clean, ('Source/DHELAS'),{})) + self.add_param('matrix_flag', '', include=False, hidden=True, comment='fortran compilation flag for the matrix-element files, suggestion -O3', + fct_mod=(self.make_Ptouch, ('matrix'),{})) + self.add_param('vector_size', 1, include='vector.inc', hidden=True, comment='lockstep size for parralelism run', + fortran_name='VECSIZE_MEMMAX', fct_mod=(self.reset_simd,(),{})) + # parameter allowing to define simple cut via the pdg # Special syntax are related to those. 
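
# A minimal, self-contained sketch (hypothetical names, not MadGraph code) of the
# fct_mod round trip wired up above: the values of the previous run are recovered
# from the generated include file with the same "(NAME = VALUE)" pattern used by
# get_value_from_include, and the registered callback fires on any difference,
# e.g. to force a "make clean" after a compilation-flag change. Note that in the
# hunk above, fct_mod=(self.make_clean, ('Source'),{}) passes a plain string rather
# than a one-element tuple, so the *args splat in write_include_file would expand
# it character by character; ('Source',) would hand the directory over as one argument.
import re

def get_values_from_include(text, names):
    pattern = re.compile(r"\(?(%s)\s?=\s?([^)]*)\)?" % '|'.join(names), re.I)
    return dict(pattern.findall(text))

fct_mod = {'GLOBAL_FLAG': (lambda old, new, name: print('make clean:', old, '->', new),
                           (), {})}
card = {'GLOBAL_FLAG': '-O3'}  # value requested for the next run
old = get_values_from_include('      PARAMETER (GLOBAL_FLAG = -O)\n', fct_mod)
for name, (fct, args, kwargs) in fct_mod.items():
    if name in old and old[name].strip() != card[name]:
        fct(old[name].strip(), card[name], name, *args, **kwargs)  # -> make clean: -O -> -O3
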
(can not be edit directly) self.add_param('pt_min_pdg',{'__type__':0.}, include=False, cut=True) @@ -4164,8 +4340,7 @@ def default_setup(self): self.add_param('mxxmin4pdg',[-1.], system=True) self.add_param('mxxpart_antipart', [False], system=True) - # CUDACPP parameters - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + def check_validity(self): """ """ @@ -4704,6 +4879,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5769,7 +5947,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py index fe874a06a4..71089d7480 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py @@ -85,7 +85,7 @@ def load_str(self, text): self.value= ' '.join(data[len(self.lhacode):]) # check that lhacode are the first entry otherwise return invalid param. if ' '.join([str(i) for i in self.lhacode]) != ' '.join(data[:len(self.lhacode)]): - raise InvalidParam + raise InvalidParam("line was %s" % str(data)) else: self.value = data[-1] diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py index 5d0187e3fa..87cb4b88df 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py @@ -749,13 +749,15 @@ def writeRunWeb(me_dir): class RunWebHandling(object): - def __init__(self, me_dir, crashifpresent=True, warnifpresent=True): + def __init__(self, me_dir, crashifpresent=True, warnifpresent=True, force_run=False): """raise error if RunWeb already exists me_dir is the directory where the write RunWeb""" self.remove_run_web = True self.me_dir = me_dir - + if force_run: + self.remove_run_web = False + return if crashifpresent or warnifpresent: if os.path.exists(pjoin(me_dir, 'RunWeb')): pid = open(pjoin(me_dir, 'RunWeb')).read() @@ -4904,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ -6574,7 +6577,7 @@ def reask(self, *args, **opt): fail_due_to_format = 0 #parameter to avoid infinite loop def postcmd(self, stop, line): - if line not in [None, '0', 'done', '']: + if line not in [None, '0', 'done', '',0]: ending_question = cmd.OneLinePathCompletion.postcmd(self,stop,line) else: ending_question = True @@ -6583,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6636,6 +6641,8 @@ 
def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ -6715,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6779,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6907,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' @@ -7533,7 +7546,8 @@ def open_file(self, answer): else: raise if time.time() - start < .5: - self.mother_interface.ask("Are you really that fast? If you are using an editor that returns directly. Please confirm that you have finised to edit the file", 'y') + self.mother_interface.ask("Are you really that fast? If you are using an editor that returns directly. Please confirm that you have finised to edit the file", 'y', + timeout=False) self.reload_card(path) def reload_card(self, path): diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py index a6a8609dce..2f37070580 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py @@ -1108,9 +1108,12 @@ def ask(self, question, default, choices=[], path_msg=None, if alias: choices += list(alias.keys()) + + question_instance = obj(question, allow_arg=choices, default=default, mother_interface=self, **opt) - + if fct_timeout is None: + fct_timeout = lambda x: question_instance.postcmd(x, default) if x and default else False if first_cmd: if isinstance(first_cmd, str): question_instance.onecmd(first_cmd) @@ -2271,6 +2274,9 @@ def postcmd(self, stop, line): if n: self.default(line) return self.postcmd(stop, line) + elif self.value is None and line: + self.default(line) + return self.postcmd(stop, line) if not self.casesensitive: for ans in self.allow_arg: if ans.lower() == self.value.lower(): diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py index 3b8ec31215..5fd170d18d 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py @@ -154,9 +154,18 @@ def get_helicity(self, to_submit=True, clean=True): p = misc.Popen(['./gensym'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, cwd=Pdir) #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts + (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - nb_channel = max([math.floor(float(d)) for d in stdout.split()]) + if stdout: + nb_channel = max([math.floor(float(d)) for d in stdout.split()]) + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, 
matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): @@ -178,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. + # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py index cff8789e38..a6b8582e1a 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py @@ -342,7 +342,12 @@ def next_event(self): text.append(line) if '' in line: - if self.parsing: + if self.parsing == "wgt_only": + out = Event(text, parse_momenta=False) + #if len(out) == 0 and not self.allow_empty_event: + # raise Exception + return out + elif self.parsing: out = Event(text) if len(out) == 0 and not self.allow_empty_event: raise Exception @@ -448,6 +453,8 @@ def unweight(self, outputpath, get_wgt=None, max_wgt=0, trunc_error=0, event_target reweight for that many event with maximal trunc_error. (stop to write event when target is reached) """ + self.parsing = 'wgt_only' + if not get_wgt: def weight(event): return event.wgt @@ -914,6 +921,8 @@ class MultiEventFile(EventFile): The number of events in each file need to be provide in advance (if not provide the file is first read to find that number""" + parsing = True # check if/when we need to parse the event. + def __new__(cls, start_list=[],parse=True): return object.__new__(MultiEventFile) @@ -986,6 +995,7 @@ def next(self): nb_event = random.randint(1, remaining_event) sum_nb=0 for i, obj in enumerate(self.files): + obj.parsing = "wgt_only" sum_nb += self.initial_nb_events[i] - self.curr_nb_events[i] if nb_event <= sum_nb: self.curr_nb_events[i] += 1 @@ -1065,6 +1075,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): # check special case without PDF for one (or both) beam if init_information["idbmup1"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup1"]= event[0].pdg if init_information["idbmup2"] == 0: init_information["idbmup2"]= event[1].pdg @@ -1115,6 +1127,7 @@ def initialize_unweighting(self, getwgt, trunc_error): total_event = 0 sum_cross = collections.defaultdict(int) for i,f in enumerate(self.files): + f.parsing = 'wgt_only' nb_event = 0 # We need to loop over the event file to get some information about the # new cross-section/ wgt of event. 
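
# A standalone sketch (hypothetical names, not MadGraph code) of the drawing
# strategy used by MultiEventFile.next above: draw an integer uniformly over all
# events still unread, then walk the cumulative per-file remaining counts, so
# that each input file is sampled in proportion to what it has left.
import random

def pick_file(remaining):
    # remaining: number of unread events per file, e.g. [3, 1, 2]
    k = random.randint(1, sum(remaining))
    cumulative = 0
    for i, n in enumerate(remaining):
        cumulative += n
        if k <= cumulative:
            remaining[i] -= 1  # one fewer event left in the chosen file
            return i

counts = [3, 1, 2]
print(pick_file(counts), counts)  # e.g. "0 [2, 1, 2]"
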
@@ -1302,7 +1315,7 @@ class Event(list): warning_order = True # raise a warning if the order of the particle are not in accordance of child/mother - def __init__(self, text=None): + def __init__(self, text=None, parse_momenta=True): """The initialization of an empty Event (or one associate to a text file)""" list.__init__(self) @@ -1322,15 +1335,15 @@ def __init__(self, text=None): self.matched_scale_data = None self.syscalc_data = {} if text: - self.parse(text) + self.parse(text, parse_momenta=parse_momenta) - - def parse(self, text): + event_flag_pattern = re.compile(r"""(\w*)=(?:(?:['"])([^'"]*)(?=['"])|(\S*))""") + def parse(self, text, parse_momenta=True): """Take the input file and create the structured information""" #text = re.sub(r'', '', text) # remove pointless tag status = 'first' - + tags = [] if not isinstance(text, list): text = text.split('\n') @@ -1354,24 +1367,28 @@ def parse(self, text): if '' in line: status = 'tag' else: - self.assign_scale_line(line) + self.assign_scale_line(line, convert=parse_momenta) status = 'part' continue if '<' in line: status = 'tag' if 'part' == status: - part = Particle(line, event=self) - if part.E != 0 or part.status==-1: - self.append(part) - elif self.nexternal: - self.nexternal-=1 + if parse_momenta: + part = Particle(line, event=self) + if part.E != 0 or part.status==-1: + self.append(part) + elif self.nexternal: + self.nexternal-=1 + else: + tags.append(line) else: - if '' in line: + if line.endswith(''): line = line.replace('','',1) - self.tag += '%s\n' % line - - self.assign_mother() + tags.append(line) + self.tag += "\n".join(tags) + if parse_momenta: + self.assign_mother() def assign_mother(self): @@ -1905,19 +1922,27 @@ def check(self): #3. check mass - def assign_scale_line(self, line): + def assign_scale_line(self, line, convert=True): """read the line corresponding to global event line format of the line is: Nexternal IEVENT WEIGHT SCALE AEW AS """ inputs = line.split() assert len(inputs) == 6 - self.nexternal=int(inputs[0]) - self.ievent=int(inputs[1]) - self.wgt=float(inputs[2]) - self.scale=float(inputs[3]) - self.aqed=float(inputs[4]) - self.aqcd=float(inputs[5]) + if convert: + self.nexternal=int(inputs[0]) + self.ievent=int(inputs[1]) + self.wgt=float(inputs[2]) + self.scale=float(inputs[3]) + self.aqed=float(inputs[4]) + self.aqcd=float(inputs[5]) + else: + self.nexternal=inputs[0] + self.ievent=inputs[1] + self.wgt=float(inputs[2]) + self.scale=inputs[3] + self.aqed=inputs[4] + self.aqcd=inputs[5] def get_tag_and_order(self): """Return the unique tag identifying the SubProcesses for the generation. 
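
# A simplified illustration (hypothetical names, not MadGraph code) of the two
# parsing modes introduced above: with parse_momenta=False only the global event
# line is split and only the weight is converted to float, while the particle
# lines are kept as raw strings, which is all the unweighting loop needs and
# avoids converting every momentum of every event.
def parse_event(lines, parse_momenta=True):
    header = lines[0].split()
    assert len(header) == 6    # Nexternal IEVENT WEIGHT SCALE AQED AQCD
    wgt = float(header[2])     # always numeric: unweighting compares weights
    if parse_momenta:
        nexternal, ievent = int(header[0]), int(header[1])
        particles = [tuple(float(x) for x in l.split()) for l in lines[1:]]
    else:
        nexternal, ievent = header[0], header[1]  # left as strings
        particles = list(lines[1:])               # raw pass-through for rewriting
    return nexternal, ievent, wgt, particles

print(parse_event(['4 1 +0.5 100.0 0.0078 0.118'], parse_momenta=False)[2])  # 0.5
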
@@ -2269,7 +2294,11 @@ def __str__(self, event_id=''): else: event_flag = '' - scale_str = "%2d %6d %+13.7e %14.8e %14.8e %14.8e" % \ + try: + scale_str = "%2d %6d %+13.7e %14.8e %14.8e %14.8e" % \ + (self.nexternal,self.ievent,self.wgt,self.scale,self.aqed,self.aqcd) + except: + scale_str = "%s %s %+13.7e %s %s %s" % \ (self.nexternal,self.ievent,self.wgt,self.scale,self.aqed,self.aqcd) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py index b70b548e53..cb6bf4ca57 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,8 +3703,9 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) + start = time.time() # Check argument's validity self.check_combine_events(args) self.update_status('Combining Events', level='parton') @@ -3795,8 +3796,9 @@ def do_combine_events(self, line): if self.run_card['bias_module'].lower() not in ['dummy', 'none'] and nb_event: self.correct_bias() - - + elif self.run_card['custom_fcts']: + self.correct_bias() + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7364,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7407,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ -7415,7 +7423,8 @@ def error(self, msg=''): level = int(options.logging) else: level = eval('logging.' 
+ options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7429,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/misc.py b/epochX/cudacpp/gg_tt.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/shower_card.py b/epochX/cudacpp/gg_tt.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model.pkl b/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model.pkl deleted file mode 100644 index f71ba45bbc6d4acc8d32bb06662fe900a694009f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 42822 zcmb__2bdJa_B|}1h$1K|m=FuqY~s1X)3xl^}}iusdw;xPiViOVY+T0_L1^ z&N=6tbB@o5@%hX-{m-ec4%1z`!0-F}-gl|Gb?cmas;fiwboU-sRNIuQtC!RQYglcI zq=(H*Wz#h+^D^n$T(;ZPmb!FfH@id0;daO3*_}Id=+K;MO4~)Vi%`rq*_~#uhr)uC zj6KvgxMX z>#GE_3mTg34XQfL?t+-wmc|ybyH|Cby$)+uO6t@0hLccM|EUg5schEnp^9@RB=@p= 
zs-BA)>T;I7k=0XiEvT!nsl`~>8&_?nA{!f;({?XaT$Eb4#NK3fHy|#MmIbN$R1Tb* zR&|)YE?f=S8V--W*^1fSQO%|s=hrZE@`UNZWm8GbMZT+3Vw<+LqSlTw3t| zCVLCV-g5Q^aPFC2kZWn4*4i{LE%sKx=VC^+sr^y^ZA#%2KY;|CsjjhR>KuDpYa_opYmVLD+PJE>)yvw%+SJUdzutJXEo~ngudjv>2A^nBvWp*iOiedxl z(`*15&dO{~QVsQ1&Mx!E&sUQGQ^#QZ%6+CLyTW5?T4s-IpJ_u3l>vNIA$X+^p0h`L z;JIb?j_rf@G=uL{2tLN!4z2diz~+W;U1sm1p*vwi?7C`(`p@d&k5G{rde_3?AL~Ka z+T%2I?J|2eBeXwlr;fMB>&@NS%(8nS%LI=lXHV2D7@J9}VcEx?9G_6#F*urYSCXX=g7#mq9RkY%>VV%htGg-12Zo&%DOvE3~onQZSD-WHSX{jn`J z$48ogU2low&$;gXsTVWiMpGKtU{s&k0_TKYG*W2E;ZOb$V|jDlsXOU zGtO?rc=8Cw`mB9Yt#%Vew1=6t*{4mbQ&R)aR;k)tLrZhkZfWnNzDE%0UI#3|naghX zIv~`gn=J^>7OQR6P-o{b%AGA_zCk*s+%s>nn!z{Q36M+WDTGZWDE>n|-jFo2B+4dD}_a%UtEznuexSJ+4G# ziGAowHG

YZkOD!b$otoTPcyZ)|B{E-arIt4^r{(R_llB?bp(`y&w_KqX#GR=)mA11|7Sx<=`9emH zw$I60$o^dG*s6|Miz!;iTV4+5F^5h%hx3cALHwV};R2Pzh3#(X`Q7Y`)K+V-FP>{( zQj9_DSXGoh+P*YtU*_1C&r3P>6=4Zi=IyJJ_SJC-*C+`yXU;6OuT>JR3nY||u&-AV zZg4li_}%Rr4N`8Rl+Iep&02~Ub4#&Ob8E46k%dfeE1vQ%{$sjG_V%QGhhyJ4&uzP_ zxT>hClWIrX-AVf%$G&%7+Oh8o3%NgUKajK^ED)k79?}#K`xG3HN0RoVj{TUXc)S?H z7{hrg*`p=md%vllDuF{j#Qbg%m}eidU2NYmWUo zDc%TEyqUM(O4@H1P^3Ad-f?$VY!7UZchye8a>9PkFPGczr#aXDL#B>L+aGwlPHCC_ zp|#xl5W`q>wEdCwVJoNK$JUM5X`<8b6K=CEdYgS(Y;{$a+N*G!wQ$XAnJ;!NT;W=g z%i5JzZ~sE@v*0W=KuzV()t33fvA+z9|0-{PowWa3AU;L$-&pPx{2!(u7iBmcl-u8` z8Tg%XC;FZ$*3l|{D7IEv19-4h+&?PrpB($=F!wKc``4uXTb%oMJ-$3n{h@Y*KZ6}1 zQEvaG()b%`#0T>qrm?P0<6oTylaLNL>oX7Oh=LC44afLu(7mc9eDK~ zT3ssZqQ=~$D}((*&8RY24-RH1-PmF{;K=%DV4hvwJY@q_?}{ytQB%owXICb>AqtN4 zPyzl?PnhE zt2#GtUzO?>?8sENM8T1*RCAE()~M&DFKkk_fsLoyk6ty~WLp;fv8+f)e>hZn+p)zo zmIKhh^t$QvwpaC4YW}%nxdXd0!GS0^Qmg`k1WQoQ%OKdK42F#-I0W8|KY~LUTG=wA2z#P}tIhLxr+S+;$%h;7UCQxvsTm=L5qs$HYjffkWkZAX`kMm_h?{ z?4fgqdv#dF2N zzdXBkNFOZ?C?_QY>&QW{xU`Z+wg(r8(u8_mnqiaD0vjK>1@Nll_{51bOT|V_?J2@w z|HxP>S=7}C=Gb7`%B^VN2=>$?xKPzOf{RqC$iiybTJvRccmx-d)E~jU^aw7Ymm|0o zg^V{u2a`Kj)091(UI_G{E<0|G0QW0Kr(7P z3NN80j}Q*UMtUfYR&@@=a#bp_u)4R*%X-VOmJbib3ex*SQLTq!CA}PqV^Hvi;#jij zp*RlZq#O_H$O*6{U2xWf(>Up^kHYpdu#x$N=xNZIMf`!j4g%*gtGSXDs<{dUU(MAF^wnI0 zT2iisb>uo&s=1yKAvHIko|hY8lX4SmT+Pk!25N3$%<_!ZkWmsM)ACk?F#An(_P4Q~ z;gqzhb@sQj!tC!r!O#9q2Kw3Gg<4YXhIQl~SZ04OBSP8Vhk9P_hfT@@u<`64gg40k zA;z?y{lf@h_M7VLA7S0gUR_zsqeNl$kD=ga|2PBnJbD7dMxjzrDK4w;z%L}NfW9Ewt_9s+jLSBMHO{kaIVmfHPf(DA(OpAJzb#M1@ zuP9$*Uy6Di1z*$~4Ai3DL^&yM!8-CbEJeM;_K>J|QP0bJut|9zHZJNv?BSgW%H#u7 zy;?#(M2%}Z`G`@0$chp2F+5y0A1$A-3x5T|@~EOxK1G!aJz0fAAM2yFzf+S+Qz-Iy`U)xAAT-O>>wq!&DD&)tMgragC4G;nOT z&||Zis&j05t5T7LpHnRJ?ynL8AQ_9*bJtZV>1K=e{6=5O^?kml#?FADaqzgRvROn9$galu_`gv8iN}X>3NLfn&3k9-AFm&v1XjC4{>~ z--#898H0i^W@iR!F}t9glwDyR84F7>vnL9^n5hiZVy2;-l)YdbnGQ=ad$TUE#cZp^9K*Uh%{(#3vMV{qr{xW zCX<-6(Lgc%wU~2Mok#j}RjJ6rue_G|=UvP2n{N0>e;!HwpLJ{WO80zvIW`xd;K+rF zCb(5}5$bul7&a-Fz{W*i3U9FRxQsFWeU5}&j=GY31shC~uS5eSZ>J?+rRtP?wJH@^ z_`%sS|Iln1eq9brzJ{c}&*tq1I;SD6;!kFb5 ztu&*AMy_RVMF=$x&>C-3b!xm_m5MC<)@_-8rN+BZ zaO7@96KK2#^}O5*o0R)t;~MXWH_-S1W7^mFAVR2dd#&*y)-#;8R*m~hjXcZ>lX?UN zM;>KFXa{=?^}IX|o0KPD<4HXUZ^mCAKE;?|eK=a4hC@xZXV_wzY|o;BN$#MNe2(=D zCz@5KlYE{PCiwyij=ac-P?9g9o|l(llky5|Jjqw#4U&9~F@BPx6Y@G7D#)z&emuIWkmq~txf+L@+9>E;^0`aCp1<-hQ$mGd`j46i2rrSfR`AKI`KGp;MYRsC{RB|7o%SofxYoA~!|sljceW#-azwKj9H%1N;67m{}7|K`$!(tT-%j?EcwucVJWvF8n42xATY&`c0 zc!S(WGA5Xlr7{W*H76_CVw#hq(LgQ3wU!-OcW?Z8T6SVzY8iuquVrTjYAw5t9%`e_^TRWVVO2 zoPs*mzpz;U!p60n4sW3448{anM#-6QC@p8P#iZqIG*C;0)^ZN(?i};9oXft{avlo4 zmh%~?wOoL5QZ9sb_fm+KAC@1AcSVwMxrIwr79@26P>Up^pHYvBk z#Qp>|^4{3P>^}IX^o0P|3<60hvH&~ZG!5IH`Lzz5@x{~q~8%$E3Mgyf( zYAMgK?%w+vtED{4zLfGD3ci%*8K|YafO1k^gmvU4SW0=B?I9_zpq`gkVUzM2Y+TCg z@CH)eV9fH2R*+EwB9~=vB80h**15mMx_hf@g3kSIqA>S&Q1Ekqmw`I>_fW=fmavZe z2bQ^i!1hq?AEKU@k6@GXF>E~dPv8x5|CBN9=e`Od%za0l`)90sQ%F4zEuRyGxqpFz zpZk{#^nWb>3bmws4eQ8%VVV6mj0k1_Kh*Q`Eo@T0gNJ|*=jCtMr2GRL&;DO{ zgX}xt0ov7M-w`3qevHn(h;?t?xY>6i3bXHwf}edC2Kw2rgIZG7g~fBJu*`lvMuf8O zhB_8tuvmb>##uk&7;b@?iUA2}Gtb2R3dyAr!eW|4k1z$^ofm%yB%1Nn!bz~$g zwTxnWNJ}N^c^M6xlpSH?T6SU&b%yh96jjI=_|%P}o!J;@t0ZQX z87n4OM`plM%uKe2#LPlHFWeYO*%vl0W)6Eq#Oz1k8jIOqMdzw^(_-c-M0LY?xW4c` z04^n_1_i91RFB}ULJIY~%!5rzEo?l?I(UOCr!-^QzpF4GA>1j(>!j*gckeEi>!d8A zaJn?0;7CUG2$DJo^}IB~CZ!2Bo>ViuK~gP@X+Nn22w_sY>!fVf-5bi4Iw>Ivlggst zNKW+#l4?agFAHIlvIsVw)M9vpq?RzI{iK#6gh@@%Ngd33rVGz_!}VM|geXkPK><&k zsvbd7hoYXB!(fwgIBYzrBj62^I+8K|F>Yj;90i9u#^u>!I>sH11|~UCC%K&UOfM!` zu9IBBzD#l@3XUA3dIU)xi+Wy;gT>RLu<;~MfHz3;M8+)7=+rVQv&de25<-~NB%RdB 
ztY@}iQk6QXQ;5Q(PDR0y(^QWjsnb!<%NejqITJRX)LHNbNuAA@Hj_%b-wsF#KCOV) zD6?`7!W}smwzy=joX3A74`jzb(l9e64XJsJ>27j9;*t3U3gkj|#=A;A)qy>|c!9`8 zaLtC;4kLz*kc&~#Zv;v@)hL&+QN2Q&UXX6ClS@%AQNIiUv`SxgIs%chrO%2#;YRcqFv}P13KVsmw4;oY^SE0%3lObtFF=Zn7SEI#| zYha1bPb{$i_^PR;i%aENc2mH8&`{cMXg?p^BOhKiOs+#1B=S|)+LopTjp@a5J$o7n zZ$OJ9H^RmU%j70@Q-u8Lg-<3o!w(s4qu!!WZ`G*YFz9R=NVn}I;+`c;K zI~4St8rrl45^`4ozMuD?K?%9r#cz8H+@tXC)%XQlAR+hpz~l6QjGI+G$S2TSAu*_U zXfd7XTuL{4(o6acRvS&MmRZ#$a=)8R+XMN4%I3j%Hf5L(5Bb?J&7po8^6=_lA5pN6 znqbT1F(0mxhB;?#$?+)sp%2>wE&Xsl$yvkLSX@@pPd-{5_{R$T6AkVU$Yg&&KK1ea z0h!DJS+zRk&lK|K8reI5)wVQe>TBF1eL}wQ!TYM@-NQIzTJg1=w3;~Tq>zA?FI~pA z4KOm07{6_kF#*Q!){GIMVEle{j6W#GAKPT)@$#oN zV?-zze_kEqFN*QkHW_*R`EAV@5emlNSI78=V*ImB#)s9V z58lZp04^EvPlwcmSyM};CyMIwa?;oXPZ~X}@2_JNeWQh|=9b!nWFxdIPxaJ2rcIx* zPql1J4=)=L!;gPSFF5APCa_f(hI`O*h2 zH-%lkSXfkXZq2zFvSvt>alto7v8pJQi9l`vwt!ov(1zhsb8NG|o>qXLIR>X6Z*hH_dH{ z+cAdQU*T@&;;Nyea-7m-0Q;hFU_@_y`PAsDmS@>T*Yh1gNjS^8M9+e%=&A9ZJ;BG5y07>Tr$feOzZr zQ>}~$Fn`+n>wMq4lI|FDcI-Q&YTT@GQpyl_c*{^Uz$f6w>*uhX1p)j#sbF6)Il5qj%y&J5!Y}t!n#V~gxX7?E8c!jyU#`LDvlBQg$ zRVDIl1TK>;TyX?J< zJ7Lh)VfP8+$L%ZgP}DP~7DdC1se{{*G%QD_Si^Nl9sJH_>*w7`90Hk7kUp^?u2&$I z2H8Uk`>p@>_OkOcF^~p=#Ai%KfgGek_SYaw2OKc?*$;cfKpF`W-`GtGq*;SBxga|q z`RnOl9QfrmF_0F51VfxwXKD8WU;&!)V#CfAA)I#;D~k$l+0qp{tfG^&!slMg@nBmz z+yz<)Z}_fNhL|d^(4DbG@Hw&=mRn(472j-`gw4_2uk;-;>?UluGRIVukOsM}zh zLQgbfMBt)t8Da{?$fY}!U3s1gjnrZA;mIFZ4!?J~lupYL3|2e?z*81d6byh`!Fpp+ zB1fW&f%YgOMQHQzIdU{CY5f)LEPqA2oShWCcV?J6i!0g{@EbT*GCaa@41A6p3mcwU z*+pe?9HSNZwwzh|uF6!OeR$#AkmC_=AU%OF5z-Uk!&6AG;qk0pBquXiarQ2phaNo= zE0I$WWnem$og+-A!RN^7uw>$g?t+Ozk!P@vA~a1Dvh|ZQ5o4e`i(MmhXT#^nIj~`2 z*+p_LgUOR2qXF$a21L-#htH7v5#f6gK0LMtOFr+5sY|$Mc$wjfahqVqpux=f3Zf01 zuM#A}`5JtVybeoFZ)b06oh)xKT2XS>h+I8j-%->ByT+U7VW54BKoQ!v;lsB(U`gwp z2KOjkoG6ud8Lx zylwjt!VGL5vv-8;6ZjnY6gJA1kX4LtpN+R{KSP*-?Q`~yuzdlaBVWQ6u;JcnnS90g zLN<4=4@Q!=YrjUAf$hKS9bx+hK74fqHp-TeZy8_6=I#>#8*kNqhcE-%_v{^E`vE?@ z2@{@ZsZ^uu-;zbYgrVTU=X0IwQ=$)`h(z z+SY;3k#%7U*l=gMRJt-=v3bA9;P(~GLUlM!c)@=?FO1n*eyb{DSUVY5;j7ELt<~n zM`^G#ximTHgLVVW=Ij=s*#bV?@P&=il*v|%kJIQu%gNSgH_-HDw+PKP@Hx^CHcFF_ zZ5bb@(Sw$g{%AMQY{za9ngQ@RvOR18P5I)|gzUihC=Jdq?x4++foM0-6ti1|rUX7m z2Ej&X$`Ue|@lhI_GCZ0gXgAOdWw!{;F!&r94jZ9KBxD5Rqck{Icr>MGH_()^TZAS7 zAMWdp;e}mSi+))gVa8$zQ$Y|IwM|P3Vu_HSb zaAX&eV<-3x9Ag+B;n*2IM|Ob?bJPYLyRuUON3G%*3%`M59K$t7+WReJH#qTP9qe4$ z9oE0OGXWJxCenpb^d7x&Z;aqKlu2lCxqIR&CVVd=RgKl@WHc9~Kn9Q8yMMF?e2z?k zB?X@_m?Tw*9xa~lc1B4W6j?9Dg5E`1K z7F9C>zw$421qf+{Fkd6^k-BWUR_YB9+(Ao+4XV0 zV7Ru%Pm$STU;(GG0H;aeG;17YQ=6@o76XjVCfm@=Y!(!t*a}57ipqvCSp$s9h9GV> zIfbJh8E<8Geb`xu@?2S@Jo>4@#i%&4gf6Q{eYw=#=6v)?KPl!eL;c6Zh9`!}Qp95l z2P+98dz07O7Y|m@hiGWN4CF#@0cd?E1P4j3Rtm74D7L#wS!Q53R52W;8Ti!_mtnIu zlb6HI=tmgPk5uSKx#-rnE_5$IGv!F;k~iZXZNOcwa96mvna!}zxzJns(B9FbU~pum znc^4&#j%RwIF~~G#D$O7jg;dVst!iHi{c&$gOgYx+X;v**u?H;IFS&MW8g{f;T=m@ z?!D?bH@;?+oWfAWMrG<5z<|uZRaCpEL{3Gdf$KB^M7U0e&yh1=LtOYA5knPM3Aoh# zp@6GE81BmJn^a^z3;hhtXA>^Md=7kg5*RkbJX+3UsAA^4cBh87{R*hgN1%b~0``wk zT?iju_=F`D)-3#NCa%veW-PffT=5&gE-`>zs=zLbfR)STJ{TAD2Cyp(U{@-zt0G_( zarAj=?9uWCV`JD@ihhX(1NK*pk6?cdA70dj z4Phrn$Ty6QVPpOIKQtJyzh!&``#bm?`5rcmT`E5?R$=pJ!lCXN+q+s)r#1d0^CO}R zL_e`}gy?7Z9Qg${OjIVnGBzM8FF0lTM86@*K=eC1M~MD_4=)76g2-AC!X9bH{>zO0 zw*mVfh5fI_{>#NKUE}k9u%s@L4!D1BP}C9bsG=hH@c1Vz6lI3_mu^i<{(|oDXeEmp_@u!eK!T~awIWIHejrp zG`uR-7kK_PPZpO~-4S6J-VNC?GGTha=SWZ3@XV}~jToy)`m2F9Tzm(YbmZI^9Soek zh!f%51U^SLg$;9#mdzNeIJfmiIdZ*UNZA|l2FgB!iBN71pCenqau)UQ7e#~GCUQ5N z+PLn$5y+=yY~a*+t)Pg1I^^c2J#X`K1h?-8_B(! 
z_-jT!*i1geKt5EF57Xp}+8|$>VIOWLA7LObRpe!we0dw>YcuQ#GkLjzyh4$W)Z}Lx z$-SQe)@;~EnaL{+$%tjKrKGGv8zb-yVu@ zip%Gn11rj;im{4$2PIWq2eifQ9#TqVPjoVzFQyVJa=w@bAMP*1@_f-Jsy)zJL&^40)5nZ$3bA*pigigH`6|yg56&cTn z-W^EqhD!7Vq3`67dk#7pxc4JognNJZ9GMFn<{l{rFjjH5b1R2jH3&3t9mxI>t`vN@ zvklAbx0Ap9hH?9O7xdn)(S}UDDQnHFbq3b7Vx6y9`KL)-UBT`cIrh0Hcs!6+JkouF zi#NyW&4iYLut5=KG$FqR>k|6AW1D=Tn{fx3`5Fy;O^UDC<@4t1$Pv=QShWdv^rw%% zHx^v9c@OaF#IRZYD_3e?03gFAwuv9v!~&lqS=i8Y8d)ki#;RQQiRaSxOvZ!9dNEND z6KtW-h_!;kkmW)$N3vW5A0C2+Uj0#3*ZBkoePHm|nwS!e^ zhq$Tv8#5seW7Poe8%wg?T_xDI#xQ>h>Jcdy`l=P8NJPbZOAPpPd*5z^pV^yk` z`P;g!m6$h*Yr1gr?pKEvkRC)E6k4DsKG=oPR$v>Q2 z46Pi)Hq!923|LiSZA1rI{(ly_+$?m3LFh^)^cYvDKNCmFv5Zxlcl)+BFP`fQ6z~Xg z9Qqi>_jp1@#`gsH@F(|R!?S9XoWxi~JHXokB7+xWJsDjLtfvqu!g?xv_?`-En6*+) zXRKo7lat;#%6-q&y)L)+hi#J97sKbsC9t6}9fiLi$XG=m zoR=b-vf#F!yIl>rTm~RRE|(KOlFJqFIdUZ|a>=ac@3o`bm`jEP{Oc?KP$E~E6S&%t zz%?p?YuyC=ZIO`c7^?j7`ud%GhJbB2j|AD9fu-p>Rg$vyy|BM-uclN}`wF;*ozXHDkC|2@}- zL1f7F5%NcJeH1=N9)k_%S}Bh+b}a{1U)ZhXz&-&cL$XgYfk?7X!G}Lu2^&s!v^>Mu zwHw&~TdvQ7$dK!ET0*|kQyQ`v)R z@Aw>?9^z0z3MKLq)EEZjWr~ap$Sd$U@+vF_Bvb7luJ6V$v~!W=@vq&*me(t|WC;|7 zF7pd^{eoc+!sAfk0eH)W8nHgaeb({w(h)I zuG$FKM`o^%4P2inu1{SqfA}h76=T)#ZLDX~lG^a#$!7>KjNRw#7a6-R;KQpEu;HC{ zq{-;gqZeP;msUV;4(0jrcB zJ+RrKiR8Xd=x^Mg5M!YEnO!3}{{o*Qzru1N@eekwW+J5{8}~Oe*Y5_dKNQ!WE|<4) zM~{%d7#kV*Cixp)gNT0^91-y^e2#R$t5czYA6+UP;c*9^uZkJKiWm?9>ja-8og-jn z(!~P{4}4R25|+q1h%wNt%dQccuJGY=&9EGJ{t>Fx417~$;JcZ*);Dl%pt!nguA!^n zgL4tC4b5CV3|u`G*G4XvzmY3sW5yQl!MX6@NiPH##%>e#i;Uf-@Zp^a*zooqDZLr1 zIQT@W`Yl(zM%M>^1N7z$k3eq$AKsII4MUHTtr)BJVZM`R0NdIC)>naT69KD~ejeCt z^+bfabHzl%<542pBFaG2pPeIFZwH?v17JCs_{YNBVF=&O53hYm7TLtxo7r|Sunkmf z#hR^`&o%~w-sWAu;B7E|H_H90K=e1+5_1lN3^@!|ISkP`Oj`RK@UE@OHJZava}L7{ zISf}hjL&TbJ)d@!>%fau{wuQYnubVQfHo%a|*aOduN_XvsPIt-m8d%;_9tY!b`QFgil*xLojTab!P+>PO#a%&0Dv{ozn2@a%$y=B&&`b*>!1s44&AKV_;qrqwGr*_^c2=34p6 z3<5Iz9x7gp(f9|#g+GzSo<9D}Qklm{7qymcA=FOk3~|(#A*L56hTy{ESxeAIfn zv}C3hm+?c&jR*+RXhO}AW_HpN0azw2aA?2&{D!4Q2=|S zf;~#Zei*>!W3Z#zgTEL!rk0~HMR*@d3 zNyqSl-`&xUUu)9xc9fn_KzgDgJ;^0iJJrl;>{KViq1jJiTUDoEx70g$DR*$p|Kd{a z;HM%gn5w6thHnC~C#LGU!A3hng8@$Oq5MSg)O1c7Y5|1j3(qV7I!gha?E+cX>Xgp$ zQM|c%Fi?1?!Bh%?sq}atia9xd182fLQk6kS%*muYCj)%RVW*s1ko9>g>+^NiN9n9D zP+6Vd8}|yucBRXf*(BJwR~ZPWwnI3$UmJvKU-jF9@lP@YhT5Fmp%T1PWwV*pwW_z( zx2lgdV9FP`(y&%lZE6jw+Q1q<Km#=p8zRb-8s z@{fwd;kK%?HFnBhto3GX9c#jrKUm{m*{SMiRjJDk_4gU8I$3*7`I~KvFhEsZtm-L$ zvbLPH&8^v0>svKbex`?i)TXMtRX62F)~;r)ht)9Ud)D|jpG{fElWQ3v_o%je4Q-`zpK80GZG4Tnz9m=Fl*+V7syUUF2jHHj4x}xD zZwl8~4Iq_TYl%Flx;+G2Y~2W{$5s`&iD50CP0PY`T}`cpm#5S6Fk>QBlB7JU5I3A=Z8go>dRnXEu5D>u(AdyiU!#)9 z%43Lev%+8PNHu30TAJM^c^obHoRh-1v8onX)wZfn124p%Dq&a-??pd}=Ghr&9bT8p zrG}^Q$3jxsY{T$L(`S^8C`$~VI;kQxb^P#pE46TFQ+-9MskXVXWq1=ls@#|zo^9H0 zyMqU6xPd$N8<-eYF|4$vu?1fktIZ7@(r+L@(`+Et&=HmR|3F8cA}4O{bf{0YVoaV! 
zb*?X@GM$as%( a, 0 ) * b; } + inline __host__ __device__ constexpr cxsmpl + operator*( const cxsmpl& a, const double& b ) + { + return a * cxsmpl( b, 0 ); + } + template inline __host__ __device__ constexpr cxsmpl operator/( const cxsmpl& a, const cxsmpl& b ) From b3562ff80f4a11b0399ae76befcc45149412006f Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Fri, 29 Sep 2023 16:25:12 +0200 Subject: [PATCH 492/509] Updated first name in Author list --- .../CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS index 0aeb2c8a87..71519d1ad8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS @@ -10,7 +10,7 @@ generates includes the following authors: Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) From ce629c2b7b101b1c3b6e943eaf30c689ffd3d5bb Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 17:14:51 +0100 Subject: [PATCH 493/509] [jt774] remove '#include "GpuAbstraction.h"' from CODEGEN mgOnGpuVectors.h and process_matrix.inc as in branch jthip24 These are changes that in that branch I included in commitcommit 6e90139833db998d1f6b2546d16c33c357804b24 (Tue Jul 18 18:25:34 2023 +0200), which consisted in a backport to CODEGEN of earlier changes in ggttggg.mad. --- .../madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h | 2 -- .../madgraph/iolibs/template_files/gpu/process_matrix.inc | 2 -- 2 files changed, 4 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h index dd8b83752d..cdae04326b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h @@ -9,8 +9,6 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuFptypes.h" -#include "GpuAbstraction.h" - #include //========================================================================== diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc index 84e324a679..960f029d8d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -7,8 +7,6 @@ ! Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
!========================================================================== -#include "GpuAbstraction.h" - // *** COLOR CHOICE BELOW *** // Store the leading color flows for choice of color if( jamp2_sv ) // disable color choice if nullptr From 7363e1fde1cda981c64746eb095abd94425d1266 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 19:06:32 +0200 Subject: [PATCH 494/509] [jthip] in CODEGEN, remove the copying to src of GpuRuntime.h and GpuAbstraction.h --- epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index 3e1d68d1fb..c89295c01f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -88,7 +88,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): 'CMake': [s+'CMake/Compilers.txt', s+'CMake/Platforms.txt', s+'CMake/Macros.txt'], 'src': [s+'gpu/rambo.h', s+'read_slha.h', s+'read_slha.cc', s+'gpu/mgOnGpuFptypes.h', s+'gpu/mgOnGpuCxtypes.h', s+'gpu/mgOnGpuVectors.h', - s+'CMake/src/CMakeLists.txt', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h'], + s+'CMake/src/CMakeLists.txt' ], 'SubProcesses': [s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', s+'gpu/ompnumthreads.h', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h', s+'gpu/MemoryAccessHelpers.h', s+'gpu/MemoryAccessVectors.h', From 47e2b8f71ddd18df57e3b01be8b8cfeba8b8ad0a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 17:22:32 +0100 Subject: [PATCH 495/509] [jt774] in CODEGEN mgOnGpuFptypes.h, replace one more __CUDACC__ by MGONGPUCPP_GPUIMPL... not clear why this was not done yet In branch jthip24, this is coming from Jorgen's commit 6741186a50be8c16a09630a959a6327d2b4a7a8a (Thu Jul 13 15:15:41 2023 +0200) which includes many such changes --- .../madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h index 83a46c1d4e..fa3a02664b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu From 721652e3d25b4332df673f92134f11a759dc5454 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 18:11:04 +0200 Subject: [PATCH 496/509] [jt774] cherry-pick commit 1b5c0fdff ([jthip] backport to CODEGEN from ggttgg.mad on Tue Jul 18 18:11:04 2023 +0200) Fix conflicts: epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h NB: this is very strange, because this same commit 1b5c0fdff is already included in the jt774 branch earlier on... 
--- .../iolibs/template_files/gpu/check_sa.cc | 18 +++++++++++------- .../iolibs/template_files/gpu/mgOnGpuConfig.h | 3 --- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index b9a05dea46..c336edb68a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -761,7 +761,7 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ rndgentxt += " (HIP code)"; @@ -771,8 +771,8 @@ main( int argc, char** argv ) // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? -#ifdef MGONGPUCPP_GPUIMPL + // -- CUDA or HIP or C++? +#ifdef __CUDACC__ wrkflwtxt += "CUD:"; #elif defined __HIPCC__ wrkflwtxt += "HIP:"; @@ -790,7 +790,7 @@ main( int argc, char** argv ) wrkflwtxt += "???+"; // no path to this statement #endif // -- CUCOMPLEX or THRUST or STD complex numbers? -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; #elif defined MGONGPU_CUCXTYPE_THRUST @@ -806,6 +806,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -906,7 +912,7 @@ main( int argc, char** argv ) #endif // Dump all configuration parameters and all results std::cout << std::string( SEP79, '*' ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" #elif defined __HIPCC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" @@ -936,7 +942,6 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST @@ -1078,7 +1083,6 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index bca351fa89..46a8f0efc0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -6,8 +6,6 @@ #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 -#include "GpuRuntime.h" // Includes the GPU abstraction - // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) 
%(mgongpu_supports_multichannel)s @@ -17,7 +15,6 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip -#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif From 71a9ece3cf70058569457f44f95f50b16ba51421 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 16:35:03 +0100 Subject: [PATCH 497/509] [jthip24] (after merging upstream/master) fix clang formatting in CODEGEN (code generation was failing clang formatting checks) This is a cherry-pick of f44a9c77344c1dd2f18c08e48715fe723a32e588 from jthip24 --- .../madgraph/iolibs/template_files/gpu/check_sa.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index c336edb68a..748ecad0cc 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -778,7 +778,7 @@ main( int argc, char** argv ) wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT /* clang-format off */ wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -788,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX From 809001313092940ef7799b9eec7d119cb229e82e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 17:09:21 +0100 Subject: [PATCH 498/509] [jt774] copy CODEGEN check_sa.cc from jthip24 as-is (currently f44a9c77344c1dd2f18c08e48715fe723a32e588) Code generation is now succeeding (it was previously failing in clang-format) git checkout f44a9c77344c1dd2f18c08e48715fe723a32e588 CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc --- .../madgraph/iolibs/template_files/gpu/check_sa.cc | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index 748ecad0cc..7cac5ab47b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -780,7 +780,7 @@ main( int argc, char** argv ) wrkflwtxt += "CPP:"; #endif /* clang-format off */ // -- DOUBLE or FLOAT? 
-#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT /* clang-format off */ +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) #elif defined MGONGPU_FPTYPE_DOUBLE wrkflwtxt += "DBL+"; @@ -799,12 +799,6 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif /* clang-format on */ -#elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL - wrkflwtxt += "CXS:"; -#else - wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ #if defined MGONGPU_CUCXTYPE_CXSMPL @@ -1090,7 +1084,7 @@ main( int argc, char** argv ) #elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl #else - << "\"???\"," << std::endl // no path to this statement... + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" From 9a8c86c144cdba81eea5dd6f111eb40d90cbd537 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 17:14:11 +0100 Subject: [PATCH 499/509] [jt774] add one empty line in CODEGEN MemoryAccessMomenta.h as in jthip24 (I accidentally removed it in commit e32bc4e6ea9ac0c3808c9644e5526c1b2bda3db2 on Wed Jan 24 10:36:44 2024 +0100) --- .../madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h | 1 + 1 file changed, 1 insertion(+) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h index 86df5d5471..3be229d392 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h @@ -27,6 +27,7 @@ namespace mg5amcCpu class MemoryAccessMomentaBase //_AOSOAv1 { public: + // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ From d1131c1acbbb9688e0d1224abf228f5f074501cf Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 16:46:19 +0100 Subject: [PATCH 500/509] [jthip24] Remove hip_runtime.h from CODEGEN mgOnGpuConfig.h and add it back to GpuAbstraction.h Revert "[CODEGEN] Added HIP runtime include in mgOnGpuConfig.h in codegen" This reverts Jorgen's commit 35913a385f9961f4ca8e67aabaa37940149c5aa5 (2023-07-13 15:15:41 +0200) --- .../madgraph/iolibs/template_files/gpu/GpuAbstraction.h | 2 ++ .../madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 9c467b1e04..6a7d9c05c0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -39,6 +39,8 @@ #elif defined __HIPCC__ +#include "hip/hip_runtime.h" + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git 
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index ca32045bd4..46a8f0efc0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -15,7 +15,6 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip -#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif From 0a069c7e4f1c90dd5fd285b4621f766f3f6c230a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 16:46:19 +0100 Subject: [PATCH 501/509] [jthip24] Remove hip_runtime.h from CODEGEN mgOnGpuConfig.h and add it back to GpuAbstraction.h Revert "[CODEGEN] Added HIP runtime include in mgOnGpuConfig.h in codegen" This reverts Jorgen's commit 35913a385f9961f4ca8e67aabaa37940149c5aa5 (2023-07-13 15:15:41 +0200) This is a cherry-pick in jt774 of d1131c1acbbb9688e0d1224abf228f5f074501cf from jthip24 --- .../madgraph/iolibs/template_files/gpu/GpuAbstraction.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 9c467b1e04..6a7d9c05c0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -39,6 +39,8 @@ #elif defined __HIPCC__ +#include "hip/hip_runtime.h" + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString From 89170af9217e16fc74acbbdc705dc92855251bf6 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 17:29:59 +0100 Subject: [PATCH 502/509] [jt774] copy CODEGEN cpp_model_parameters_h.inc from jthip24 as-is (currently af0f0d4458fd5089ff47188b5631e6aa8e1014f3) This is meant to fix the build of the code generated ggtt.mad (it was failing before in jt774 while it succeeds in jthip24) However code generation is now failing in clang formatting git checkout af0f0d4458fd5089ff47188b5631e6aa8e1014f3 CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc --- .../template_files/cpp_model_parameters_h.inc | 36 ++++++++----------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc index 8b8797c04c..94b8dd6444 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc @@ -212,34 +212,28 @@ namespace mg5amcCpu //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL - namespace mg5amcGpu -#else - namespace mg5amcCpu -#endif - { #pragma GCC diagnostic push #ifndef __clang__ #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif - // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs - template - __device__ inline void - G2COUP( const fptype gs[], - fptype couplings[] ) - { - mgDebug( 0, __FUNCTION__ ); - using namespace Parameters_%(model_name)s_dependentCouplings; - const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + // Compute the output couplings (e.g. gc10 and gc11) from the input gs + template + __device__ inline void + G2COUP( const fptype gs[], + fptype couplings[] ) + { + mgDebug( 0, __FUNCTION__ ); + using namespace Parameters_%(model_name)s_dependentCouplings; + const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); %(dcoupaccessbuffer)s%(dcoupkernelaccess)s%(dcoupcompute)s - mgDebug( 1, __FUNCTION__ ); - return; - } + mgDebug( 1, __FUNCTION__ ); + return; + } #pragma GCC diagnostic pop - } // end namespace mg5amcGpu/mg5amcCpu +} // end namespace mg5amcGpu/mg5amcCpu - //========================================================================== +//========================================================================== #endif // Parameters_%(model_name)s_H From 69d5ed60ba3f202b4350f0e1bc312125b9746c9e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 17:34:17 +0100 Subject: [PATCH 503/509] [jt774] fix clang formatting in CODEGEN model_handling.py after the previous commit (undo the changes here from commit e32bc4e6ea9ac0c3808c9644e5526c1b2bda3db2 on Wed Jan 24 10:36:44 2024 +0100) The code generated ggtt.mad is now finally succeeding in jt774, as it does in jthip24. HOWEVER, the two code branches are not identical yet; there is still a minor difference in makefiles --- .../CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index b585102292..3e0ebe545f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -859,11 +859,11 @@ def super_generate_parameters_class_files(self): replace_dict['dcoupsetdpar'] = ' ' + '\n'.join( dcoupsetdpar ) dcoupsetdcoup = [ ' ' + line.replace('constexpr cxsmpl ','out.').replace('mdl_complexi', 'cI') for line in self.write_hardcoded_parameters(list(self.coups_dep.values())).split('\n') if line != '' ] replace_dict['dcoupsetdcoup'] = ' ' + '\n'.join( dcoupsetdcoup ) - dcoupaccessbuffer = [ ' fptype* %ss = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_%s );'%( name, name ) for name in self.coups_dep ] + dcoupaccessbuffer = [ ' fptype* %ss = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_%s );'%( name, name ) for name in self.coups_dep ] replace_dict['dcoupaccessbuffer'] = '\n'.join( dcoupaccessbuffer ) + '\n' - dcoupkernelaccess = [ ' cxtype_sv_ref %ss_sv = C_ACCESS::kernelAccess( %ss );'%( name, name ) for name in self.coups_dep ] + dcoupkernelaccess = [ ' cxtype_sv_ref %ss_sv = C_ACCESS::kernelAccess( %ss );'%( name, name ) for name in self.coups_dep ] replace_dict['dcoupkernelaccess'] = '\n'.join( dcoupkernelaccess ) + '\n' - dcoupcompute = [ ' %ss_sv = couplings_sv.%s;'%( name, name ) for name in self.coups_dep ] + dcoupcompute = [ ' %ss_sv = couplings_sv.%s;'%( name, name ) for name in self.coups_dep ] replace_dict['dcoupcompute'] = '\n'.join( dcoupcompute ) # Special handling in EFT for fptype=float
using SIMD dcoupoutfptypev2 = [ ' fptype_v %sr_v;\n fptype_v %si_v;'%(name,name) for name in self.coups_dep ] From d43cfeb5d02ded294cc291bcb6a9b58fec379287 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 17:43:48 +0100 Subject: [PATCH 504/509] [jt774] copy CODEGEN cudacpp.mk from jthip24 as-is (currently 4ba21335e6ae4a0b1ea379cfdf565d72030f7a2e) git checkout 4ba21335e6ae4a0b1ea379cfdf565d72030f7a2e CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk NB: CODEGEN in jt774 is now identical to that in jthip24! --- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 011a5326ab..dbca8e330f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -180,7 +180,8 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! endif CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h @@ -573,7 +574,7 @@ $(BUILDDIR)/%%.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math ifeq ($(findstring nvcc,$(GPUCC)),nvcc) $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math From 5e424fe41675a179291eb9b2d8d8277131c01fe5 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 17:46:29 +0100 Subject: [PATCH 505/509] [jt774] regenerate ggtt.mad - add the previously absent GpuAbstraction.h and GpuRuntime.h files and symlinks --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 22 +- epochX/cudacpp/gg_tt.mad/COPYRIGHT | 1 + .../cudacpp/gg_tt.mad/SubProcesses/Bridge.h | 32 +-- .../gg_tt.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gg_tt.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gg_tt.mad/SubProcesses/EventStatistics.h | 4 +-
.../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_tt.mad/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_tt.mad/SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttx/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttx/GpuAbstraction.h | 1 + .../SubProcesses/P1_gg_ttx/GpuRuntime.h | 1 + .../SubProcesses/P1_gg_ttx/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../cudacpp/gg_tt.mad/SubProcesses/fbridge.cc | 16 +- .../gg_tt.mad/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gg_tt.mad/SubProcesses/runTest.cc | 12 +- .../gg_tt.mad/SubProcesses/testmisc.cc | 8 +- .../cudacpp/gg_tt.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h | 10 +- epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h | 73 ++++-- epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h | 28 +-- epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h | 12 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_tt.mad/src/rambo.h | 8 +- 52 files changed, 625 insertions(+), 434 deletions(-) create mode 100644 epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h rename epochX/cudacpp/gg_tt.mad/SubProcesses/{CudaRuntime.h => GpuRuntime.h} (62%) delete mode 120000 epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index a477013568..360771ac98 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005816459655761719  +DEBUG: model prefixing takes 0.005492210388183594  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.103 s +Wrote files for 10 helas calls in 0.101 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.155 s +ALOHA: aloha creates 2 routines in 0.145 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.135 s +ALOHA: aloha creates 4 routines in 0.131 s VVV1 FFV1 FFV1 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.729s -user 0m1.515s -sys 0m0.204s +real 0m1.713s +user 0m1.482s +sys 0m0.227s Code generation completed in 2 seconds ************************************************************ * * @@ -266,7 +266,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -296,7 +296,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_tt.mad/COPYRIGHT b/epochX/cudacpp/gg_tt.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_tt.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 
+249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h similarity index 62% rename from epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h rename to epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h index 64ce52f4b3..93579ef08b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h @@ -1,49 +1,50 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
-#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api -#include +#include "GpuAbstraction.h" + #include //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) { - if( code != cudaSuccess ) + if( code != gpuSuccess ) { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); } } #endif /* clang-format on */ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final + struct GpuRuntime final { - CudaRuntime( const bool debug = true ) + GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; bool m_debug; // Set up CUDA application @@ -62,8 +63,8 @@ namespace mg5amcGpu */ // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! 
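+    // (both the CUDA and the HIP code paths go through the same gpu* names
+    // defined in GpuAbstraction.h, so this setup code is identical for both)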
} // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer 
: public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 18052b6676..f20c229897 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; 
++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -302,7 +303,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -359,7 +360,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -418,7 +419,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -465,8 +466,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -506,9 +507,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -544,7 +545,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
 // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file]
 // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712]
 #ifdef __NVCC__
@@ -609,12 +610,12 @@ namespace mg5amcCpu
   __global__ void /* clang-format off */
   computeDependentCouplings( const fptype* allgs, // input: Gs[nevt]
                              fptype* allcouplings // output: couplings[nevt*ndcoup*2]
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
                              , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
                              ) /* clang-format on */
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     using namespace mg5amcGpu;
     using G_ACCESS = DeviceAccessGs;
     using C_ACCESS = DeviceAccessCouplings;
@@ -635,7 +636,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
   __global__ void
   sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
@@ -761,9 +762,9 @@ namespace mg5amcCpu
         nGoodHel++;
       }
     }
-#ifdef __CUDACC__
-    checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) );
-    checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) );
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) );
+    gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) );
 #else
     cNGoodHel = nGoodHel;
     for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel];
@@ -787,7 +788,7 @@ namespace mg5amcCpu
 #endif
             int* allselhel, // output: helicity selection[nevt]
             int* allselcol // output: color selection[nevt]
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
             , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
             ) /* clang-format on */
@@ -807,7 +808,7 @@ namespace mg5amcCpu
     // Denominators: spins, colors and identical particles
     constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     // Remember: in CUDA this is a kernel for one event, in c++ this processes n events
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
 #else
@@ -821,9 +822,12 @@ namespace mg5amcCpu
 #endif
 
     // Start sigmaKin_lines
+
+#include "GpuAbstraction.h"
+
     // === PART 0 - INITIALISATION (before calculate_wavefunctions) ===
     // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     allMEs[ievt] = 0;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     allNumerators[ievt] = 0;
@@ -851,7 +855,7 @@ namespace mg5amcCpu
 
     // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS ===
     // (in both CUDA and C++, using precomputed good helicities)
-#ifdef __CUDACC__ // CUDA OR C++
+#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
 
     // *** START OF PART 1a - CUDA (one event per GPU thread) ***
 
     // Running sum of partial amplitudes squared for event by event color selection (#402)
@@ -1061,7 +1065,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
     // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     allMEs[ievt] /= helcolDenominators[0];
 #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 3ebd92c038..4a88a07226 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ 
b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
   bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
     else if( arg == "--curdev" )
     {
 #ifndef __CUDACC__
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
 #elif defined MGONGPU_HAS_NO_CURAND
       throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
 #else
@@ -198,7 +201,7 @@
     }
     else if( arg == "--rmbdev" )
     {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       rmbsmp = RamboSamplingMode::RamboDevice;
 #else
       throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@ main( int argc, char** argv )
       return usage( argv[0] );
   }
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
   ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
   // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
   // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@ main( int argc, char** argv )
 
   // === STEP 0 - INITIALISE
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 
-  // --- 00. Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime GpuRuntime( debug );
 #endif
 
   // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
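Note how the makefile keeps a single GPUCC slot and compiles the same sources either way; on the source side this is mirrored by the blanket rename of '#ifdef __CUDACC__' to '#ifdef MGONGPUCPP_GPUIMPL' seen throughout this patch, so the code only needs to know "GPU build or CPU build". The macro's definition is not visible in this excerpt; a plausible sketch consistent with its call sites (an assumption, not the shipped header):

  // Assumed umbrella macro for "this is a GPU build": nvcc predefines
  // __CUDACC__ when compiling CUDA sources, hipcc predefines __HIPCC__.
  #if defined __CUDACC__ || defined __HIPCC__
  #define MGONGPUCPP_GPUIMPL 1
  #endif

Code that is genuinely CUDA-only, such as the curand and NVTX paths in check_sa.cc above, keeps testing __CUDACC__ directly.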
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest
@@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX)
@@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" {
@@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
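// NB (editorial sketch, not part of the patch): every translation unit in this
// series repeats the same backend selection seen in the fbridge.cc and
// fsampler.cc hunks above; namespaces mg5amcGpu and mg5amcCpu include types
// which are defined in different ways for CPU and GPU builds (see #318 and
// #725). Minimal form, assuming only the macros and namespaces already shown
// in these diffs:
#include "mgOnGpuConfig.h" // defines (or undefines) MGONGPUCPP_GPUIMPL
#ifdef MGONGPUCPP_GPUIMPL
using namespace mg5amcGpu; // GPU (CUDA or HIP) implementation
#else
using namespace mg5amcCpu; // C++/SIMD implementation
#endif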
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h index 55f43bb43a..add8fce575 100644 --- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc index a9bc93ff98..c5dd6e7e4c 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index 932f123fea..5f2f4391b9 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #-------------------------------------------------------------------------------
@@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the correct GPU-specific compilation flags for nvcc or hipcc +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h
index 80032e528b..55d03f1252 100644
--- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
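// NB (editorial sketch, not part of the patch): runTest.cc above now calls
// checkGpu( gpuDeviceReset() ) through the gpu* aliases of GpuAbstraction.h;
// only the HIP branch of that header appears later in this series, so the
// CUDA branch below is an assumption for illustration, mirroring the HIP
// mappings that are shown:
#ifdef __CUDACC__
#define gpuError_t cudaError_t // assumed CUDA counterpart
#define gpuPeekAtLastError cudaPeekAtLastError
#define gpuGetErrorString cudaGetErrorString
#define gpuDeviceReset cudaDeviceReset
#elif defined __HIPCC__
#define gpuError_t hipError_t // as in the GpuAbstraction.h hunk shown later
#define gpuPeekAtLastError hipPeekAtLastError
#define gpuGetErrorString hipGetErrorString
#define gpuDeviceReset hipDeviceReset
#endif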
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
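// NB (editorial sketch, not part of the patch): the empty
// __global__/__host__/__device__ defines added above for non-GPU builds are
// what let GPU-annotated code compile as plain C++; e.g. this hypothetical
// helper (not in the patch) builds unchanged with nvcc, hipcc or g++:
#include "mgOnGpuConfig.h"
inline __host__ __device__ double fpsquare( const double x )
{
  return x * x; // in a C++ build the specifiers expand to nothing
}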
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif 
// #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt.mad/src/rambo.h b/epochX/cudacpp/gg_tt.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt.mad/src/rambo.h +++ b/epochX/cudacpp/gg_tt.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) From ddb77e9d2052538f81a7868649e65c408a3f924e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 17:47:53 +0100 Subject: [PATCH 506/509] [jthip24] regenerate gg_tt.mad (this is now identical to the version in jt774) --- .../cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 16 ++++++++-------- .../gg_tt.mad/SubProcesses/GpuAbstraction.h | 2 ++ epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h | 1 - 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 5473d52ae9..25807a1217 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005357503890991211  +DEBUG: model prefixing takes 0.00539088249206543  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.101 s +Wrote files for 10 helas calls in 0.100 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.143 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.133 s +ALOHA: aloha creates 4 routines in 0.132 s VVV1 FFV1 FFV1 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.689s -user 0m1.475s -sys 0m0.213s +real 0m1.694s +user 0m1.467s +sys 0m0.223s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h index 9c467b1e04..6a7d9c05c0 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -39,6 +39,8 @@ #elif defined __HIPCC__ +#include "hip/hip_runtime.h" + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index a9515694ae..55d03f1252 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -15,7 +15,6 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip -#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif From 9bc201828248fa99914945b0760ae9dd0e0763a6 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 17:52:34 +0100 Subject: [PATCH 507/509] [jthip24] regenerate all processes - add to repo Gpu*.h when missing (eg in pp_tt012j) NB: Now all processes in the repo are the same as in jt774, except for codegen logs: will now copy over those codegen logs --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 171 +- .../ee_mumu.mad/Cards/me5_configuration.txt | 4 +- .../ee_mumu.mad/Cards/proc_card_mg5.dat | 6 +- epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat | 32 +- .../ee_mumu.mad/Cards/run_card_default.dat | 26 +- epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt | 2 +- .../ee_mumu.mad/Source/DHELAS/aloha_file.inc | 2 +- .../ee_mumu.mad/Source/PDF/pdfwrap_lhapdf.f | 1 + 
epochX/cudacpp/ee_mumu.mad/Source/make_opts | 17 +- epochX/cudacpp/ee_mumu.mad/Source/makefile | 4 +- .../cudacpp/ee_mumu.mad/Source/param_card.inc | 14 +- epochX/cudacpp/ee_mumu.mad/Source/vector.inc | 3 +- .../cudacpp/ee_mumu.mad/SubProcesses/Bridge.h | 14 +- .../ee_mumu.mad/SubProcesses/GpuAbstraction.h | 2 + .../ee_mumu.mad/SubProcesses/MGVersion.txt | 2 +- .../ee_mumu.mad/SubProcesses/MadgraphTest.h | 12 +- .../SubProcesses/MatrixElementKernels.cc | 9 +- .../SubProcesses/P1_epem_mupmum/CPPProcess.cc | 181 +- .../SubProcesses/P1_epem_mupmum/CPPProcess.h | 2 +- .../SubProcesses/P1_epem_mupmum/auto_dsig.f | 2 +- .../SubProcesses/P1_epem_mupmum/auto_dsig1.f | 25 +- .../SubProcesses/P1_epem_mupmum/check_sa.cc | 87 +- .../SubProcesses/P1_epem_mupmum/counters.cc | 18 +- .../SubProcesses/P1_epem_mupmum/matrix1.f | 6 +- .../ee_mumu.mad/SubProcesses/cudacpp.mk | 273 +- .../ee_mumu.mad/SubProcesses/dummy_fct.f | 10 +- .../ee_mumu.mad/SubProcesses/fbridge.cc | 6 +- .../cudacpp/ee_mumu.mad/SubProcesses/genps.f | 4 +- .../cudacpp/ee_mumu.mad/SubProcesses/makefile | 19 +- .../ee_mumu.mad/SubProcesses/runTest.cc | 4 + .../ee_mumu.mad/SubProcesses/testxxx.cc | 3 +- .../cudacpp/ee_mumu.mad/bin/generate_events | 22 +- .../ee_mumu.mad/bin/internal/__init__.py | 1 + .../ee_mumu.mad/bin/internal/banner.py | 326 +- .../bin/internal/check_param_card.py | 2 +- .../bin/internal/common_run_interface.py | 28 +- .../ee_mumu.mad/bin/internal/extended_cmd.py | 8 +- .../ee_mumu.mad/bin/internal/gen_ximprove.py | 15 +- .../ee_mumu.mad/bin/internal/lhe_parser.py | 77 +- .../bin/internal/madevent_interface.py | 29 +- .../cudacpp/ee_mumu.mad/bin/internal/misc.py | 2 +- .../ee_mumu.mad/bin/internal/shower_card.py | 10 +- .../bin/internal/ufomodel/py3_model.pkl | Bin 42822 -> 0 bytes epochX/cudacpp/ee_mumu.mad/bin/madevent | 26 +- epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h | 32 +- .../cudacpp/ee_mumu.mad/src/Parameters_sm.cc | 2 +- .../cudacpp/ee_mumu.mad/src/Parameters_sm.h | 2 +- epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk | 11 +- .../cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 16 +- .../cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h | 6 + .../CODEGEN_cudacpp_ee_mumu_log.txt | 82 +- .../cudacpp/ee_mumu.sa/SubProcesses/Bridge.h | 14 +- .../ee_mumu.sa/SubProcesses/GpuAbstraction.h | 2 + .../ee_mumu.sa/SubProcesses/MadgraphTest.h | 12 +- .../SubProcesses/MatrixElementKernels.cc | 9 +- .../P1_Sigma_sm_epem_mupmum/CPPProcess.cc | 181 +- .../P1_Sigma_sm_epem_mupmum/CPPProcess.h | 2 +- .../P1_Sigma_sm_epem_mupmum/check_sa.cc | 87 +- .../ee_mumu.sa/SubProcesses/cudacpp.mk | 273 +- .../ee_mumu.sa/SubProcesses/fbridge.cc | 6 +- .../ee_mumu.sa/SubProcesses/runTest.cc | 4 + .../ee_mumu.sa/SubProcesses/testxxx.cc | 3 +- epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h | 32 +- .../cudacpp/ee_mumu.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h | 2 +- epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk | 11 +- epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h | 16 +- .../cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h | 6 + .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 14 +- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 87 +- epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h | 14 +- .../gg_tt.sa/SubProcesses/GpuAbstraction.h | 2 + .../gg_tt.sa/SubProcesses/MadgraphTest.h | 12 +- .../SubProcesses/MatrixElementKernels.cc | 9 +- .../P1_Sigma_sm_gg_ttx/CPPProcess.cc | 170 +- .../P1_Sigma_sm_gg_ttx/CPPProcess.h | 2 +- .../P1_Sigma_sm_gg_ttx/check_sa.cc | 87 +- .../cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 273 +- 
.../cudacpp/gg_tt.sa/SubProcesses/fbridge.cc | 6 +- .../cudacpp/gg_tt.sa/SubProcesses/runTest.cc | 4 + .../cudacpp/gg_tt.sa/SubProcesses/testxxx.cc | 3 +- epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h | 10 +- epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h | 2 +- epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk | 11 +- epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h | 16 +- epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h | 6 + .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 30 +- epochX/cudacpp/gg_tt01g.mad/COPYRIGHT | 1 + .../gg_tt01g.mad/SubProcesses/Bridge.h | 32 +- .../SubProcesses/BridgeKernels.cc | 9 +- .../gg_tt01g.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_tt01g.mad/SubProcesses/CudaRuntime.h | 85 - .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../SubProcesses/EventStatistics.h | 4 +- .../SubProcesses/GpuAbstraction.h | 2 + .../gg_tt01g.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_tt01g.mad/SubProcesses/MemoryBuffers.h | 64 +- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 62 +- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttx/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttx/check_sa.cc | 111 +- .../SubProcesses/P2_gg_ttxg/CPPProcess.cc | 62 +- .../SubProcesses/P2_gg_ttxg/CPPProcess.h | 10 +- .../SubProcesses/P2_gg_ttxg/CudaRuntime.h | 1 - .../SubProcesses/P2_gg_ttxg/check_sa.cc | 111 +- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_tt01g.mad/SubProcesses/cudacpp.mk | 232 +- .../gg_tt01g.mad/SubProcesses/fbridge.cc | 16 +- .../gg_tt01g.mad/SubProcesses/fsampler.cc | 8 +- .../gg_tt01g.mad/SubProcesses/runTest.cc | 12 +- .../gg_tt01g.mad/SubProcesses/testmisc.cc | 8 +- .../gg_tt01g.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_tt01g.mad/src/Parameters_sm.cc | 4 +- .../cudacpp/gg_tt01g.mad/src/Parameters_sm.h | 10 +- .../cudacpp/gg_tt01g.mad/src/cudacpp_src.mk | 23 +- .../cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h | 73 +- .../cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h | 28 +- .../cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_tt01g.mad/src/rambo.h | 8 +- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 182 +- .../gg_ttg.mad/Cards/me5_configuration.txt | 4 +- .../gg_ttg.mad/Cards/proc_card_mg5.dat | 6 +- epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat | 32 +- .../gg_ttg.mad/Cards/run_card_default.dat | 26 +- epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt | 2 +- .../gg_ttg.mad/Source/DHELAS/aloha_file.inc | 2 +- .../gg_ttg.mad/Source/PDF/pdfwrap_lhapdf.f | 1 + 
epochX/cudacpp/gg_ttg.mad/Source/make_opts | 17 +- epochX/cudacpp/gg_ttg.mad/Source/makefile | 4 +- .../cudacpp/gg_ttg.mad/Source/param_card.inc | 14 +- epochX/cudacpp/gg_ttg.mad/Source/vector.inc | 3 +- .../cudacpp/gg_ttg.mad/SubProcesses/Bridge.h | 14 +- .../gg_ttg.mad/SubProcesses/GpuAbstraction.h | 2 + .../gg_ttg.mad/SubProcesses/MGVersion.txt | 2 +- .../gg_ttg.mad/SubProcesses/MadgraphTest.h | 12 +- .../SubProcesses/MatrixElementKernels.cc | 9 +- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 220 +- .../SubProcesses/P1_gg_ttxg/CPPProcess.h | 2 +- .../SubProcesses/P1_gg_ttxg/auto_dsig.f | 2 +- .../SubProcesses/P1_gg_ttxg/auto_dsig1.f | 23 +- .../SubProcesses/P1_gg_ttxg/check_sa.cc | 87 +- .../SubProcesses/P1_gg_ttxg/counters.cc | 18 +- .../SubProcesses/P1_gg_ttxg/matrix1.f | 55 +- .../gg_ttg.mad/SubProcesses/cudacpp.mk | 273 +- .../gg_ttg.mad/SubProcesses/dummy_fct.f | 10 +- .../gg_ttg.mad/SubProcesses/fbridge.cc | 6 +- .../cudacpp/gg_ttg.mad/SubProcesses/genps.f | 4 +- .../cudacpp/gg_ttg.mad/SubProcesses/makefile | 19 +- .../gg_ttg.mad/SubProcesses/runTest.cc | 4 + .../gg_ttg.mad/SubProcesses/testxxx.cc | 3 +- epochX/cudacpp/gg_ttg.mad/bin/generate_events | 22 +- .../gg_ttg.mad/bin/internal/__init__.py | 1 + .../cudacpp/gg_ttg.mad/bin/internal/banner.py | 326 +- .../bin/internal/check_param_card.py | 2 +- .../bin/internal/common_run_interface.py | 28 +- .../gg_ttg.mad/bin/internal/extended_cmd.py | 8 +- .../gg_ttg.mad/bin/internal/gen_ximprove.py | 15 +- .../gg_ttg.mad/bin/internal/lhe_parser.py | 77 +- .../bin/internal/madevent_interface.py | 29 +- .../cudacpp/gg_ttg.mad/bin/internal/misc.py | 2 +- .../gg_ttg.mad/bin/internal/shower_card.py | 10 +- .../bin/internal/ufomodel/py3_model.pkl | Bin 42822 -> 0 bytes epochX/cudacpp/gg_ttg.mad/bin/madevent | 26 +- epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h | 20 +- .../cudacpp/gg_ttg.mad/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h | 2 +- epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk | 11 +- epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h | 16 +- .../cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h | 6 + .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 89 +- .../cudacpp/gg_ttg.sa/SubProcesses/Bridge.h | 14 +- .../gg_ttg.sa/SubProcesses/GpuAbstraction.h | 2 + .../gg_ttg.sa/SubProcesses/MadgraphTest.h | 12 +- .../SubProcesses/MatrixElementKernels.cc | 9 +- .../P1_Sigma_sm_gg_ttxg/CPPProcess.cc | 220 +- .../P1_Sigma_sm_gg_ttxg/CPPProcess.h | 2 +- .../P1_Sigma_sm_gg_ttxg/check_sa.cc | 87 +- .../cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 273 +- .../cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc | 6 +- .../cudacpp/gg_ttg.sa/SubProcesses/runTest.cc | 4 + .../cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc | 3 +- epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h | 20 +- epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h | 2 +- epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk | 11 +- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h | 16 +- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h | 6 + .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 182 +- .../gg_ttgg.mad/Cards/me5_configuration.txt | 4 +- .../gg_ttgg.mad/Cards/proc_card_mg5.dat | 6 +- epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat | 32 +- .../gg_ttgg.mad/Cards/run_card_default.dat | 26 +- epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt | 2 +- .../gg_ttgg.mad/Source/DHELAS/aloha_file.inc | 2 +- .../gg_ttgg.mad/Source/PDF/pdfwrap_lhapdf.f | 1 + epochX/cudacpp/gg_ttgg.mad/Source/make_opts | 17 +- epochX/cudacpp/gg_ttgg.mad/Source/makefile | 4 +- 
.../cudacpp/gg_ttgg.mad/Source/param_card.inc | 14 +- epochX/cudacpp/gg_ttgg.mad/Source/vector.inc | 3 +- .../cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h | 14 +- .../gg_ttgg.mad/SubProcesses/GpuAbstraction.h | 2 + .../gg_ttgg.mad/SubProcesses/MGVersion.txt | 2 +- .../gg_ttgg.mad/SubProcesses/MadgraphTest.h | 12 +- .../SubProcesses/MatrixElementKernels.cc | 9 +- .../SubProcesses/P1_gg_ttxgg/CPPProcess.cc | 590 +- .../SubProcesses/P1_gg_ttxgg/CPPProcess.h | 2 +- .../SubProcesses/P1_gg_ttxgg/auto_dsig.f | 2 +- .../SubProcesses/P1_gg_ttxgg/auto_dsig1.f | 23 +- .../SubProcesses/P1_gg_ttxgg/check_sa.cc | 87 +- .../SubProcesses/P1_gg_ttxgg/counters.cc | 18 +- .../SubProcesses/P1_gg_ttxgg/matrix1.f | 644 +- .../gg_ttgg.mad/SubProcesses/cudacpp.mk | 273 +- .../gg_ttgg.mad/SubProcesses/dummy_fct.f | 10 +- .../gg_ttgg.mad/SubProcesses/fbridge.cc | 6 +- .../cudacpp/gg_ttgg.mad/SubProcesses/genps.f | 4 +- .../cudacpp/gg_ttgg.mad/SubProcesses/makefile | 19 +- .../gg_ttgg.mad/SubProcesses/runTest.cc | 4 + .../gg_ttgg.mad/SubProcesses/testxxx.cc | 3 +- .../cudacpp/gg_ttgg.mad/bin/generate_events | 22 +- .../gg_ttgg.mad/bin/internal/__init__.py | 1 + .../gg_ttgg.mad/bin/internal/banner.py | 326 +- .../bin/internal/check_param_card.py | 2 +- .../bin/internal/common_run_interface.py | 28 +- .../gg_ttgg.mad/bin/internal/extended_cmd.py | 8 +- .../gg_ttgg.mad/bin/internal/gen_ximprove.py | 15 +- .../gg_ttgg.mad/bin/internal/lhe_parser.py | 77 +- .../bin/internal/madevent_interface.py | 29 +- .../cudacpp/gg_ttgg.mad/bin/internal/misc.py | 2 +- .../gg_ttgg.mad/bin/internal/shower_card.py | 10 +- .../bin/internal/ufomodel/py3_model.pkl | Bin 42822 -> 0 bytes epochX/cudacpp/gg_ttgg.mad/bin/madevent | 26 +- epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h | 26 +- .../cudacpp/gg_ttgg.mad/src/Parameters_sm.cc | 2 +- .../cudacpp/gg_ttgg.mad/src/Parameters_sm.h | 2 +- epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk | 11 +- .../cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h | 16 +- .../cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h | 6 + .../CODEGEN_cudacpp_gg_ttgg_log.txt | 95 +- .../cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h | 14 +- .../gg_ttgg.sa/SubProcesses/GpuAbstraction.h | 2 + .../gg_ttgg.sa/SubProcesses/MadgraphTest.h | 12 +- .../SubProcesses/MatrixElementKernels.cc | 9 +- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.cc | 590 +- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.h | 2 +- .../P1_Sigma_sm_gg_ttxgg/check_sa.cc | 87 +- .../gg_ttgg.sa/SubProcesses/cudacpp.mk | 273 +- .../gg_ttgg.sa/SubProcesses/fbridge.cc | 6 +- .../gg_ttgg.sa/SubProcesses/runTest.cc | 4 + .../gg_ttgg.sa/SubProcesses/testxxx.cc | 3 +- epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h | 26 +- .../cudacpp/gg_ttgg.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h | 2 +- epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk | 11 +- epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h | 16 +- .../cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h | 6 + .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 190 +- .../gg_ttggg.mad/Cards/me5_configuration.txt | 4 +- .../gg_ttggg.mad/Cards/proc_card_mg5.dat | 6 +- .../cudacpp/gg_ttggg.mad/Cards/run_card.dat | 32 +- .../gg_ttggg.mad/Cards/run_card_default.dat | 26 +- epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt | 2 +- .../gg_ttggg.mad/Source/DHELAS/aloha_file.inc | 2 +- .../gg_ttggg.mad/Source/PDF/pdfwrap_lhapdf.f | 1 + epochX/cudacpp/gg_ttggg.mad/Source/make_opts | 17 +- epochX/cudacpp/gg_ttggg.mad/Source/makefile | 4 +- .../gg_ttggg.mad/Source/param_card.inc | 14 +- epochX/cudacpp/gg_ttggg.mad/Source/vector.inc | 3 +- 
.../gg_ttggg.mad/SubProcesses/Bridge.h | 14 +- .../SubProcesses/GpuAbstraction.h | 2 + .../gg_ttggg.mad/SubProcesses/MGVersion.txt | 2 +- .../gg_ttggg.mad/SubProcesses/MadgraphTest.h | 12 +- .../SubProcesses/MatrixElementKernels.cc | 9 +- .../SubProcesses/P1_gg_ttxggg/CPPProcess.cc | 4706 +++---- .../SubProcesses/P1_gg_ttxggg/CPPProcess.h | 2 +- .../SubProcesses/P1_gg_ttxggg/auto_dsig.f | 2 +- .../SubProcesses/P1_gg_ttxggg/auto_dsig1.f | 23 +- .../SubProcesses/P1_gg_ttxggg/check_sa.cc | 87 +- .../SubProcesses/P1_gg_ttxggg/counters.cc | 18 +- .../SubProcesses/P1_gg_ttxggg/matrix1.f | 10187 ++++++++-------- .../gg_ttggg.mad/SubProcesses/cudacpp.mk | 273 +- .../gg_ttggg.mad/SubProcesses/dummy_fct.f | 10 +- .../gg_ttggg.mad/SubProcesses/fbridge.cc | 6 +- .../cudacpp/gg_ttggg.mad/SubProcesses/genps.f | 4 +- .../gg_ttggg.mad/SubProcesses/makefile | 19 +- .../gg_ttggg.mad/SubProcesses/runTest.cc | 4 + .../gg_ttggg.mad/SubProcesses/testxxx.cc | 3 +- .../cudacpp/gg_ttggg.mad/bin/generate_events | 22 +- .../gg_ttggg.mad/bin/internal/__init__.py | 1 + .../gg_ttggg.mad/bin/internal/banner.py | 326 +- .../bin/internal/check_param_card.py | 2 +- .../bin/internal/common_run_interface.py | 28 +- .../gg_ttggg.mad/bin/internal/extended_cmd.py | 8 +- .../gg_ttggg.mad/bin/internal/gen_ximprove.py | 15 +- .../gg_ttggg.mad/bin/internal/lhe_parser.py | 77 +- .../bin/internal/madevent_interface.py | 29 +- .../cudacpp/gg_ttggg.mad/bin/internal/misc.py | 2 +- .../gg_ttggg.mad/bin/internal/shower_card.py | 10 +- .../bin/internal/ufomodel/py3_model.pkl | Bin 42822 -> 0 bytes epochX/cudacpp/gg_ttggg.mad/bin/madevent | 26 +- epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h | 26 +- .../cudacpp/gg_ttggg.mad/src/Parameters_sm.cc | 2 +- .../cudacpp/gg_ttggg.mad/src/Parameters_sm.h | 2 +- .../cudacpp/gg_ttggg.mad/src/cudacpp_src.mk | 11 +- .../cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h | 16 +- .../cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h | 6 + .../CODEGEN_cudacpp_gg_ttggg_log.txt | 97 +- .../cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h | 14 +- .../gg_ttggg.sa/SubProcesses/GpuAbstraction.h | 2 + .../gg_ttggg.sa/SubProcesses/MadgraphTest.h | 12 +- .../SubProcesses/MatrixElementKernels.cc | 9 +- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.cc | 4706 +++---- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.h | 2 +- .../P1_Sigma_sm_gg_ttxggg/check_sa.cc | 87 +- .../gg_ttggg.sa/SubProcesses/cudacpp.mk | 273 +- .../gg_ttggg.sa/SubProcesses/fbridge.cc | 6 +- .../gg_ttggg.sa/SubProcesses/runTest.cc | 4 + .../gg_ttggg.sa/SubProcesses/testxxx.cc | 3 +- epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h | 26 +- .../cudacpp/gg_ttggg.sa/src/Parameters_sm.cc | 2 +- .../cudacpp/gg_ttggg.sa/src/Parameters_sm.h | 2 +- epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk | 11 +- .../cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h | 16 +- .../cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h | 6 + .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 30 +- epochX/cudacpp/gq_ttq.mad/COPYRIGHT | 1 + .../cudacpp/gq_ttq.mad/SubProcesses/Bridge.h | 32 +- .../gq_ttq.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gq_ttq.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gq_ttq.mad/SubProcesses/EventStatistics.h | 4 +- .../gq_ttq.mad/SubProcesses/GpuAbstraction.h | 2 + .../gq_ttq.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- 
.../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gq_ttq.mad/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gq_ttq.mad/SubProcesses/MemoryBuffers.h | 64 +- .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 62 +- .../SubProcesses/P1_gu_ttxu/CPPProcess.h | 10 +- .../SubProcesses/P1_gu_ttxu/CudaRuntime.h | 1 - .../SubProcesses/P1_gu_ttxu/check_sa.cc | 111 +- .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 62 +- .../SubProcesses/P1_gux_ttxux/CPPProcess.h | 10 +- .../SubProcesses/P1_gux_ttxux/CudaRuntime.h | 1 - .../SubProcesses/P1_gux_ttxux/check_sa.cc | 111 +- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gq_ttq.mad/SubProcesses/cudacpp.mk | 232 +- .../gq_ttq.mad/SubProcesses/fbridge.cc | 16 +- .../gq_ttq.mad/SubProcesses/fsampler.cc | 8 +- .../gq_ttq.mad/SubProcesses/runTest.cc | 12 +- .../gq_ttq.mad/SubProcesses/testmisc.cc | 8 +- .../gq_ttq.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/gq_ttq.mad/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h | 10 +- epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h | 73 +- .../cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h | 28 +- .../cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gq_ttq.mad/src/rambo.h | 8 +- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 16 +- epochX/cudacpp/gq_ttq.sa/COPYRIGHT | 1 + .../cudacpp/gq_ttq.sa/SubProcesses/Bridge.h | 32 +- .../gq_ttq.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gq_ttq.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gq_ttq.sa/SubProcesses/CudaRuntime.h | 85 - .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gq_ttq.sa/SubProcesses/EventStatistics.h | 4 +- .../gq_ttq.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gq_ttq.sa/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gq_ttq.sa/SubProcesses/MemoryBuffers.h | 64 +- .../P1_Sigma_sm_gu_ttxu/CPPProcess.cc | 62 +- .../P1_Sigma_sm_gu_ttxu/CPPProcess.h | 10 +- .../P1_Sigma_sm_gu_ttxu/CudaRuntime.h | 1 - .../P1_Sigma_sm_gu_ttxu/check_sa.cc | 111 +- .../P1_Sigma_sm_gux_ttxux/CPPProcess.cc | 62 +- .../P1_Sigma_sm_gux_ttxux/CPPProcess.h | 10 +- 
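MatrixElementKernels.cc, listed above and shown in full for ee_mumu.mad below, now restricts the __builtin_cpu_supports("sse4.2") probe to x86 targets (__x86_64__ or __i386__); on other targets, per the in-code FIXME added for Mac, the build falls back to assuming NEON-class 128-bit vectors and marks the answer as unknown. A sketch of that dispatch, keeping the preprocessor structure of the hunk (the wrapper function and struct are illustrative):

#include <string>

// Sketch: SIMD-capability probe as in MatrixElementKernels.cc below.
// 'known' records whether 'ok' is a real runtime probe or an assumption.
struct SimdCheck
{
  bool known;
  bool ok;
  std::string tag;
};

static SimdCheck checkSse42()
{
#if defined( __x86_64__ ) || defined( __i386__ )
  // On x86 the compiler built-in gives a definitive runtime answer.
  return { true, __builtin_cpu_supports( "sse4.2" ) != 0, "nehalem (SSE4.2)" };
#else
  // Elsewhere __builtin_cpu_supports("sse4.2") may not exist: assume
  // 128-bit vectors are available and flag the answer as a guess.
  return { false, true, "arm neon (128bit as in SSE4.2)" };
#endif
}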
.../P1_Sigma_sm_gux_ttxux/CudaRuntime.h | 1 - .../P1_Sigma_sm_gux_ttxux/check_sa.cc | 111 +- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk | 232 +- .../cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc | 16 +- .../gq_ttq.sa/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gq_ttq.sa/SubProcesses/runTest.cc | 12 +- .../gq_ttq.sa/SubProcesses/testmisc.cc | 8 +- .../cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h | 10 +- epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h | 73 +- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h | 28 +- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h | 12 +- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gq_ttq.sa/src/rambo.h | 8 +- .../CODEGEN_cudacpp_heft_gg_h_log.txt | 10 +- epochX/cudacpp/heft_gg_h.sa/COPYRIGHT | 1 + .../heft_gg_h.sa/SubProcesses/Bridge.h | 32 +- .../SubProcesses/BridgeKernels.cc | 9 +- .../heft_gg_h.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../heft_gg_h.sa/SubProcesses/CudaRuntime.h | 85 - .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../SubProcesses/EventStatistics.h | 4 +- .../heft_gg_h.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../heft_gg_h.sa/SubProcesses/MemoryBuffers.h | 64 +- .../P1_Sigma_heft_gg_h/CPPProcess.cc | 62 +- .../P1_Sigma_heft_gg_h/CPPProcess.h | 10 +- .../P1_Sigma_heft_gg_h/CudaRuntime.h | 1 - .../P1_Sigma_heft_gg_h/check_sa.cc | 111 +- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../heft_gg_h.sa/SubProcesses/cudacpp.mk | 232 +- .../heft_gg_h.sa/SubProcesses/fbridge.cc | 16 +- .../heft_gg_h.sa/SubProcesses/fsampler.cc | 8 +- .../heft_gg_h.sa/SubProcesses/runTest.cc | 12 +- .../heft_gg_h.sa/SubProcesses/testmisc.cc | 8 +- .../heft_gg_h.sa/SubProcesses/testxxx.cc | 14 +- .../cudacpp/heft_gg_h.sa/src/HelAmps_heft.h | 4 +- .../heft_gg_h.sa/src/Parameters_heft.cc | 4 +- .../heft_gg_h.sa/src/Parameters_heft.h | 10 +- .../cudacpp/heft_gg_h.sa/src/cudacpp_src.mk | 23 +- .../cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h | 73 +- .../cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h | 28 +- .../cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/heft_gg_h.sa/src/rambo.h | 8 +- .../CODEGEN_mad_pp_tt012j_log.txt | 64 +- epochX/cudacpp/pp_tt012j.mad/COPYRIGHT | 1 + .../pp_tt012j.mad/SubProcesses/Bridge.h | 32 +- 
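The CPPProcess.cc files listed above all receive the reworked helicity filtering of issue #630, shown for ee_mumu.mad near the end of this patch: instead of tracking a running sum across helicities (allMEsLast) and flagging a helicity whenever the sum moves, the sum is reset to zero before each helicity, so every contribution is compared directly against zero. A sketch of the new loop shape for a single event, under the stated assumption about the callback (names are illustrative):

// Sketch of the reworked helicity filter (#630) for one event and ncomb
// helicity combinations. calculateME( ihel, me ) is assumed to ADD the
// |M|^2 contribution of helicity ihel into the running sum 'me', as
// calculate_wavefunctions does in the real code.
void getGoodHel( int ncomb, double& me, bool* isGoodHel, void ( *calculateME )( int, double& ) )
{
  for( int ihel = 0; ihel < ncomb; ihel++ )
  {
    me = 0; // reset the running sum before adding this helicity (#630)
    calculateME( ihel, me );
    if( me != 0 ) isGoodHel[ihel] = true; // any non-zero contribution is good
  }
}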
.../SubProcesses/BridgeKernels.cc | 9 +- .../SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../pp_tt012j.mad/SubProcesses/CudaRuntime.h | 85 - .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../SubProcesses/EventStatistics.h | 4 +- .../SubProcesses/GpuAbstraction.h | 71 + .../SubProcesses/GpuRuntime.h} | 54 +- .../pp_tt012j.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../SubProcesses/MemoryBuffers.h | 64 +- .../SubProcesses/P0_gg_ttx/CPPProcess.cc | 62 +- .../SubProcesses/P0_gg_ttx/CPPProcess.h | 10 +- .../SubProcesses/P0_gg_ttx/CudaRuntime.h | 1 - .../SubProcesses/P0_gg_ttx/GpuAbstraction.h | 1 + .../SubProcesses/P0_gg_ttx/GpuRuntime.h | 1 + .../SubProcesses/P0_gg_ttx/check_sa.cc | 111 +- .../SubProcesses/P0_uux_ttx/CPPProcess.cc | 62 +- .../SubProcesses/P0_uux_ttx/CPPProcess.h | 10 +- .../SubProcesses/P0_uux_ttx/CudaRuntime.h | 1 - .../SubProcesses/P0_uux_ttx/GpuAbstraction.h | 1 + .../SubProcesses/P0_uux_ttx/GpuRuntime.h | 1 + .../SubProcesses/P0_uux_ttx/check_sa.cc | 111 +- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 62 +- .../SubProcesses/P1_gg_ttxg/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttxg/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttxg/GpuAbstraction.h | 1 + .../SubProcesses/P1_gg_ttxg/GpuRuntime.h | 1 + .../SubProcesses/P1_gg_ttxg/check_sa.cc | 111 +- .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 62 +- .../SubProcesses/P1_gu_ttxu/CPPProcess.h | 10 +- .../SubProcesses/P1_gu_ttxu/CudaRuntime.h | 1 - .../SubProcesses/P1_gu_ttxu/GpuAbstraction.h | 1 + .../SubProcesses/P1_gu_ttxu/GpuRuntime.h | 1 + .../SubProcesses/P1_gu_ttxu/check_sa.cc | 111 +- .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 62 +- .../SubProcesses/P1_gux_ttxux/CPPProcess.h | 10 +- .../SubProcesses/P1_gux_ttxux/CudaRuntime.h | 1 - .../P1_gux_ttxux/GpuAbstraction.h | 1 + .../SubProcesses/P1_gux_ttxux/GpuRuntime.h | 1 + .../SubProcesses/P1_gux_ttxux/check_sa.cc | 111 +- .../SubProcesses/P1_uux_ttxg/CPPProcess.cc | 62 +- .../SubProcesses/P1_uux_ttxg/CPPProcess.h | 10 +- .../SubProcesses/P1_uux_ttxg/CudaRuntime.h | 1 - .../SubProcesses/P1_uux_ttxg/GpuAbstraction.h | 1 + .../SubProcesses/P1_uux_ttxg/GpuRuntime.h | 1 + .../SubProcesses/P1_uux_ttxg/check_sa.cc | 111 +- .../SubProcesses/P2_gg_ttxgg/CPPProcess.cc | 62 +- .../SubProcesses/P2_gg_ttxgg/CPPProcess.h | 10 +- .../SubProcesses/P2_gg_ttxgg/CudaRuntime.h | 1 - .../SubProcesses/P2_gg_ttxgg/GpuAbstraction.h | 1 + .../SubProcesses/P2_gg_ttxgg/GpuRuntime.h | 1 + .../SubProcesses/P2_gg_ttxgg/check_sa.cc | 111 +- .../SubProcesses/P2_gg_ttxuux/CPPProcess.cc | 62 +- .../SubProcesses/P2_gg_ttxuux/CPPProcess.h | 10 +- .../SubProcesses/P2_gg_ttxuux/CudaRuntime.h | 1 - .../P2_gg_ttxuux/GpuAbstraction.h | 1 + .../SubProcesses/P2_gg_ttxuux/GpuRuntime.h | 1 + 
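The run of entries above (CudaRuntime.h deleted, GpuAbstraction.h and GpuRuntime.h created, mostly as mode 120000 symlinks in each P* subdirectory) replaces the CUDA-only runtime header with a vendor-neutral gpu* vocabulary. The ee_mumu.mad GpuAbstraction.h hunk below additionally gives the HIP branch an explicit #include "hip/hip_runtime.h" ahead of its macro definitions. A sketch of the mapping idiom; the HIP names appear verbatim in the hunk, while the CUDA branch is an assumed mirror:

// Sketch of the GpuAbstraction.h idiom: one gpu* vocabulary, two backends.
// The HIP names below appear in the hunk; the CUDA branch is an assumed mirror.
#if defined __CUDACC__

#define gpuError_t cudaError_t
#define gpuPeekAtLastError cudaPeekAtLastError
#define gpuGetErrorString cudaGetErrorString

#elif defined __HIPCC__

#include "hip/hip_runtime.h" // HIP needs its runtime header for these types

#define gpuError_t hipError_t
#define gpuPeekAtLastError hipPeekAtLastError
#define gpuGetErrorString hipGetErrorString

#endif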
.../SubProcesses/P2_gg_ttxuux/check_sa.cc | 111 +- .../SubProcesses/P2_gu_ttxgu/CPPProcess.cc | 62 +- .../SubProcesses/P2_gu_ttxgu/CPPProcess.h | 10 +- .../SubProcesses/P2_gu_ttxgu/CudaRuntime.h | 1 - .../SubProcesses/P2_gu_ttxgu/GpuAbstraction.h | 1 + .../SubProcesses/P2_gu_ttxgu/GpuRuntime.h | 1 + .../SubProcesses/P2_gu_ttxgu/check_sa.cc | 111 +- .../SubProcesses/P2_gux_ttxgux/CPPProcess.cc | 62 +- .../SubProcesses/P2_gux_ttxgux/CPPProcess.h | 10 +- .../SubProcesses/P2_gux_ttxgux/CudaRuntime.h | 1 - .../P2_gux_ttxgux/GpuAbstraction.h | 1 + .../SubProcesses/P2_gux_ttxgux/GpuRuntime.h | 1 + .../SubProcesses/P2_gux_ttxgux/check_sa.cc | 111 +- .../SubProcesses/P2_uc_ttxuc/CPPProcess.cc | 62 +- .../SubProcesses/P2_uc_ttxuc/CPPProcess.h | 10 +- .../SubProcesses/P2_uc_ttxuc/CudaRuntime.h | 1 - .../SubProcesses/P2_uc_ttxuc/GpuAbstraction.h | 1 + .../SubProcesses/P2_uc_ttxuc/GpuRuntime.h | 1 + .../SubProcesses/P2_uc_ttxuc/check_sa.cc | 111 +- .../SubProcesses/P2_ucx_ttxucx/CPPProcess.cc | 62 +- .../SubProcesses/P2_ucx_ttxucx/CPPProcess.h | 10 +- .../SubProcesses/P2_ucx_ttxucx/CudaRuntime.h | 1 - .../P2_ucx_ttxucx/GpuAbstraction.h | 1 + .../SubProcesses/P2_ucx_ttxucx/GpuRuntime.h | 1 + .../SubProcesses/P2_ucx_ttxucx/check_sa.cc | 111 +- .../SubProcesses/P2_uu_ttxuu/CPPProcess.cc | 62 +- .../SubProcesses/P2_uu_ttxuu/CPPProcess.h | 10 +- .../SubProcesses/P2_uu_ttxuu/CudaRuntime.h | 1 - .../SubProcesses/P2_uu_ttxuu/GpuAbstraction.h | 1 + .../SubProcesses/P2_uu_ttxuu/GpuRuntime.h | 1 + .../SubProcesses/P2_uu_ttxuu/check_sa.cc | 111 +- .../SubProcesses/P2_uux_ttxccx/CPPProcess.cc | 62 +- .../SubProcesses/P2_uux_ttxccx/CPPProcess.h | 10 +- .../SubProcesses/P2_uux_ttxccx/CudaRuntime.h | 1 - .../P2_uux_ttxccx/GpuAbstraction.h | 1 + .../SubProcesses/P2_uux_ttxccx/GpuRuntime.h | 1 + .../SubProcesses/P2_uux_ttxccx/check_sa.cc | 111 +- .../SubProcesses/P2_uux_ttxgg/CPPProcess.cc | 62 +- .../SubProcesses/P2_uux_ttxgg/CPPProcess.h | 10 +- .../SubProcesses/P2_uux_ttxgg/CudaRuntime.h | 1 - .../P2_uux_ttxgg/GpuAbstraction.h | 1 + .../SubProcesses/P2_uux_ttxgg/GpuRuntime.h | 1 + .../SubProcesses/P2_uux_ttxgg/check_sa.cc | 111 +- .../SubProcesses/P2_uux_ttxuux/CPPProcess.cc | 62 +- .../SubProcesses/P2_uux_ttxuux/CPPProcess.h | 10 +- .../SubProcesses/P2_uux_ttxuux/CudaRuntime.h | 1 - .../P2_uux_ttxuux/GpuAbstraction.h | 1 + .../SubProcesses/P2_uux_ttxuux/GpuRuntime.h | 1 + .../SubProcesses/P2_uux_ttxuux/check_sa.cc | 111 +- .../P2_uxcx_ttxuxcx/CPPProcess.cc | 62 +- .../SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h | 10 +- .../P2_uxcx_ttxuxcx/CudaRuntime.h | 1 - .../P2_uxcx_ttxuxcx/GpuAbstraction.h | 1 + .../SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h | 1 + .../SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc | 111 +- .../P2_uxux_ttxuxux/CPPProcess.cc | 62 +- .../SubProcesses/P2_uxux_ttxuxux/CPPProcess.h | 10 +- .../P2_uxux_ttxuxux/CudaRuntime.h | 1 - .../P2_uxux_ttxuxux/GpuAbstraction.h | 1 + .../SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h | 1 + .../SubProcesses/P2_uxux_ttxuxux/check_sa.cc | 111 +- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../pp_tt012j.mad/SubProcesses/cudacpp.mk | 232 +- .../pp_tt012j.mad/SubProcesses/fbridge.cc | 16 +- .../pp_tt012j.mad/SubProcesses/fsampler.cc | 8 +- .../pp_tt012j.mad/SubProcesses/runTest.cc | 12 +- .../pp_tt012j.mad/SubProcesses/testmisc.cc | 8 +- .../pp_tt012j.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h | 4 +- .../pp_tt012j.mad/src/Parameters_sm.cc | 4 +- 
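The CPPProcess.cc hunks at the end of this patch also rework the event-by-event random color choice of issue #402: a cumulative sum of the allowed per-color jamp2 contributions is built for the active channel, a color is then drawn by inverse-transform sampling with the event's random number, and, as the fix for FPE #783, the whole block is skipped when channelId == 0, since no channel means no color choice and no division by a possibly zero total. A sketch under those assumptions (names are illustrative; icolamp is flattened here, whereas the real code uses the 2D mgOnGpu::icolamp):

// Sketch of the event-by-event color choice (#402) with the FPE fix (#783).
// jamp2[ncolor] holds the per-color |amplitude|^2 sums, rndcol is a random
// number in [0,1); returns a Fortran-style color in [1,ncolor], or 0 when
// channelId == 0 and no choice is made.
int selectColor( unsigned int channelId, int ncolor, const double* jamp2, const bool* icolamp, double rndcol )
{
  if( channelId == 0 ) return 0; // no event-by-event color choice (fix FPE #783)
  const unsigned int channelIdC = channelId - 1; // C array indexing starts at 0
  double targetamp[64] = { 0 }; // assume ncolor <= 64 in this sketch
  for( int icolC = 0; icolC < ncolor; icolC++ )
  {
    targetamp[icolC] = ( icolC == 0 ? 0. : targetamp[icolC - 1] ); // cumulative sum
    if( icolamp[channelIdC * ncolor + icolC] ) targetamp[icolC] += jamp2[icolC];
  }
  for( int icolC = 0; icolC < ncolor; icolC++ ) // inverse-transform sampling
    if( rndcol < targetamp[icolC] / targetamp[ncolor - 1] ) return icolC + 1;
  return ncolor; // numerical guard if rndcol is extremely close to 1
}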
.../cudacpp/pp_tt012j.mad/src/Parameters_sm.h | 10 +- .../cudacpp/pp_tt012j.mad/src/cudacpp_src.mk | 23 +- .../cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h | 73 +- .../pp_tt012j.mad/src/mgOnGpuCxtypes.h | 28 +- .../pp_tt012j.mad/src/mgOnGpuFptypes.h | 12 +- .../pp_tt012j.mad/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/pp_tt012j.mad/src/rambo.h | 8 +- 655 files changed, 21432 insertions(+), 18299 deletions(-) delete mode 100644 epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model.pkl delete mode 100644 epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h delete mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h delete mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h delete mode 100644 epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model.pkl delete mode 100644 epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model.pkl delete mode 100644 epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model.pkl delete mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h delete mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h delete mode 100644 epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h delete mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h delete mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h delete mode 100644 epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h delete mode 120000 epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h delete mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h rename epochX/cudacpp/{gq_ttq.mad/SubProcesses/CudaRuntime.h => pp_tt012j.mad/SubProcesses/GpuRuntime.h} (62%) delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CudaRuntime.h create mode 120000 
epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index 92778f7ec9..3be3e9348e 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN 
DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu.mg +import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005407810211181641  +DEBUG: model prefixing takes 0.005505561828613281  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -156,73 +156,54 @@ INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams 1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams -output madevent CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp -Load PLUGIN.CUDACPP_SA_OUTPUT -Addition matrix-element will be done with PLUGIN: CUDACPP_SA_OUTPUT -Output will be done with PLUGIN: CUDACPP_SA_OUTPUT +output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp +Load PLUGIN.CUDACPP_OUTPUT +Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT +Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/Cards  -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu +WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  +WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum -DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  -DEBUG: kwargs[prefix] = 0 
[model_handling.py at line 1040]  -DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6179]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . -DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  -DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  -FileWriter for ././CPPProcess.h -DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]  -FileWriter for ././CPPProcess.cc -DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]  -DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]  -DEBUG: self.include_multi_channel =  [1, 2] [model_handling.py at line 1144]  -DEBUG: self.support_multichannel =  True [model_handling.py at line 1145]  -DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1162]  -DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2] [model_handling.py at line 1163]  -DEBUG: multi_channel =  {1: [0], 2: [1]} [model_handling.py at line 1169]  -DEBUG: multi_channel_map =  {1: [0], 2: [1]} [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {1: 1, 2: 2} [model_handling.py at line 1711]  +FileWriter for ././CPPProcess.h +FileWriter for ././CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. -DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_processidfile [model_handling.py at line 1389]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_coloramps [model_handling.py at line 1401]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_epem_mupmum.txt [model_handling.py at line 1335]  DEBUG: proc_id =  1 [export_cpp.py at line 710]  DEBUG: config_map =  [1, 2] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  WARNING: vector code for lepton pdf not implemented. 
We removed the option to run dressed lepton  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.098 s +Wrote files for 8 helas calls in 0.097 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.202 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  +ALOHA: aloha creates 3 routines in 0.199 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.256 s +ALOHA: aloha creates 7 routines in 0.253 s FFV1 FFV1 FFV2 @@ -231,29 +212,103 @@ ALOHA: aloha creates 7 routines in 0.256 s FFV4 FFV2_4 FFV2_4 -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/src/. +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/src/. +INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -Output to directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu done. +DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +patching file Source/genps.inc +patching file Source/makefile +patching file SubProcesses/makefile +patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). +patching file bin/internal/madevent_interface.py +DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses/P1_epem_mupmum; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +patching file auto_dsig1.f +Hunk #1 succeeded at 496 (offset 12 lines). +patching file driver.f +patching file matrix1.f +Hunk #3 succeeded at 230 (offset 9 lines). +Hunk #4 succeeded at 267 (offset 18 lines). +Hunk #5 succeeded at 312 (offset 18 lines). +DEBUG: p.returncode =  0 [output.py at line 237]  +Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu/README +/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. quit -real 0m1.873s -user 0m1.636s -sys 0m0.197s +real 0m1.874s +user 0m1.628s +sys 0m0.225s +Code generation completed in 2 seconds +************************************************************ +* * +* W E L C O M E to * +* M A D G R A P H 5 _ a M C @ N L O * +* M A D E V E N T * +* * +* * * * +* * * * * * +* * * * * 5 * * * * * +* * * * * * +* * * * +* * +* VERSION 3.5.2_lo_vect * +* * +* The MadGraph5_aMC@NLO Development Team - Find us at * +* https://server06.fynu.ucl.ac.be/projects/madgraph * +* * +* Type 'help' for in-line help. * +* * +************************************************************ +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt +treatcards run +quit +INFO: +launch in debug mode +************************************************************ +* * +* W E L C O M E to * +* M A D G R A P H 5 _ a M C @ N L O * +* M A D E V E N T * +* * +* * * * +* * * * * * +* * * * * 5 * * * * * +* * * * * * +* * * * +* * +* VERSION 3.5.2_lo_vect * +* * +* The MadGraph5_aMC@NLO Development Team - Find us at * +* https://server06.fynu.ucl.ac.be/projects/madgraph * +* * +* Type 'help' for in-line help. * +* * +************************************************************ +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +treatcards param +quit +INFO: +launch in debug mode diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt index 00d7c6f8d6..cdeedc7863 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt @@ -234,7 +234,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo +#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo +#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat index f6f7f2e4d1..22e76563ab 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * @@ -45,5 +45,5 @@ define l+ = e+ mu+ define l- = e- mu- define vl = ve vm vt define vl~ = ve~ vm~ vt~ -output madevent CODEGEN_mad_ee_mumu --hel_recycling=False --vector_siz\ -e=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --\ +vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat b/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat index 86f6a33258..1084532333 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat @@ -85,7 +85,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! 
limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -182,12 +198,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat b/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat index 67c1a4de28..0fa0402261 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat @@ -85,7 +85,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! 
limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -181,3 +197,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt b/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt +++ b/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc index 738db319fd..13aaa31c6d 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV1_0.o FFV4_3.o FFV1P0_3.o FFV2_0.o FFV4_0.o FFV2_3.o +ALOHARoutine = FFV1P0_3.o FFV1_0.o FFV2_0.o FFV2_3.o FFV4_0.o FFV4_3.o diff --git a/epochX/cudacpp/ee_mumu.mad/Source/PDF/pdfwrap_lhapdf.f b/epochX/cudacpp/ee_mumu.mad/Source/PDF/pdfwrap_lhapdf.f index 0be926e6cd..3f36905346 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/PDF/pdfwrap_lhapdf.f +++ b/epochX/cudacpp/ee_mumu.mad/Source/PDF/pdfwrap_lhapdf.f @@ -5,6 +5,7 @@ SUBROUTINE PDFWRAP C INCLUDE 'pdf.inc' INCLUDE '../alfas.inc' + INCLUDE '../vector.inc' INCLUDE '../coupl.inc' REAL*8 ZMASS DATA ZMASS/91.188D0/ diff --git a/epochX/cudacpp/ee_mumu.mad/Source/make_opts b/epochX/cudacpp/ee_mumu.mad/Source/make_opts index bd3c24228d..e4b87ee6ad 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/make_opts +++ b/epochX/cudacpp/ee_mumu.mad/Source/make_opts @@ -1,17 +1,12 @@ -pdlabel1= -pdlabel2= -lhapdf= -PYTHIA8_PATH=NotInstalled -MG5AMC_VERSION=3.5.0_lo_vect -GLOBAL_FLAG=-O3 -ffast-math -fbounds-check -ALOHA_FLAG= -MATRIX_FLAG= DEFAULT_CPP_COMPILER=g++ +DEFAULT_F2PY_COMPILER=f2py3 +DEFAULT_F_COMPILER=gfortran +GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= -STDLIB=-lstdc++ +MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime +PYTHIA8_PATH=NotInstalled STDLIB_FLAG= -DEFAULT_F_COMPILER=gfortran -DEFAULT_F2PY_COMPILER=f2py3 +STDLIB=-lstdc++ #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/ee_mumu.mad/Source/makefile b/epochX/cudacpp/ee_mumu.mad/Source/makefile index dbe08b846e..00c73099a0 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/makefile +++ b/epochX/cudacpp/ee_mumu.mad/Source/makefile @@ -136,5 +136,7 @@ cleanSource: clean: cleanSource for i in `ls -d ../SubProcesses/P*`; do cd $$i; make clean; cd -; done; -cleanall: cleanSource +cleanavx: + for i in `ls -d ../SubProcesses/P*`; do cd $$i; make cleanavxs; cd -; done; +cleanall: cleanSource # THIS IS THE ONE for i in `ls -d ../SubProcesses/P*`; do cd $$i; make cleanavxs; cd -; done; diff --git a/epochX/cudacpp/ee_mumu.mad/Source/param_card.inc b/epochX/cudacpp/ee_mumu.mad/Source/param_card.inc index 1fcfce55bb..081365c16b 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/param_card.inc +++ b/epochX/cudacpp/ee_mumu.mad/Source/param_card.inc @@ -1,15 +1,15 @@ - MDL_WZ = 2.441404D+00 - MDL_WW = 2.047600D+00 - MDL_WH = 6.382339D-03 - MDL_WT = 1.491500D+00 + MDL_MB = 4.700000D+00 + MDL_MT = 1.730000D+02 MDL_MTA = 1.777000D+00 MDL_MZ = 9.118800D+01 MDL_MH = 1.250000D+02 - MDL_MB = 4.700000D+00 - MDL_MT = 1.730000D+02 AEWM1 = 1.325070D+02 MDL_GF = 1.166390D-05 AS = 1.180000D-01 - MDL_YMTAU = 1.777000D+00 MDL_YMB = 4.700000D+00 MDL_YMT = 1.730000D+02 + MDL_YMTAU = 1.777000D+00 + MDL_WT = 1.491500D+00 + MDL_WZ = 2.441404D+00 + MDL_WW = 2.047600D+00 + MDL_WH = 6.382339D-03 diff --git a/epochX/cudacpp/ee_mumu.mad/Source/vector.inc b/epochX/cudacpp/ee_mumu.mad/Source/vector.inc index 92254c0f2a..863eebbc70 
100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/vector.inc +++ b/epochX/cudacpp/ee_mumu.mad/Source/vector.inc @@ -28,5 +28,4 @@ C BECAUSE IT DOES NOT GO THROUGH THE CPP PREPROCESSOR C (see https://github.com/madgraph5/madgraph4gpu/issues/458). C INTEGER VECSIZE_MEMMAX - PARAMETER (VECSIZE_MEMMAX=16384) ! NB: 16k events per GPU grid is the minimum required to fill a V100 GPU -c PARAMETER (VECSIZE_MEMMAX=32) ! NB: workaround for out-of-memory on Juwels: 32 is enough for no-CUDA builds (issue #498) + PARAMETER (VECSIZE_MEMMAX=16384) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index f37c972b24..89437b4c42 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -244,14 +245,21 @@ namespace mg5amcCpu } std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL - process.initProc( "../../Cards/param_card.dat" ); + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
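// The replacement below (the '+' lines that follow) moves CPPProcess
// construction out of the #ifdef branches and makes the param_card path
// robust to running one directory deeper than expected: if
// ../../Cards/param_card.dat is not found, ../ is prepended once. The
// include added at the top of this Bridge.h hunk is evidently <filesystem>,
// given the std::filesystem::exists call used here. A self-contained sketch
// of the fallback, where findParamCard is an illustrative name only:
//
//   #include <filesystem>
//   #include <string>
//
//   std::string findParamCard()
//   {
//     std::string paramCard = "../../Cards/param_card.dat";
//     if( !std::filesystem::exists( paramCard ) )
//       paramCard = "../" + paramCard; // retry one level up
//     return paramCard;
//   }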
+ CPPProcess process( /*verbose=*/false ); + std::string paramCard = "../../Cards/param_card.dat"; + if( !std::filesystem::exists( paramCard ) ) + { + paramCard = "../" + paramCard; + } + process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 9c467b1e04..6a7d9c05c0 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -39,6 +39,8 @@ #elif defined __HIPCC__ +#include "hip/hip_runtime.h" + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h index 176338151a..a64c05c26a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -215,19 +216,16 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) #endif constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) // Dump events to a new reference file? - constexpr bool dumpEvents = false; - std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; - while( dumpFileName.find( '/' ) != std::string::npos ) - { - dumpFileName.replace( dumpFileName.find( '/' ), 1, "_" ); - } + const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); + const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); + const std::string refFileName = testDriver->getRefFileName(); + const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); std::ofstream dumpFile; if( dumpEvents ) { dumpFile.open( dumpFileName, std::ios::trunc ); } // Read reference data - const std::string refFileName = testDriver->getRefFileName(); std::map referenceData; if( !dumpEvents ) { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..81699dfea9 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -112,10 +112,17 @@ namespace mg5amcCpu // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu bool ok = true; // this is just an assumption! const std::string tag = "arm neon (128bit as in SSE4.2)"; -#else +#elif defined( __x86_64__ ) || defined( __i386__ ) bool known = true; bool ok = __builtin_cpu_supports( "sse4.2" ); const std::string tag = "nehalem (SSE4.2)"; +#else // AV FIXME! 
Added by OM for Mac, should identify the correct __xxx__ flag that should be targeted + bool known = false; // __builtin_cpu_supports is not supported + // See https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions-Available-on-all-Configurations.html + // See https://stackoverflow.com/q/62783908 + // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu + bool ok = true; // this is just an assumption! + const std::string tag = "arm neon (128bit as in SSE4.2)"; #endif #else bool known = true; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 05d3ef0cfb..83e5b15013 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -239,25 +239,18 @@ namespace mg5amcCpu // *** DIAGRAM 1 OF 2 *** // Wavefunction(s) for diagram number 1 -#if not( defined MGONGPUCPP_GPUIMPL and defined MGONGPU_TEST_DIVERGENCE ) - opzxxx( momenta, cHel[ihel][0], -1, w_fp[0], 0 ); // NB: opzxxx only uses pz -#else - if( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 == 0 ) - opzxxx( momenta, cHel[ihel][0], -1, w_fp[0], 0 ); // NB: opzxxx only uses pz - else - oxxxxx( momenta, 0, cHel[ihel][0], -1, w_fp[0], 0 ); -#endif + oxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - imzxxx( momenta, cHel[ihel][1], +1, w_fp[1], 1 ); // NB: imzxxx only uses pz + ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); - ixzxxx( momenta, cHel[ihel][2], -1, w_fp[2], 2 ); + ixxxxx( momenta, 0., cHel[ihel][2], -1, w_fp[2], 2 ); - oxzxxx( momenta, cHel[ihel][3], +1, w_fp[3], 3 ); + oxxxxx( momenta, 0., cHel[ihel][3], +1, w_fp[3], 3 ); - FFV1P0_3( w_fp[1], w_fp[0], COUPs[0], 0., 0., w_fp[4] ); + FFV1P0_3( w_fp[1], w_fp[0], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[4] ); // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[2], w_fp[3], w_fp[4], COUPs[0], &_fp[0] ); + FFV1_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -267,10 +260,10 @@ namespace mg5amcCpu // *** DIAGRAM 2 OF 2 *** // Wavefunction(s) for diagram number 2 - FFV2_4_3( w_fp[1], w_fp[0], COUPs[1], COUPs[2], cIPD[0], cIPD[1], w_fp[4] ); + FFV2_4_3( w_fp[1], w_fp[0], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); // Amplitude(s) for diagram number 2 - FFV2_4_0( w_fp[2], w_fp[3], w_fp[4], COUPs[1], COUPs[2], &_fp[0] ); + FFV2_4_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -638,12 +631,12 @@ namespace 
mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - fptype allMEsLast = 0; + { /* clang-format on */ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid - allMEs[ievt] = 0; for( int ihel = 0; ihel < ncomb; ihel++ ) { + // NEW IMPLEMENTATION OF GETGOODHEL (#630): RESET THE RUNNING SUM OVER HELICITIES TO 0 BEFORE ADDING A NEW HELICITY + allMEs[ievt] = 0; // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -652,12 +645,11 @@ namespace mg5amcCpu #else calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); #endif - if( allMEs[ievt] != allMEsLast ) + if( allMEs[ievt] != 0 ) // NEW IMPLEMENTATION OF GETGOODHEL (#630): COMPARE EACH HELICITY CONTRIBUTION TO 0 { //if ( !isGoodHel[ihel] ) std::cout << "sigmaKin_getGoodHel ihel=" << ihel << " TRUE" << std::endl; isGoodHel[ihel] = true; } - allMEsLast = allMEs[ievt]; // running sum up to helicity ihel for event ievt } } #else @@ -676,19 +668,11 @@ namespace mg5amcCpu //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] // Allocate arrays at build time to contain at least 16 events (or at least neppV events if neppV>16, e.g. in future VPUs) constexpr int maxtry0 = std::max( 16, neppV ); // 16, but at least neppV (otherwise the npagV loop does not even start) - fptype allMEsLast[maxtry0] = { 0 }; // allocated at build time: maxtry0 must be a constexpr // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt1 was last observed for "mirror processes" in uux_ttx in the 270 branch (see issue #343 and PRs #360 and #396) + // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; - static_assert( nprocesses == 1, "Assume nprocesses == 1" ); - // process_id corresponds to the index of DSIG1 Fortran functions (must be 1 because cudacpp is unable to handle DSIG2) + static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter static_assert( process_id == 1, "Assume process_id == 1" ); } @@ -882,23 +871,26 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 - fptype targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + const unsigned int channelIdC = channelId - 1; // 
coloramps.h uses the C array indexing starting at 0 + fptype targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } } #endif @@ -993,57 +985,60 @@ namespace mg5amcCpu #endif } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 // Event-by-event random choice of color #402 - fptype_sv targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } + const unsigned int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + fptype_sv targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = fptype_sv{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + fptype_sv targetamp2[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp2[icolC] = fptype_sv{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + } +#endif + for( int ieppV = 0; ieppV < neppV; ++ieppV ) + { + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { #if defined MGONGPU_CPPSIMD - const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); #endif - if( okcol ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( okcol ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 
+ ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + const int ievt2 = ievt00 + ieppV + neppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + { + allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #endif + } } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index 5b8fdd4347..0b29ffb3ff 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f index f78f7c102e..02520466e6 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f index b836e34865..4188745070 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08
C By the MadGraph5_aMC@NLO Development Team
C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
C
@@ -39,6 +39,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE)
C LOCAL VARIABLES
C
      INTEGER I,ITYPE,LP,IPROC
+     DOUBLE PRECISION QSCALE
      DOUBLE PRECISION EP1
      DOUBLE PRECISION EM2
      DOUBLE PRECISION XPQ(-7:7),PD(0:MAXPROC)
@@ -129,15 +130,26 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE)
        IF (ABS(LPP(IB(1))).GE.1) THEN
          !LP=SIGN(1,LPP(IB(1)))
-         EP1=PDG2PDF(LPP(IB(1)),-11, IB(1),XBK(IB(1)),DSQRT(Q2FACT(IB(1)
-    $     )))
+         IF (DSQRT(Q2FACT(IB(1))).EQ.0D0) THEN
+           QSCALE=0D0
+           DO I=3,NEXTERNAL
+             QSCALE=QSCALE+DSQRT(MAX(0D0,(PP(0,I)+PP(3,I))*(PP(0,I)
+    $         -PP(3,I))))
+           ENDDO
+           QSCALE=QSCALE/2D0
+         ELSE
+           QSCALE=DSQRT(Q2FACT(IB(1)))
+         ENDIF
+         EP1=PDG2PDF(LPP(IB(1)),-11, IB(1),XBK(IB(1)), QSCALE)
          IF (PDLABEL.EQ.'dressed') EP1_COMPONENTS(1:4) =
    $      EE_COMPONENTS(1:4)
        ENDIF
        IF (ABS(LPP(IB(2))).GE.1) THEN
          !LP=SIGN(1,LPP(IB(2)))
-         EM2=PDG2PDF(LPP(IB(2)),11, IB(2),XBK(IB(2)),DSQRT(Q2FACT(IB(2))
-    $     ))
+         IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN
+           QSCALE=DSQRT(Q2FACT(IB(2)))
+         ENDIF
+         EM2=PDG2PDF(LPP(IB(2)),11, IB(2),XBK(IB(2)), QSCALE)
          IF (PDLABEL.EQ.'dressed') EM2_COMPONENTS(1:4) =
    $      EE_COMPONENTS(1:4)
        ENDIF
@@ -213,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT,
     $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED)
C ****************************************************
C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
C By the MadGraph5_aMC@NLO Development Team
C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
C
@@ -260,6 +272,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT,
C
C LOCAL VARIABLES
C
+     DOUBLE PRECISION QSCALE
      INTEGER I,ITYPE,LP,IPROC
      DOUBLE PRECISION EP1(VECSIZE_MEMMAX)
      DOUBLE PRECISION EM2(VECSIZE_MEMMAX)
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc
index 1bad694d1c..7cac5ab47b 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc
@@ -29,7 +29,9 @@
 #include
 #include
+#include <cfenv> // for feenableexcept
+#include <csignal> // for signal and SIGFPE
 #include
 #include
 #include
 #include
@@ -74,6 +76,23 @@ usage( char* argv0, int ret = 1 )
   return ret;
 }
+#ifdef __CUDACC__
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  inline void FPEhandler( int sig )
+  {
+#ifdef __CUDACC__
+    std::cerr << "Floating Point Exception (GPU)" << std::endl;
+#else
+    std::cerr << "Floating Point Exception (CPU)" << std::endl;
+#endif
+    exit( 0 );
+  }
+}
+
 int
 main( int argc, char** argv )
 {
@@ -84,6 +103,18 @@ main( int argc, char** argv )
   using namespace mg5amcCpu;
#endif
+  // Enable FPEs (test #701 and #733 - except on MacOS where feenableexcept is not defined #730)
+#ifndef __APPLE__
+  const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" );
+  const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" );
+  if( enableFPE )
+  {
+    std::cout << "WARNING!
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions" << std::endl; + feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 + signal( SIGFPE, FPEhandler ); + } +#endif + // DEFAULTS FOR COMMAND LINE ARGUMENTS bool verbose = false; bool debug = false; @@ -103,12 +134,14 @@ main( int argc, char** argv ) CurandHost = 1, CurandDevice = 2 }; -#ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU -#elif not defined MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#ifdef MGONGPU_HAS_NO_CURAND + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random +#elif defined __CUDACC__ + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -146,18 +179,20 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef __CUDACC__ - rndgen = RandomNumberMode::CurandDevice; +#ifndef __CUDACC__ + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); +#elif defined MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + rndgen = RandomNumberMode::CurandDevice; #endif } else if( arg == "--curhst" ) { -#ifndef MGONGPU_HAS_NO_CURAND - rndgen = RandomNumberMode::CurandHost; -#else +#ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); +#else + rndgen = RandomNumberMode::CurandHost; #endif } else if( arg == "--common" ) @@ -278,10 +313,10 @@ main( int argc, char** argv ) const std::string procKey = "0a ProcInit"; timermap.start( procKey ); - // Create a process object + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? (for instance, in bridge mode this will be called twice here?) CPPProcess process( verbose ); - - // Read param_card and set parameters process.initProc( "../../Cards/param_card.dat" ); const fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) //const fptype energy = 91.2; // Ecms = 91.2 GeV (Z peak) @@ -389,30 +424,26 @@ main( int argc, char** argv ) { prnk.reset( new CommonRandomNumberKernel( hstRndmom ) ); } -#ifndef MGONGPU_HAS_NO_CURAND else if( rndgen == RandomNumberMode::CurandHost ) { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! 
CurandHost is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement)
+#else
    const bool onDevice = false;
    prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) );
+#endif
  }
-#ifdef __CUDACC__
  else
  {
+#ifdef MGONGPU_HAS_NO_CURAND
+    throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement)
+#elif defined __CUDACC__
    const bool onDevice = true;
    prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) );
-  }
#else
-  else
-  {
-    throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement)
-  }
+    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
#endif
-#else
-  else
-  {
-    throw std::logic_error( "This application was built without Curand support" ); // INTERNAL ERROR (no path to this statement)
-  }
+  }
-#endif
  // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment]
  std::unique_ptr<SamplingKernelBase> prsk;
@@ -747,7 +778,7 @@ main( int argc, char** argv )
    wrkflwtxt += "HIP:";
#else
    wrkflwtxt += "CPP:";
-#endif
+#endif /* clang-format off */
    // -- DOUBLE or FLOAT?
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
    wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537)
@@ -757,7 +788,7 @@
    wrkflwtxt += "FLT+";
#else
    wrkflwtxt += "???+"; // no path to this statement
-#endif
+#endif /* clang-format on */
    // -- CUCOMPLEX or THRUST or STD complex numbers?
#ifdef __CUDACC__
#if defined MGONGPU_CUCXTYPE_CUCOMPLEX
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/counters.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/counters.cc
index 71fa817036..3bbdec9387 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/counters.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/counters.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#include "timer.h" #define TIMERTYPE std::chrono::high_resolution_clock @@ -36,13 +36,10 @@ extern "C" static mgOnGpu::Timer program_timer; static float program_totaltime = 0; - static mgOnGpu::Timer matrix1_timer; - static float matrix1_totaltime = 0; static mgOnGpu::Timer smatrix1_timer; static float smatrix1_totaltime = 0; static mgOnGpu::Timer smatrix1multi_timer[nimplC]; static float smatrix1multi_totaltime[nimplC] = { 0 }; - static int matrix1_counter = 0; static int smatrix1_counter = 0; static int smatrix1multi_counter[nimplC] = { 0 }; @@ -52,19 +49,6 @@ extern "C" return; } - void counters_matrix1_start_() - { - matrix1_counter++; - matrix1_timer.Start(); - return; - } - - void counters_matrix1_stop_() - { - matrix1_totaltime += matrix1_timer.GetDuration(); - return; - } - void counters_smatrix1_start_() { smatrix1_counter++; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f index e00f0e1b64..1991a72bb9 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -319,7 +319,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -410,7 +410,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C ---------- C BEGIN CODE C ---------- - call counters_matrix1_start() IF (FIRST) THEN FIRST=.FALSE. IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO @@ -478,7 +477,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO - call counters_matrix1_stop() END SUBROUTINE PRINT_ZERO_AMP_1() diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 59a2c906eb..f2cfa349da 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -4,10 +4,13 @@ # Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) -#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories +#=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts +#=== NB: use 'override' to ensure that the value can not be modified from the outside +override CUDACPP_MAKEFILE := $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) +###$(info CUDACPP_MAKEFILE='$(CUDACPP_MAKEFILE)') -CUDACPP_MAKEFILE = $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) -CUDACPP_SRC_MAKEFILE = cudacpp_src.mk +#=== NB: different names (e.g. 
cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories
+override CUDACPP_SRC_MAKEFILE = cudacpp_src.mk

#-------------------------------------------------------------------------------

@@ -29,7 +32,17 @@ UNAME_P := $(shell uname -p)

#-------------------------------------------------------------------------------

-#=== Configure common compiler flags for C++ and CUDA
+#=== Include the common MG5aMC Makefile options
+
+# OM: this is crucial for MG5aMC flag consistency/documentation
+# AV: temporarily comment this out because it breaks cudacpp builds
+ifneq ($(wildcard ../../Source/make_opts),)
+include ../../Source/make_opts
+endif
+
+#-------------------------------------------------------------------------------
+
+#=== Configure common compiler flags for C++ and CUDA/HIP

INCFLAGS = -I.
OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

@@ -101,68 +114,85 @@ endif
# Note: AR, CXX and FC are implicitly defined if not set externally
# See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html

-#-------------------------------------------------------------------------------
-
-CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler")
-HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler")
-
-ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc)
-  #=== Configure the CUDA compiler
-
-  # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505)
-  # This is because it is impossible to pass this to "GPUFLAGS += -ccbin <compiler>" below
-  ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <compiler>" from outside
-    $(warning CUDA builds are not supported for multi-word CXX "$(CXX)")
-    override CUDA_HOME=disabled
-  endif
+# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked"
+ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),)
+CXXFLAGS += -mmacosx-version-min=11.3
+endif

-  # If CUDA_HOME is not set, try to set it from the location of nvcc
-  ifndef CUDA_HOME
-    CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
-    $(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
-  endif
+#-------------------------------------------------------------------------------

-  # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists
-  ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
-    GPUCC = $(CUDA_HOME)/bin/nvcc
-    USE_NVTX ?=-DUSE_NVTX
-    # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
-    # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
-    # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster).
-    # Embed device code for 70, and PTX for 70+.
-    # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533).
-    # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity).
- MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - CUDATESTFLAGS = -lcuda - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
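As a minimal sketch of the three-way selection that this FIXME describes (this sketch is not part of the patch: only the names CUDA_HOME, HIP_HOME and GPUCC are taken from the real cudacpp.mk, while the file name and the echo target are illustrative assumptions):

# sketch.mk - assumed simplification of the cudacpp.mk toolchain choice:
# prefer CUDA if nvcc is found, else HIP if hipcc is found, else CPU-only.
# Exporting CUDA_HOME (or HIP_HOME) as an invalid path disables that branch,
# exactly as the FIXME above explains.
CUDA_HOME ?= $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
HIP_HOME ?= $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null))
ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
  GPUCC = $(CUDA_HOME)/bin/nvcc # Option 1: CUDA found -> build with nvcc
else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
  GPUCC = $(HIP_HOME)/bin/hipcc # Option 2: no CUDA, HIP found -> build with hipcc
else
  GPUCC = # Option 3: neither found -> CPU-only build
endif
.PHONY: which-gpucc
which-gpucc:
	@echo "GPUCC='$(GPUCC)'"

Running 'make -f sketch.mk which-gpucc' would print the nvcc path on a CUDA node, the hipcc path on an AMD node, and an empty string on a plain CPU host - the same three outcomes the real makefile encodes below.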
+
+# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505)
+# This is because it is impossible to pass this to "GPUFLAGS += -ccbin <compiler>" below
+ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <compiler>" from outside
+  $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)")
+  override CUDA_HOME=disabled
+  override HIP_HOME=disabled
+endif
+
+# If CUDA_HOME is not set, try to set it from the path to nvcc
+ifndef CUDA_HOME
+  CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
+  $(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
+endif
+
+# If HIP_HOME is not set, try to set it from the path to hipcc
+ifndef HIP_HOME
+  HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH))
+  $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+endif
+
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP),
+# builds are performed for HIP only if CUDA is not found in the path.
+# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME.
+# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+#--- Option 1: CUDA exists -> use CUDA
+
+# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists
+ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
+
+  GPUCC = $(CUDA_HOME)/bin/nvcc
+  USE_NVTX ?=-DUSE_NVTX
+  # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
+  # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+  # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster).
+  # Embed device code for 70, and PTX for 70+.
+  # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533).
+  # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity).
+  MADGRAPH_CUDA_ARCHITECTURE ?= 70
+  ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533
+  ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533
+  comma:=,
+  CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch))
+  CUINC = -I$(CUDA_HOME)/include/
+  ifeq ($(RNDGEN),hasNoCurand)
+    CURANDLIBFLAGS=
  else
-  # No cuda. Switch cuda compilation off and go to common random numbers in C++
-  $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
-  override GPUCC=
-  override USE_NVTX=
-  override CUINC=
-  override CURANDLIBFLAGS=
+    CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif + CUOPTFLAGS = -lineinfo + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -173,71 +203,55 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) GPUFLAGS += -allow-unsupported-compiler endif -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler +else ifneq ($(origin REQUIRE_CUDA),undefined) - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning HIP builds are not supported for multi-word CXX "$(CXX)") - override HIP_HOME=disabled - endif + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif +#--- Option 2: CUDA does not exist, HIP exists -> use HIP - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + +else + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) export GPUCC export GPUFLAGS #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -254,7 +268,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -270,7 +284,7 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) GPUFLAGS+= -Xcompiler -mno-float128 endif @@ -285,12 +299,14 @@ override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) -else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) -override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) # AV for Mac (Apple clang compiler) +else ifeq ($(UNAME_S),Darwin) # OM for Mac (any compiler) +override OMPFLAGS = # AV disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = -fopenmp # OM reenable OpenMP MT on Apple clang? 
(AV Oct 2023: this still fails in the CI) else -override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT (default before #575) +override OMPFLAGS = -fopenmp # enable OpenMP MT by default on all other platforms +###override OMPFLAGS = # disable OpenMP MT on all other platforms (default before #575) endif # Set the default AVX (vectorization) choice @@ -356,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -573,8 +589,9 @@ $(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) # Apply special build flags only to check_sa and CurandRandomNumberKernel (curand headers, #679) $(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gcheck_sa.o: CUFLAGS += $(CXXFLAGSCURAND) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gCurandRandomNumberKernel.o: CUFLAGS += $(CXXFLAGSCURAND) ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif @@ -772,12 +789,18 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif +# Use target gtestlibs to build only googletest +ifneq ($(GTESTLIBS),) +gtestlibs: $(GTESTLIBS) +endif + # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 $(GTESTLIBS): ifneq ($(shell which flock 2>/dev/null),) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi flock $(BUILDDIR)/.make_test.lock $(MAKE) -C $(TESTDIR) else - $(MAKE) -C $(TESTDIR) + if [ -d $(TESTDIR) ]; then $(MAKE) -C $(TESTDIR); fi endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/dummy_fct.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/dummy_fct.f index 076cf29d67..4f7a204b8f 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/dummy_fct.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/dummy_fct.f @@ -32,7 +32,7 @@ logical FUNCTION dummy_cuts(P) LOGICAL IS_A_NU(NEXTERNAL),IS_HEAVY(NEXTERNAL) logical do_cuts(nexternal) COMMON /TO_SPECISA/IS_A_J,IS_A_A,IS_A_L,IS_A_B,IS_A_NU,IS_HEAVY, - . IS_A_ONIUM, do_cuts + & IS_A_ONIUM, do_cuts dummy_cuts=.true. 
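The flock recipe in the cudacpp.mk hunk above deserves a note: under 'make -j', several parallel jobs can reach the googletest build at the same time, and flock serializes the recursive $(MAKE) on a lock file so that one job builds while the others wait instead of racing. A minimal sketch of the same pattern follows (hypothetical names: 'test' stands in for $(TESTDIR) and 'build/.make_test.lock' mirrors the real lock file):

# sketch: serialize a recursive make under 'make -j' using flock (Linux only)
gtestlibs:
	mkdir -p build
	flock build/.make_test.lock $(MAKE) -C test

The fallback branch without flock (a plain '$(MAKE) -C $(TESTDIR)') keeps macOS working, where flock is unavailable, at the cost of possible races under parallel builds.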
@@ -118,15 +118,16 @@ double precision function user_dynamical_scale(P) C ************************************************************ -C default for the library implementing a dummt bias function +C default for the library implementing a dummy bias function C ************************************************************ subroutine bias_wgt_custom(p, original_weight, bias_weight) - implicit none + implicit none C C Parameters C include 'nexternal.inc' -C + +C C Arguments C double precision p(0:3, nexternal) @@ -161,3 +162,4 @@ subroutine bias_wgt_custom(p, original_weight, bias_weight) return end subroutine bias_wgt_custom + diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc index 2b956730d4..22ce3f5115 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc @@ -49,11 +49,7 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL GpuRuntime::setUp(); #endif - // Create a process object, read parm card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? - CPPProcess process( /*verbose=*/false ); - process.initProc( "../../Cards/param_card.dat" ); + // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran *ppbridge = new Bridge( *pnevtF, *pnparF, *pnp4F ); } diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile index 74db44d848..d572486c2e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile @@ -9,6 +9,12 @@ FFLAGS+= -cpp # Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) CXXFLAGS = -O3 -Wall -Wshadow -Wextra +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) override CXX:=ccache $(CXX) @@ -51,7 +57,7 @@ CUDACPP_MAKEFILE=cudacpp.mk CUDACPP_MAKEENV:=$(shell echo '$(.VARIABLES)' | tr " " "\n" | egrep "(USEBUILDDIR|AVX|FPTYPE|HELINL|HRDCOD)") ###$(info CUDACPP_MAKEENV=$(CUDACPP_MAKEENV)) ###$(info $(foreach v,$(CUDACPP_MAKEENV),$(v)="$($(v))")) -CUDACPP_BUILDDIR:=$(shell $(MAKE) $(foreach v,$(CUDACPP_MAKEENV),$(v)="$($(v))") -f $(CUDACPP_MAKEFILE) -pn |& awk '/Building/{print $$3}' | sed 
s/BUILDDIR=//) +CUDACPP_BUILDDIR:=$(shell $(MAKE) $(foreach v,$(CUDACPP_MAKEENV),$(v)="$($(v))") -f $(CUDACPP_MAKEFILE) -pn 2>&1 | awk '/Building/{print $$3}' | sed s/BUILDDIR=//) ifeq ($(CUDACPP_BUILDDIR),) $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) else @@ -89,7 +95,12 @@ SYMMETRY = symmetry.o idenparts.o # Binaries -LDFLAGS+=-Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 +ifeq ($(UNAME),Darwin) +LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) +LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" +else +LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) +endif all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp # also builds $(PROG)_cuda if $(CUDACPP_CULIB) exists (#503) @@ -100,8 +111,8 @@ LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' else ifneq ($(shell $(CXX) --version | egrep '^clang'),) override OMPFLAGS = -fopenmp $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###override OMPFLAGS = -fopenmp # OMP is not supported yet by cudacpp for Apple clang +else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang else override OMPFLAGS = -fopenmp endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc index 0ed26180ca..de327f2321 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc @@ -71,6 +71,8 @@ struct CPUTest : public CUDA_CPU_TestBase , hstSelCol( nevt ) , hstIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? process.initProc( "../../Cards/param_card.dat" ); } @@ -183,6 +185,8 @@ struct CUDATest : public CUDA_CPU_TestBase , devSelCol( nevt ) , devIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? 
process.initProc( "../../Cards/param_card.dat" ); } diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc index 016bc0f472..e5167de00c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc @@ -59,7 +59,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) using namespace mg5amcCpu; #endif #ifndef __APPLE__ // test #701 (except on MacOS where feenableexcept is not defined #730) - const bool enableFPE = !getenv( "CUDACPP_RUNTIME_DISABLEFPE" ); + const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" ); + const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" ); if( enableFPE ) { feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 diff --git a/epochX/cudacpp/ee_mumu.mad/bin/generate_events b/epochX/cudacpp/ee_mumu.mad/bin/generate_events index 107313b25d..5577cc66a0 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/generate_events +++ b/epochX/cudacpp/ee_mumu.mad/bin/generate_events @@ -46,7 +46,7 @@ if __debug__ and (not os.path.exists(pjoin(root_path,'../..', 'bin','create_rele sys.path.append(pjoin(root_path,'bin','internal')) import madevent_interface as ME - +import misc as misc import logging import logging.config @@ -160,17 +160,31 @@ if '__main__' == __name__: # Check that python version is valid set_configuration() - argument = sys.argv + argument = sys.argv + + # check for plugin customization of the launch command + launch_interface = ME.MadEventCmdShell + if os.path.exists(pjoin(root_path, 'bin','internal', 'launch_plugin.py')): + with misc.TMP_variable(sys, 'path', sys.path + [pjoin(root_path, 'bin', 'internal')]): + from importlib import reload + try: + reload('launch_plugin') + except Exception as error: + import launch_plugin + launch_interface = launch_plugin.MEINTERFACE + + + try: if '-h' in argument or '--help' in argument: - launch = ME.MadEventCmdShell(me_dir=root_path, force_run=True) + launch = launch_interface(me_dir=root_path, force_run=True) launch.exec_cmd('help generate_events') sys.exit() elif len(argument) > 1 and argument[1] in ['0', '1', '2']: argument = treat_old_argument(argument) with ME.MadEventCmdShell.RunWebHandling(root_path, ): - launch = ME.MadEventCmdShell(me_dir=root_path, force_run=True) + launch = launch_interface(me_dir=root_path, force_run=True) launch.run_cmd('generate_events %s' % ' '.join(argument[1:])) launch.run_cmd('quit') except ME.MadEventAlreadyRunning as message: diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/__init__.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py index c1e54d3cb9..bd1517985f 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py @@ -537,7 +537,7 @@ def charge_card(self, tag): self.param_card = param_card_reader.ParamCard(param_card) return self.param_card elif tag == 'mgruncard': - self.run_card = RunCard(self[tag]) + self.run_card = RunCard(self[tag], 
unknown_warning=False) return self.run_card elif tag == 'mg5proccard': proc_card = self[tag].split('\n') @@ -1002,14 +1002,18 @@ def __init__(self, finput=None, **opt): self.allowed_value = {} self.default_setup() + self.plugin_input(finput) # if input is define read that input if isinstance(finput, (file, str, StringIO.StringIO)): self.read(finput, **opt) + + def plugin_input(self, finput=None): + pass def default_setup(self): @@ -2621,7 +2625,28 @@ class RunCard(ConfigFile): default_include_file = 'run_card.inc' default_autodef_file = 'run.inc' donewarning = [] + include_as_parameter = [] + + def plugin_input(self, finput): + if not finput and not MADEVENT: + return + curr_dir = None + if isinstance(finput, file): + # expected path to be like "XXXX/Cards/run_card.dat" + curr_dir = os.path.dirname(os.path.dirname(finput.name)) + elif isinstance(finput, str): + curr_dir = os.path.dirname(os.path.dirname(finput)) + + if curr_dir: + if os.path.exists(pjoin(curr_dir, 'bin', 'internal', 'plugin_run_card')): + # expected format {} passing everything as optional argument + for line in open(pjoin(curr_dir, 'bin', 'internal', 'plugin_run_card')): + if line.startswith('#'): + continue + opts = dict(eval(line)) + self.add_param(**opts) + @classmethod def fill_post_set_from_blocks(cls): """set the post_set function for any parameter defined in a run_block""" @@ -2647,18 +2672,48 @@ def __new__(cls, finput=None, **opt): elif isinstance(finput, cls): target_class = finput.__class__ elif isinstance(finput, str): + path = finput if '\n' not in finput: finput = open(finput).read() if 'req_acc_FO' in finput: target_class = RunCardNLO else: target_class = RunCardLO + if MADEVENT and os.path.exists(pjoin(MEDIR, 'bin','internal', 'launch_plugin.py')): + with misc.TMP_variable(sys, 'path', sys.path + [pjoin(MEDIR, 'bin', 'internal')]): + from importlib import reload + try: + reload('launch_plugin') + except Exception as error: + import launch_plugin + target_class = launch_plugin.RunCard + elif not MADEVENT: + if 'run_card.dat' in path: + launch_plugin_path = path.replace('run_card.dat', '../bin/internal/launch_plugin.py') + elif 'run_card_default.dat' in path: + launch_plugin_path = path.replace('run_card_default.dat', '../bin/internal/launch_plugin.py') + else: + launch_plugin_path = None + if launch_plugin_path and os.path.exists(launch_plugin_path): + misc.sprint('try to use plugin class', path.replace('run_card.dat', '../bin/internal/launch_plugin.py')) + pydir = os.path.dirname(launch_plugin_path) + with misc.TMP_variable(sys, 'path', sys.path + [pydir]): + from importlib import reload + try: + reload('launch_plugin') + except Exception as error: + import launch_plugin + target_class = launch_plugin.RunCard + elif issubclass(finput, RunCard): + target_class = finput else: return None target_class.fill_post_set_from_blocks() - - return super(RunCard, cls).__new__(target_class, finput, **opt) + out = super(RunCard, cls).__new__(target_class, finput, **opt) + if not isinstance(out, RunCard): #should not happen but in presence of missmatch of library loaded. 
+            out.__init__(finput, **opt)
+            return out
         else:
             return super(RunCard, cls).__new__(cls, finput, **opt)
@@ -2686,7 +2741,7 @@ def __init__(self, *args, **opts):
         self.system_default = {}

         self.display_block = [] # set some block to be displayed
-
+        self.fct_mod = {} # {param: (fct_pointer, *argument, **opts)}
         self.cut_class = {}
         self.warned=False
@@ -2723,7 +2778,7 @@ def get_lepton_densities(cls):

     def add_param(self, name, value, fortran_name=None, include=True,
                   hidden=False, legacy=False, cut=False, system=False, sys_default=None,
-                  autodef=False,
+                  autodef=False, fct_mod=None,
                   **opts):
         """ add a parameter to the card. value is the default value and
         defines the type (int/float/bool/str) of the input.
@@ -2737,6 +2792,7 @@ def add_param(self, name, value, fortran_name=None, include=True,
            If a path (Source/PDF/pdf.inc) the definition will be added within that file
            Default is False (does not add the definition)
            entry added in the run_card will automatically have this on True.
+        fct_mod: defines a function to run if the parameter is modified in the include file
         options of **opts:
         - allowed: list of valid options. '*' means anything else should be allowed.
          empty list means anything possible as well.
@@ -2761,15 +2817,22 @@ def add_param(self, name, value, fortran_name=None, include=True,
         if autodef:
             self.definition_path[autodef].append(name)
             self.user_set.add(name)
+        # function to trigger if a value is modified in the include file
+        # main target is action to force correct recompilation (like for compilation flag/...)
+        if fct_mod:
+            self.fct_mod[name] = fct_mod

-    def read(self, finput, consistency=True):
+    def read(self, finput, consistency=True, unknown_warning=True, **opt):
         """Read the input file, this can be a path to a file,
            a file object, a str with the content of the file."""

         if isinstance(finput, str):
             if "\n" in finput:
                 finput = finput.split('\n')
+                if 'path' in opt:
+                    self.path = opt['path']
             elif os.path.isfile(finput):
+                self.path = finput
                 finput = open(finput)
             else:
                 raise Exception("No such file %s" % finput)
@@ -2784,7 +2847,7 @@ def read(self, finput, consistency=True):
             name = name.lower().strip()
             if name not in self:
                 #looks like an entry added by a user -> add it nicely
-                self.add_unknown_entry(name, value)
+                self.add_unknown_entry(name, value, unknown_warning)
             else:
                 self.set( name, value, user=True)
         # parameter not set in the run_card can be set to compatibility value
@@ -2796,7 +2859,7 @@ def read(self, finput, consistency=True):
                     logger.warning(str(error))
                 else:
                     raise
-    def add_unknown_entry(self, name, value):
+    def add_unknown_entry(self, name, value, unknown_warning):
         """function to add an entry to the run_card when the associated parameter
         does not exist. This is based on the guess_entry_fromname for the various
         syntax providing input. This then calls add_param accordingly.
@@ -2835,7 +2898,7 @@ def add_unknown_entry(self, name, value):
                 raise Exception("dictionary need to have at least one entry")
             default['dict']['__type__'] = default[self.guess_type_from_value(default_value[0])]

-        if name not in RunCard.donewarning:
+        if name not in RunCard.donewarning and unknown_warning:
            logger.warning("Found unexpected entry in run_card: \"%s\" with value \"%s\".\n"+\
                           " The type was assigned to %s. \n"+\
                           " The definition of that variable will %sbe automatically added to fortran file %s\n"+\
@@ -2873,7 +2936,17 @@ def valid_line(self, line, tmp):
                 return False
         else:
             return True
-
+
+
+    def reset_simd(self, old_value, new_value, name, *args, **opts):
+        #return
+        raise Exception('pass in reset simd')
+
+    def make_clean(self,old_value, new_value, name, dir):
+        raise Exception('pass make clean for ', dir)
+
+    def make_Ptouch(self,old_value, new_value, name, reset):
+        raise Exception('pass Ptouch for ', reset)

     def write(self, output_file, template=None, python_template=False,
               write_hidden=False, template_options=None, **opt):
@@ -2898,11 +2971,12 @@ def write(self, output_file, template=None, python_template=False,
         if python_template and not to_write:
             import string
             if self.blocks:
-                text = string.Template(text)
                 mapping = {}
                 for b in self.blocks:
                     mapping[b.name] = b.get_template(self)
-                text = text.substitute(mapping)
+                    if "$%s" % b.name not in text:
+                        text += "\n$%s\n" % b.name
+                text = string.Template(text).substitute(mapping)

             if not self.list_parameter:
                 text = text % self
@@ -3048,6 +3122,77 @@ def write(self, output_file, template=None, python_template=False,
         else:
             output_file.write(text)

+    def get_last_value_include(self, output_dir):
+        """For parameters in self.fct_mod
+        parse the associated inc file to get the value of the previous run.
+        We return a dictionary {name: old_value}
+        if inc file does not exist we will return the current value (i.e. set has no change)
+        """
+
+        #remember that
+        # default_include_file is a class variable
+        # self.includepath is on the form include_path : [list of param ]
+        out = {}
+
+        # setup inc_to_parse to be like self.includepath (include_path : [list of param ])
+        # BUT only containing the parameter that need to be tracked for the fct_mod option
+        inc_to_parse = {}
+        for inc_file, params in self.includepath.items():
+            if not inc_file:
+                continue
+            if any(p in params for p in self.fct_mod):
+                inc_to_parse[inc_file] = [name for name in self.includepath[inc_file] if name in self.fct_mod]
+
+        # now loop over the files and ask the associated function
+        for inc_file, params in inc_to_parse.items():
+            if inc_file is True:
+                inc_file = self.default_include_file
+            out.update(self.get_value_from_include(inc_file, params, output_dir))
+
+        return out
+
+    def get_value_from_include(self, path, list_of_params, output_dir):
+        """for a given include file return the current value of the requested parameter
+        return a dictionary {name: value}
+        if path does not exist return the current value in self for all parameters"""
+
+        #WARNING DOES NOT HANDLE LIST/DICT so far
+
+        # handle case where file is missing
+        if not os.path.exists(pjoin(output_dir,path)):
+            misc.sprint("include file not existing", pjoin(output_dir,path))
+            return {name: self[name] for name in list_of_params}
+
+        with open(pjoin(output_dir,path), 'r') as fsock:
+            text = fsock.read()
+
+        for name in list_of_params:
+            misc.sprint(name, name in self.fortran_name)
+            misc.sprint(self.fortran_name[name] if name in self.fortran_name else name)
+        to_track = [self.fortran_name[name] if name in self.fortran_name else name for name in list_of_params]
+        pattern = re.compile(r"\(?(%(names)s)\s?=\s?([^)]*)\)?"
     def get_default(self, name, default=None, log_level=None):
         """return self[name] if it exists, otherwise default. log controls if we
@@ -3338,71 +3483,93 @@ def write_include_file(self, output_dir, output_file=None):

         #ensure that system-only parameters are correctly set
         self.update_system_parameter_for_include()

+        value_in_old_include = self.get_last_value_include(output_dir)
+
+
         if output_dir:
             self.write_autodef(output_dir, output_file=None)
             # check/fix status of customised functions
             self.edit_dummy_fct_from_file(self["custom_fcts"], os.path.dirname(output_dir))

         for incname in self.includepath:
-            if incname is True:
-                pathinc = self.default_include_file
-            elif incname is False:
-                continue
-            else:
-                pathinc = incname
+            self.write_one_include_file(output_dir, incname, output_file)
+
+        for name, value in value_in_old_include.items():
+            if value != self[name]:
+                self.fct_mod[name][0](value, self[name], name, *self.fct_mod[name][1], **self.fct_mod[name][2])

-            if output_file:
-                fsock = output_file
+    def write_one_include_file(self, output_dir, incname, output_file=None):
+        """write one include file at a time"""
+
+        misc.sprint(incname)
+        if incname is True:
+            pathinc = self.default_include_file
+        elif incname is False:
+            return
+        else:
+            pathinc = incname
+
+        if output_file:
+            fsock = output_file
+        else:
+            fsock = file_writers.FortranWriter(pjoin(output_dir,pathinc+'.tmp'))
+
+
+        for key in self.includepath[incname]:
+            #define the fortran name
+            if key in self.fortran_name:
+                fortran_name = self.fortran_name[key]
             else:
-                fsock = file_writers.FortranWriter(pjoin(output_dir,pathinc+'.tmp'))
-            for key in self.includepath[incname]:
-                #define the fortran name
-                if key in self.fortran_name:
-                    fortran_name = self.fortran_name[key]
+                fortran_name = key
+
+            if incname in self.include_as_parameter:
+                fsock.writelines('INTEGER %s\n' % fortran_name)
+            #get the value with a warning if the user didn't set it
+            value = self.get_default(key)
+            if hasattr(self, 'mod_inc_%s' % key):
+                value = getattr(self, 'mod_inc_%s' % key)(value)
+            # Special treatment for strings containing a list of
+            # strings. Convert it to a list of strings
+            if isinstance(value, list):
+                # in case of a list, add the length of the list as the 0th
+                # element in fortran. Only in case of an integer or float
+                # list (not for bool nor string)
+                targettype = self.list_parameter[key]
+                if targettype is bool:
+                    pass
+                elif targettype is int:
+                    line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(len(value)))
+                    fsock.writelines(line)
+                elif targettype is float:
+                    line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(float(len(value))))
+                    fsock.writelines(line)
+                # output the rest of the list in fortran
+                for i,v in enumerate(value):
+                    line = '%s(%s) = %s \n' % (fortran_name, i+1, self.f77_formatting(v))
+                    fsock.writelines(line)
+            elif isinstance(value, dict):
+                for fortran_name, onevalue in value.items():
+                    line = '%s = %s \n' % (fortran_name, self.f77_formatting(onevalue))
+                    fsock.writelines(line)
+            elif isinstance(incname,str) and 'compile' in incname:
+                if incname in self.include_as_parameter:
+                    line = 'PARAMETER (%s=%s)' % (fortran_name, value)
                 else:
-                    fortran_name = key
-
-                #get the value with warning if the user didn't set it
-                value = self.get_default(key)
-                if hasattr(self, 'mod_inc_%s' % key):
-                    value = getattr(self, 'mod_inc_%s' % key)(value)
-                # Special treatment for strings containing a list of
-                # strings. Convert it to a list of strings
-                if isinstance(value, list):
-                    # in case of a list, add the length of the list as 0th
-                    # element in fortran. Only in case of integer or float
-                    # list (not for bool nor string)
-                    targettype = self.list_parameter[key]
-                    if targettype is bool:
-                        pass
-                    elif targettype is int:
-                        line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(len(value)))
-                        fsock.writelines(line)
-                    elif targettype is float:
-                        line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(float(len(value))))
-                        fsock.writelines(line)
-                    # output the rest of the list in fortran
-                    for i,v in enumerate(value):
-                        line = '%s(%s) = %s \n' % (fortran_name, i+1, self.f77_formatting(v))
-                        fsock.writelines(line)
-                elif isinstance(value, dict):
-                    for fortran_name, onevalue in value.items():
-                        line = '%s = %s \n' % (fortran_name, self.f77_formatting(onevalue))
-                        fsock.writelines(line)
-                elif isinstance(incname,str) and 'compile' in incname:
                     line = '%s = %s \n' % (fortran_name, value)
-                    fsock.write(line)
+                fsock.write(line)
+            else:
+                if incname in self.include_as_parameter:
+                    line = 'PARAMETER (%s=%s)' % (fortran_name, self.f77_formatting(value))
                 else:
                     line = '%s = %s \n' % (fortran_name, self.f77_formatting(value))
-                    fsock.writelines(line)
-            if not output_file:
-                fsock.close()
-                path = pjoin(output_dir,pathinc)
-                if not os.path.exists(path) or not filecmp.cmp(path, path+'.tmp'):
-                    files.mv(path+'.tmp', path)
-                else:
-                    os.remove(path+'.tmp')
-
+                fsock.writelines(line)
+        if not output_file:
+            fsock.close()
+            path = pjoin(output_dir,pathinc)
+            if not os.path.exists(path) or not filecmp.cmp(path, path+'.tmp'):
+                files.mv(path+'.tmp', path)
+            else:
+                os.remove(path+'.tmp')
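The close/compare/move sequence at the end of write_one_include_file is what keeps make from rebuilding when an include file is regenerated with identical content: the text is written to a .tmp sibling and only moved over the original when filecmp reports a difference, so the original's timestamp survives unchanged writes. The same idiom in isolation (the function name and the use of shutil are illustrative; the patch itself uses files.mv):

# Sketch of the write-compare-move idiom used above (illustrative, not the patch's code).
import filecmp, os, shutil

def update_if_changed(path, new_text):
    tmp = path + '.tmp'
    with open(tmp, 'w') as f:
        f.write(new_text)
    if not os.path.exists(path) or not filecmp.cmp(path, tmp):
        shutil.move(tmp, path)   # content changed: replace, mtime moves forward
        return True
    os.remove(tmp)               # identical: keep the old file and its timestamp
    return False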
     def write_autodef(self, output_dir, output_file=None):
         """ Add the definition of variables to run.inc if the variable is set with autodef.
@@ -3741,13 +3908,14 @@ def remove_all_cut(self):
 %(tmin_for_channel)s = tmin_for_channel ! limit the non-singular reach of --some-- channels of integration related to T-channel diagrams (value between -1 and 0), -1 is no impact
 %(survey_splitting)s = survey_splitting ! for loop-induced, control how many cores are used at survey for the computation of a single iteration.
 %(survey_nchannel_per_job)s = survey_nchannel_per_job ! control how many channels are integrated inside a single job on cluster/multicore
- %(refine_evt_by_job)s = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means fewer jobs)
+ %(refine_evt_by_job)s = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means fewer jobs)
 #*********************************************************************
-# Compilation flag. No automatic re-compilation (need manual "make clean" in Source)
+# Compilation flag.
 #*********************************************************************
  %(global_flag)s = global_flag ! fortran optimization flag used for all the code.
  %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha functions. Suggestions: '-ffast-math'
  %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f functions. Suggestions: '-O3'
+ %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
 """

 template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"'
@@ -3903,9 +4071,12 @@ class RunCardLO(RunCard):
         "get_dummy_x1_x2": pjoin("SubProcesses","dummy_fct.f"),
         "dummy_boostframe": pjoin("SubProcesses","dummy_fct.f"),
         "user_dynamical_scale": pjoin("SubProcesses","dummy_fct.f"),
+        "bias_wgt_custom": pjoin("SubProcesses","dummy_fct.f"),
         "user_": pjoin("SubProcesses","dummy_fct.f") # all functions starting with user will be added to that file
     }

+    include_as_parameter = ['vector.inc']
+
     if MG5DIR:
         default_run_card = pjoin(MG5DIR, "internal", "default_run_card_lo.dat")

@@ -4139,10 +4310,15 @@ def default_setup(self):
         self.add_param('hel_splitamp', True, hidden=True, include=False, comment='decide if amplitude aloha calls can be split in two or not when doing helicity-per-helicity optimization.')
         self.add_param('hel_zeroamp', True, hidden=True, include=False, comment='decide if zero amplitudes can be removed from the computation when doing helicity-per-helicity optimization.')
         self.add_param('SDE_strategy', 1, allowed=[1,2], fortran_name="sde_strat", comment="decide how Multi-channel should behave: \"1\" means full single diagram enhanced (hep-ph/0208156), \"2\" uses the product of the denominators")
-        self.add_param('global_flag', '-O', include=False, hidden=True, comment='global fortran compilation flag, suggestion -fbound-check')
-        self.add_param('aloha_flag', '', include=False, hidden=True, comment='global fortran compilation flag, suggestion: -ffast-math')
-        self.add_param('matrix_flag', '', include=False, hidden=True, comment='fortran compilation flag for the matrix-element files, suggestion -O3')
-
+        self.add_param('global_flag', '-O', include=False, hidden=True, comment='global fortran compilation flag, suggestion -fbounds-check',
+                       fct_mod=(self.make_clean, ('Source',), {}))
+        self.add_param('aloha_flag', '', include=False, hidden=True, comment='fortran compilation flag for the aloha functions, suggestion: -ffast-math',
+                       fct_mod=(self.make_clean, ('Source/DHELAS',), {}))
+        self.add_param('matrix_flag', '', include=False, hidden=True, comment='fortran compilation flag for the matrix-element files, suggestion -O3',
+                       fct_mod=(self.make_Ptouch, ('matrix',), {}))
+        self.add_param('vector_size', 1, include='vector.inc', hidden=True, comment='lockstep size for parallel runs',
+                       fortran_name='VECSIZE_MEMMAX', fct_mod=(self.reset_simd, (), {}))
+
         # parameters allowing to define simple cuts via the pdg
         # Special syntax are related to those (cannot be edited directly)
         self.add_param('pt_min_pdg', {'__type__':0.}, include=False, cut=True)
@@ -4164,8 +4340,7 @@ def default_setup(self):
         self.add_param('mxxmin4pdg',[-1.], system=True)
         self.add_param('mxxpart_antipart', [False], system=True)

-        # CUDACPP parameters
-        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False)
+
     def check_validity(self):
         """ """
@@ -4704,6 +4879,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def):
                     continue
                 break

+        if proc_characteristic['ninitial'] == 1:
+            self['SDE_strategy'] = 1
+
         if 'MLM' in proc_characteristic['limitations']:
             if self['dynamical_scale_choice'] == -1:
                 self['dynamical_scale_choice'] = 3
@@ -5769,7 +5947,7 @@ def default_setup(self):
         self.add_param("CheckCycle", 3)
         self.add_param("MaxAttempts", 10)
         self.add_param("ZeroThres", 1e-9)
-        self.add_param("OSThres", 1.0e-13)
+        self.add_param("OSThres", 1.0e-8)
         self.add_param("DoubleCheckHelicityFilter", True)
         self.add_param("WriteOutFilters", True)
         self.add_param("UseLoopFilter", False)
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py
index fe874a06a4..71089d7480 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py
@@ -85,7 +85,7 @@ def load_str(self, text):
             self.value= ' '.join(data[len(self.lhacode):])
         # check that the lhacode entries come first, otherwise return an invalid param.
         if ' '.join([str(i) for i in self.lhacode]) != ' '.join(data[:len(self.lhacode)]):
-            raise InvalidParam
+            raise InvalidParam("line was %s" % str(data))
         else:
             self.value = data[-1]

diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py
index 5d0187e3fa..87cb4b88df 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py
@@ -749,13 +749,15 @@ def writeRunWeb(me_dir):

 class RunWebHandling(object):

-    def __init__(self, me_dir, crashifpresent=True, warnifpresent=True):
+    def __init__(self, me_dir, crashifpresent=True, warnifpresent=True, force_run=False):
         """raise an error if RunWeb already exists.
         me_dir is the directory where to write RunWeb"""

         self.remove_run_web = True
         self.me_dir = me_dir
-
+        if force_run:
+            self.remove_run_web = False
+            return
         if crashifpresent or warnifpresent:
             if os.path.exists(pjoin(me_dir, 'RunWeb')):
                 pid = open(pjoin(me_dir, 'RunWeb')).read()
@@ -4904,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto
         self.load_default()
         self.define_paths(**opt)
         self.last_editline_pos = 0
+        self.update_dependent_done = False

         if 'allow_arg' not in opt or not opt['allow_arg']:
             # add some minimal content for this:
@@ -6574,7 +6577,7 @@ def reask(self, *args, **opt):

     fail_due_to_format = 0 #parameter to avoid infinite loop
     def postcmd(self, stop, line):
-        if line not in [None, '0', 'done', '']:
+        if line not in [None, '0', 'done', '', 0]:
             ending_question = cmd.OneLinePathCompletion.postcmd(self,stop,line)
         else:
             ending_question = True
@@ -6583,7 +6586,9 @@
             self.check_card_consistency()
             if self.param_consistency:
                 try:
-                    self.do_update('dependent', timer=20)
+                    if not self.update_dependent_done:
+                        self.do_update('dependent', timer=20)
+                    self.update_dependent_done = False
                 except MadGraph5Error as error:
                     if 'Missing block:' in str(error):
                         self.fail_due_to_format += 1
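The update_dependent_done flag introduced above is a plain run-once latch: postcmd fires after every answer in the card-editing loop, and recomputing the dependent parameters each time is wasteful, so the first successful do_update('dependent') (see the next hunk) sets the flag and later calls skip the work. A minimal sketch of the pattern — the class and method names are illustrative:

# Minimal run-once guard, mirroring update_dependent_done above (illustrative).
class Editor:
    def __init__(self):
        self.update_dependent_done = False

    def do_update_dependent(self):
        print("recomputing dependent parameters")   # expensive step, runs once
        self.update_dependent_done = True

    def postcmd(self):
        if not self.update_dependent_done:
            self.do_update_dependent()

ed = Editor()
ed.postcmd()   # recomputes
ed.postcmd()   # skipped: the flag is already set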
@@ -6636,6 +6641,8 @@ def do_update(self, line, timer=0):
             self.update_dependent(self.mother_interface, self.me_dir, self.param_card,
                                   self.paths['param'], timer, run_card=self.run_card,
                                   lhapdfconfig=self.lhapdf)
+            self.update_dependent_done = True
+
         elif args[0] == 'missing':
             self.update_missing()

@@ -6715,12 +6722,13 @@
     class TimeOutError(Exception):
         pass
     def handle_alarm(signum, frame):
         raise TimeOutError
     signal.signal(signal.SIGALRM, handle_alarm)
+
     if timer:
-        signal.alarm(timer)
         log_level = 30
     else:
         log_level = 20
+
     if run_card:
         as_for_pdf = {'cteq6_m': 0.118,
                       'cteq6_d': 0.118,
@@ -6779,6 +6787,10 @@ def handle_alarm(signum, frame):
                 logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel])
                 modify = True

+    if timer:
+        signal.alarm(timer)
+
+
     # Try to load the model in the limited amount of time allowed
     try:
         model = mecmd.get_model()
@@ -6907,7 +6919,8 @@ def check_block(self, blockname):
     def check_answer_consistency(self):
         """function called if the code reads a file"""
         self.check_card_consistency()
-        self.do_update('dependent', timer=20)
+        if not self.update_dependent_done:
+            self.do_update('dependent', timer=20)

     def help_set(self):
         '''help message for set'''
@@ -7533,7 +7546,8 @@ def open_file(self, answer):
             else:
                 raise
         if time.time() - start < .5:
-            self.mother_interface.ask("Are you really that fast? If you are using an editor that returns directly. Please confirm that you have finised to edit the file", 'y')
+            self.mother_interface.ask("Are you really that fast? If you are using an editor that returns immediately, please confirm that you have finished editing the file", 'y',
+                                      timeout=False)
         self.reload_card(path)

     def reload_card(self, path):
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py
index a6a8609dce..2f37070580 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py
@@ -1108,9 +1108,12 @@ def ask(self, question, default, choices=[], path_msg=None,
         if alias:
             choices += list(alias.keys())

+
+
         question_instance = obj(question, allow_arg=choices, default=default,
                                 mother_interface=self, **opt)
-
+        if fct_timeout is None:
+            fct_timeout = lambda x: question_instance.postcmd(x, default) if x and default else False
         if first_cmd:
             if isinstance(first_cmd, str):
                 question_instance.onecmd(first_cmd)
@@ -2271,6 +2274,9 @@ def postcmd(self, stop, line):
             if n:
                 self.default(line)
                 return self.postcmd(stop, line)
+        elif self.value is None and line:
+            self.default(line)
+            return self.postcmd(stop, line)
         if not self.casesensitive:
             for ans in self.allow_arg:
                 if ans.lower() == self.value.lower():
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py
index 3b8ec31215..5fd170d18d 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py
@@ -154,9 +154,18 @@ def get_helicity(self, to_submit=True, clean=True):
             p = misc.Popen(['./gensym'], stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT, cwd=Pdir)
             #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts
+
             (stdout, _) = p.communicate(''.encode())
             stdout = stdout.decode('ascii',errors='ignore')
-            nb_channel = max([math.floor(float(d)) for d in stdout.split()])
+            if stdout:
+                nb_channel = max([math.floor(float(d)) for d in stdout.split()])
+            else:
+                for matrix_file in misc.glob('matrix*orig.f', Pdir):
+                    files.cp(matrix_file, matrix_file.replace('orig','optim'))
+                P_zero_result.append(Pdir)
+                if os.path.exists(pjoin(self.me_dir, 'error')):
+                    os.remove(pjoin(self.me_dir, 'error'))
+                continue # bypass bad process

             self.cmd.compile(['madevent_forhel'], cwd=Pdir)
             if not os.path.exists(pjoin(Pdir, 'madevent_forhel')):
@@ -178,11 +187,13 @@
             #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts
             (stdout, _) = p.communicate(" ".encode())
             stdout = stdout.decode('ascii',errors='ignore')
-            if os.path.exists(pjoin(self.me_dir,'error')):
+            if os.path.exists(pjoin(self.me_dir, 'error')):
                 raise Exception(pjoin(self.me_dir,'error'))
                 # note: a continue is not enough here; we have on top to link
                 # the matrixX_optim.f to matrixX_orig.f to let the code work
                 # after this error.
+                # for matrix_file in misc.glob('matrix*orig.f', Pdir):
+                #     files.cp(matrix_file, matrix_file.replace('orig','optim'))

             if 'no events passed cuts' in stdout:
                 raise Exception
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py
index cff8789e38..a6b8582e1a 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py
@@ -342,7 +342,12 @@ def next_event(self):
                 text.append(line)
             if '</event>' in line:
-                if self.parsing:
+                if self.parsing == "wgt_only":
+                    out = Event(text, parse_momenta=False)
+                    #if len(out) == 0 and not self.allow_empty_event:
+                    #    raise Exception
+                    return out
+                elif self.parsing:
                     out = Event(text)
                     if len(out) == 0 and not self.allow_empty_event:
                         raise Exception
@@ -448,6 +453,8 @@ def unweight(self, outputpath, get_wgt=None, max_wgt=0, trunc_error=0,
         event_target: reweight for that many events with maximal trunc_error.
                       (stop writing events when target is reached)
         """
+        self.parsing = 'wgt_only'
+
         if not get_wgt:
             def weight(event):
                 return event.wgt
@@ -914,6 +921,8 @@ class MultiEventFile(EventFile):
     The number of events in each file needs to be provided in advance (if not provided,
     the file is first read to find that number)"""

+    parsing = True # check if/when we need to parse the event.
+
     def __new__(cls, start_list=[], parse=True):
         return object.__new__(MultiEventFile)

@@ -986,6 +995,7 @@ def next(self):
             nb_event = random.randint(1, remaining_event)
             sum_nb = 0
             for i, obj in enumerate(self.files):
+                obj.parsing = "wgt_only"
                 sum_nb += self.initial_nb_events[i] - self.curr_nb_events[i]
                 if nb_event <= sum_nb:
                     self.curr_nb_events[i] += 1
@@ -1065,6 +1075,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None):
         # check special case without PDF for one (or both) beam
         if init_information["idbmup1"] in [0,9]:
             event = next(self)
+            if len(event) == 0:
+                event = Event(str(event))
             init_information["idbmup1"] = event[0].pdg
         if init_information["idbmup2"] == 0:
             init_information["idbmup2"] = event[1].pdg
@@ -1115,6 +1127,7 @@ def initialize_unweighting(self, getwgt, trunc_error):
         total_event = 0
         sum_cross = collections.defaultdict(int)
         for i,f in enumerate(self.files):
+            f.parsing = 'wgt_only'
             nb_event = 0
             # We need to loop over the event file to get some information about the
             # new cross-section/ wgt of event.
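The parsing == 'wgt_only' mode threaded through next_event, unweight and MultiEventFile above defers the expensive per-particle parse: during unweighting only event.wgt is needed, so the event body is kept as raw text and just the weight field of the global line is converted. A toy two-phase parser showing the same idea — the LHE-like snippet and names are made up, this is not lhe_parser's code:

# Toy two-phase parse mirroring the 'wgt_only' mode above (illustrative).
def parse_event(lines, wgt_only=False):
    header = lines[0].split()            # "nexternal ievent wgt scale aqed aqcd"
    event = {'wgt': float(header[2])}    # the weight is always converted
    if not wgt_only:                     # full parse only when momenta are needed
        event['particles'] = [tuple(map(float, l.split())) for l in lines[1:]]
    else:
        event['raw'] = lines[1:]         # keep the body as text for a later re-parse
    return event

evt = parse_event(['4 1 +0.5 91.18 0.0078 0.118',
                   '11 -1 0 0 0 0 0.0 0.0 45.6 45.6 0.0 0. 1.'], wgt_only=True)
print(evt['wgt'])   # 0.5 -- all that unweighting needs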
@@ -1302,7 +1315,7 @@ class Event(list):

     warning_order = True # raise a warning if the order of the particles does not respect the child/mother ordering

-    def __init__(self, text=None):
+    def __init__(self, text=None, parse_momenta=True):
         """The initialization of an empty Event (or of one associated to a text file)"""
         list.__init__(self)

@@ -1322,15 +1335,15 @@ def __init__(self, text=None):
         self.matched_scale_data = None
         self.syscalc_data = {}
         if text:
-            self.parse(text)
+            self.parse(text, parse_momenta=parse_momenta)

-
-    def parse(self, text):
+    event_flag_pattern = re.compile(r"""(\w*)=(?:(?:['"])([^'"]*)(?=['"])|(\S*))""")
+    def parse(self, text, parse_momenta=True):
         """Take the input file and create the structured information"""
         #text = re.sub(r'', '', text) # remove pointless tag
         status = 'first'
-
+        tags = []
         if not isinstance(text, list):
             text = text.split('\n')

@@ -1354,24 +1367,28 @@ def parse(self, text):
             if '<' in line:
                 status = 'tag'
             else:
-                self.assign_scale_line(line)
+                self.assign_scale_line(line, convert=parse_momenta)
                 status = 'part'
             continue

             if '<' in line:
                 status = 'tag'

             if 'part' == status:
-                part = Particle(line, event=self)
-                if part.E != 0 or part.status == -1:
-                    self.append(part)
-                elif self.nexternal:
-                    self.nexternal -= 1
+                if parse_momenta:
+                    part = Particle(line, event=self)
+                    if part.E != 0 or part.status == -1:
+                        self.append(part)
+                    elif self.nexternal:
+                        self.nexternal -= 1
+                else:
+                    tags.append(line)
             else:
-                if '</event>' in line:
+                if line.endswith('</event>'):
                     line = line.replace('</event>','',1)
-                self.tag += '%s\n' % line
-
-        self.assign_mother()
+                tags.append(line)
+        self.tag += "\n".join(tags)
+        if parse_momenta:
+            self.assign_mother()

     def assign_mother(self):
@@ -1905,19 +1922,27 @@ def check(self):

         #3. check mass

-    def assign_scale_line(self, line):
+    def assign_scale_line(self, line, convert=True):
         """read the line corresponding to the global event line
         format of the line is:
         Nexternal IEVENT WEIGHT SCALE AEW AS
         """
         inputs = line.split()
         assert len(inputs) == 6
-        self.nexternal = int(inputs[0])
-        self.ievent = int(inputs[1])
-        self.wgt = float(inputs[2])
-        self.scale = float(inputs[3])
-        self.aqed = float(inputs[4])
-        self.aqcd = float(inputs[5])
+        if convert:
+            self.nexternal = int(inputs[0])
+            self.ievent = int(inputs[1])
+            self.wgt = float(inputs[2])
+            self.scale = float(inputs[3])
+            self.aqed = float(inputs[4])
+            self.aqcd = float(inputs[5])
+        else:
+            self.nexternal = inputs[0]
+            self.ievent = inputs[1]
+            self.wgt = float(inputs[2])
+            self.scale = inputs[3]
+            self.aqed = inputs[4]
+            self.aqcd = inputs[5]
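With convert=False the global-line fields stay exactly as read (only wgt becomes a float, since unweighting rescales it), and the __str__ change in the next hunk falls back to plain %s formatting for such string fields, so an untouched event can be echoed back without renormalizing every number. A toy illustration of the two paths (the sample line is made up):

# Illustrative only: the two formatting paths of the convert flag above.
fields = '4 1 +0.5000000e+00 91.18 0.0078 0.118'.split()
# convert=True  -> numeric types, values are reformatted on output
nexternal, wgt = int(fields[0]), float(fields[2])
print("%2d %+13.7e" % (nexternal, wgt))    # ' 4 +5.0000000e-01'
# convert=False -> keep the strings, echo them back untouched (only wgt is a float)
nexternal_s, wgt = fields[0], float(fields[2])
print("%s %+13.7e" % (nexternal_s, wgt))   # '4 +5.0000000e-01'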
     def get_tag_and_order(self):
         """Return the unique tag identifying the SubProcesses for the generation.
@@ -2269,7 +2294,11 @@ def __str__(self, event_id=''):
         else:
             event_flag = ''

-        scale_str = "%2d %6d %+13.7e %14.8e %14.8e %14.8e" % \
+        try:
+            scale_str = "%2d %6d %+13.7e %14.8e %14.8e %14.8e" % \
+                (self.nexternal,self.ievent,self.wgt,self.scale,self.aqed,self.aqcd)
+        except:
+            scale_str = "%s %s %+13.7e %s %s %s" % \
             (self.nexternal,self.ievent,self.wgt,self.scale,self.aqed,self.aqcd)

diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py
index b70b548e53..cb6bf4ca57 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py
@@ -1,4 +1,4 @@
-################################################################################
+###############################################################################
 #
 # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors
 #
@@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None):

         if mode in ['run', 'all']:
             if not hasattr(self, 'run_card'):
-                run_card = banner_mod.RunCard(opt['run_card'])
+                run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat'))
             else:
                 run_card = self.run_card
             self.run_card = run_card
@@ -3675,7 +3675,7 @@ def do_refine(self, line):
         devnull.close()

     ############################################################################
-    def do_combine_iteration(self, line):
+    def do_comine_iteration(self, line):
         """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step
            S is for survey, R is for refine
@@ -3703,8 +3703,9 @@
     ############################################################################
     def do_combine_events(self, line):
         """Advanced commands: Launch combine events"""
-
+        start = time.time()
         args = self.split_arg(line)
         # Check argument's validity
         self.check_combine_events(args)

         self.update_status('Combining Events', level='parton')
@@ -3795,8 +3796,9 @@

         if self.run_card['bias_module'].lower() not in ['dummy', 'none'] and nb_event:
             self.correct_bias()
-
-
+        elif self.run_card['custom_fcts']:
+            self.correct_bias()
+        logger.info("combination of events done in %s s ", time.time()-start)

         self.to_store.append('event')
@@ -7364,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done):
     import optparse
     # Get the directory of the script real path (bin)
     # and add it to the current PYTHONPATH
-    root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))
+    #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))))
     sys.path.insert(0, root_path)

     class MyOptParser(optparse.OptionParser):
@@ -7407,7 +7409,13 @@ def error(self, msg=''):
     import logging.config
     # Set logging level according to the logging level given by options
     #logging.basicConfig(level=vars(logging)[options.logging])
+    import internal
     import internal.coloring_logging
+    # internal.__file__ = XXX/bin/internal/__init__.py
+    # => need three dirnames to get XXX
+    # we use internal to avoid any issue with pythonpath finding the wrong file
+    me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__)))
+    print("me_dir is", me_dir)
     try:
         if __debug__ and options.logging == 'INFO':
             options.logging = 'DEBUG'
         if options.logging.isdigit():
             level = int(options.logging)
         else:
             level = eval('logging.' + options.logging)
-        logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf'))
+        log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf')
+        logging.config.fileConfig(log_path)
         logging.root.setLevel(level)
         logging.getLogger('madgraph').setLevel(level)
     except:
@@ -7429,9 +7438,9 @@
     if '--web' in args:
         i = args.index('--web')
         args.pop(i)
-        cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True)
+        cmd_line = MadEventCmd(me_dir, force_run=True)
     else:
-        cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True)
+        cmd_line = MadEventCmdShell(me_dir, force_run=True)

     if not hasattr(cmd_line, 'do_%s' % args[0]):
         if parser_error:
             print(parser_error)
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/misc.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/misc.py
index d3fed3baa2..91cd3e5c22 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/misc.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/misc.py
@@ -347,7 +347,7 @@ def tell(msg):
     if dependency=='ninja':
         if cmd.options['ninja'] in ['None',None,''] or\
            (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\
-           which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None):
+           which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None):
             tell("Installing ninja...")
             cmd.do_install('ninja')

diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/shower_card.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/shower_card.py
index c6d3948cc4..c344ea1b15 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/shower_card.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/shower_card.py
@@ -45,7 +45,9 @@ class ShowerCard(dict):
     false = ['.false.', 'f', 'false', '0']
     logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable',
                     'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable',
-                    'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td']
+                    'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td',
+                    'space_shower_me_corrections', 'time_shower_me_corrections',
+                    'time_shower_me_extended', 'time_shower_me_after_first']
     string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse']
     for i in range(1,100):
         string_vars.append('dm_'+str(i))
@@ -82,7 +84,11 @@ class ShowerCard(dict):
         'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'},
         'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'},
         'qcut' : {'PYTHIA8':'qcut'},
-        'njmax' : {'PYTHIA8':'njmax'}}
+        'njmax' : {'PYTHIA8':'njmax'},
+        'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'},
+        'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'},
+        'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'},
+        'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}}

     stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'}

diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model.pkl b/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model.pkl
deleted file mode 100644
index f71ba45bbc6d4acc8d32bb06662fe900a694009f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 42822
[... 42822 bytes of base85-encoded binary patch data for the deleted py3_model.pkl omitted; the payload is unreadable and was truncated in this copy ...]
z_-jT!*i1geKt5EF57Xp}+8|$>VIOWLA7LObRpe!we0dw>YcuQ#GkLjzyh4$W)Z}Lx z$-SQe)@;~EnaL{+$%tjKrKGGv8zb-yVu@ zip%Gn11rj;im{4$2PIWq2eifQ9#TqVPjoVzFQyVJa=w@bAMP*1@_f-Jsy)zJL&^40)5nZ$3bA*pigigH`6|yg56&cTn z-W^EqhD!7Vq3`67dk#7pxc4JognNJZ9GMFn<{l{rFjjH5b1R2jH3&3t9mxI>t`vN@ zvklAbx0Ap9hH?9O7xdn)(S}UDDQnHFbq3b7Vx6y9`KL)-UBT`cIrh0Hcs!6+JkouF zi#NyW&4iYLut5=KG$FqR>k|6AW1D=Tn{fx3`5Fy;O^UDC<@4t1$Pv=QShWdv^rw%% zHx^v9c@OaF#IRZYD_3e?03gFAwuv9v!~&lqS=i8Y8d)ki#;RQQiRaSxOvZ!9dNEND z6KtW-h_!;kkmW)$N3vW5A0C2+Uj0#3*ZBkoePHm|nwS!e^ zhq$Tv8#5seW7Poe8%wg?T_xDI#xQ>h>Jcdy`l=P8NJPbZOAPpPd*5z^pV^yk` z`P;g!m6$h*Yr1gr?pKEvkRC)E6k4DsKG=oPR$v>Q2 z46Pi)Hq!923|LiSZA1rI{(ly_+$?m3LFh^)^cYvDKNCmFv5Zxlcl)+BFP`fQ6z~Xg z9Qqi>_jp1@#`gsH@F(|R!?S9XoWxi~JHXokB7+xWJsDjLtfvqu!g?xv_?`-En6*+) zXRKo7lat;#%6-q&y)L)+hi#J97sKbsC9t6}9fiLi$XG=m zoR=b-vf#F!yIl>rTm~RRE|(KOlFJqFIdUZ|a>=ac@3o`bm`jEP{Oc?KP$E~E6S&%t zz%?p?YuyC=ZIO`c7^?j7`ud%GhJbB2j|AD9fu-p>Rg$vyy|BM-uclN}`wF;*ozXHDkC|2@}- zL1f7F5%NcJeH1=N9)k_%S}Bh+b}a{1U)ZhXz&-&cL$XgYfk?7X!G}Lu2^&s!v^>Mu zwHw&~TdvQ7$dK!ET0*|kQyQ`v)R z@Aw>?9^z0z3MKLq)EEZjWr~ap$Sd$U@+vF_Bvb7luJ6V$v~!W=@vq&*me(t|WC;|7 zF7pd^{eoc+!sAfk0eH)W8nHgaeb({w(h)I zuG$FKM`o^%4P2inu1{SqfA}h76=T)#ZLDX~lG^a#$!7>KjNRw#7a6-R;KQpEu;HC{ zq{-;gqZeP;msUV;4(0jrcB zJ+RrKiR8Xd=x^Mg5M!YEnO!3}{{o*Qzru1N@eekwW+J5{8}~Oe*Y5_dKNQ!WE|<4) zM~{%d7#kV*Cixp)gNT0^91-y^e2#R$t5czYA6+UP;c*9^uZkJKiWm?9>ja-8og-jn z(!~P{4}4R25|+q1h%wNt%dQccuJGY=&9EGJ{t>Fx417~$;JcZ*);Dl%pt!nguA!^n zgL4tC4b5CV3|u`G*G4XvzmY3sW5yQl!MX6@NiPH##%>e#i;Uf-@Zp^a*zooqDZLr1 zIQT@W`Yl(zM%M>^1N7z$k3eq$AKsII4MUHTtr)BJVZM`R0NdIC)>naT69KD~ejeCt z^+bfabHzl%<542pBFaG2pPeIFZwH?v17JCs_{YNBVF=&O53hYm7TLtxo7r|Sunkmf z#hR^`&o%~w-sWAu;B7E|H_H90K=e1+5_1lN3^@!|ISkP`Oj`RK@UE@OHJZava}L7{ zISf}hjL&TbJ)d@!>%fau{wuQYnubVQfHo%a|*aOduN_XvsPIt-m8d%;_9tY!b`QFgil*xLojTab!P+>PO#a%&0Dv{ozn2@a%$y=B&&`b*>!1s44&AKV_;qrqwGr*_^c2=34p6 z3<5Iz9x7gp(f9|#g+GzSo<9D}Qklm{7qymcA=FOk3~|(#A*L56hTy{ESxeAIfn zv}C3hm+?c&jR*+RXhO}AW_HpN0azw2aA?2&{D!4Q2=|S zf;~#Zei*>!W3Z#zgTEL!rk0~HMR*@d3 zNyqSl-`&xUUu)9xc9fn_KzgDgJ;^0iJJrl;>{KViq1jJiTUDoEx70g$DR*$p|Kd{a z;HM%gn5w6thHnC~C#LGU!A3hng8@$Oq5MSg)O1c7Y5|1j3(qV7I!gha?E+cX>Xgp$ zQM|c%Fi?1?!Bh%?sq}atia9xd182fLQk6kS%*muYCj)%RVW*s1ko9>g>+^NiN9n9D zP+6Vd8}|yucBRXf*(BJwR~ZPWwnI3$UmJvKU-jF9@lP@YhT5Fmp%T1PWwV*pwW_z( zx2lgdV9FP`(y&%lZE6jw+Q1q<Km#=p8zRb-8s z@{fwd;kK%?HFnBhto3GX9c#jrKUm{m*{SMiRjJDk_4gU8I$3*7`I~KvFhEsZtm-L$ zvbLPH&8^v0>svKbex`?i)TXMtRX62F)~;r)ht)9Ud)D|jpG{fElWQ3v_o%je4Q-`zpK80GZG4Tnz9m=Fl*+V7syUUF2jHHj4x}xD zZwl8~4Iq_TYl%Flx;+G2Y~2W{$5s`&iD50CP0PY`T}`cpm#5S6Fk>QBlB7JU5I3A=Z8go>dRnXEu5D>u(AdyiU!#)9 z%43Lev%+8PNHu30TAJM^c^obHoRh-1v8onX)wZfn124p%Dq&a-??pd}=Ghr&9bT8p zrG}^Q$3jxsY{T$L(`S^8C`$~VI;kQxb^P#pE46TFQ+-9MskXVXWq1=ls@#|zo^9H0 zyMqU6xPd$N8<-eYF|4$vu?1fktIZ7@(r+L@(`+Et&=HmR|3F8cA}4O{bf{0YVoaV! zb*?X@GM$as%( a, 0 ) * b; } + inline __host__ __device__ constexpr cxsmpl + operator*( const cxsmpl& a, const double& b ) + { + return a * cxsmpl( b, 0 ); + } + template inline __host__ __device__ constexpr cxsmpl operator/( const cxsmpl& a, const cxsmpl& b ) diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index bc2b0f8499..26e1484575 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. 
* * WARNING: DO NOT USE FOR PRODUCTION * @@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu.mg +import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0053980350494384766  +DEBUG: model prefixing takes 0.005408763885498047  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -156,53 +156,32 @@ INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams 1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams -output standalone_cudacpp CODEGEN_cudacpp_ee_mumu -Load PLUGIN.CUDACPP_SA_OUTPUT -Output will be done with PLUGIN: CUDACPP_SA_OUTPUT +output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu +Load PLUGIN.CUDACPP_OUTPUT +Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 186]  -DEBUG: type(subproc_group)= [output.py at line 187]  -DEBUG: type(fortran_model)= [output.py at line 188]  -DEBUG: type(me)= me=0 [output.py at line 189]  -DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  -DEBUG: proc_id =  0 [model_handling.py at line 1045]  -INFO: Creating files in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum -DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  -DEBUG: self.include_multi_channel is not yet defined: this is standalone_cudacpp mode [model_handling.py at line 1301]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h -DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]  -FileWriter for 
/data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc -DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]  -DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]  -DEBUG: self.include_multi_channel =  False [model_handling.py at line 1144]  -DEBUG: self.support_multichannel =  True [model_handling.py at line 1145]  -DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1162]  -DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1163]  -DEBUG: multi_channel_map =  None [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1711]  -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. -DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_processidfile [model_handling.py at line 1389]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_epem_mupmum.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  +INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
+Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.265 s +ALOHA: aloha creates 4 routines in 0.266 s FFV1 FFV1 FFV2 @@ -211,20 +190,17 @@ ALOHA: aloha creates 4 routines in 0.265 s FFV4 FFV2_4 FFV2_4 -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. -DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  +INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. quit -real 0m0.698s -user 0m0.628s -sys 0m0.058s +real 0m0.649s +user 0m0.586s +sys 0m0.056s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h index f37c972b24..89437b4c42 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -244,14 +245,21 @@ namespace mg5amcCpu } std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL - process.initProc( "../../Cards/param_card.dat" ); + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + CPPProcess process( /*verbose=*/false ); + std::string paramCard = "../../Cards/param_card.dat"; + if( !std::filesystem::exists( paramCard ) ) + { + paramCard = "../" + paramCard; + } + process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h index 9c467b1e04..6a7d9c05c0 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h @@ -39,6 +39,8 @@ #elif defined __HIPCC__ +#include "hip/hip_runtime.h" + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h index 176338151a..a64c05c26a 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -215,19 +216,16 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) #endif constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) // Dump events to a new reference file? - constexpr bool dumpEvents = false; - std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; - while( dumpFileName.find( '/' ) != std::string::npos ) - { - dumpFileName.replace( dumpFileName.find( '/' ), 1, "_" ); - } + const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); + const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); + const std::string refFileName = testDriver->getRefFileName(); + const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); std::ofstream dumpFile; if( dumpEvents ) { dumpFile.open( dumpFileName, std::ios::trunc ); } // Read reference data - const std::string refFileName = testDriver->getRefFileName(); std::map referenceData; if( !dumpEvents ) { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..81699dfea9 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc @@ -112,10 +112,17 @@ namespace mg5amcCpu // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu bool ok = true; // this is just an assumption! 
const std::string tag = "arm neon (128bit as in SSE4.2)"; -#else +#elif defined( __x86_64__ ) || defined( __i386__ ) bool known = true; bool ok = __builtin_cpu_supports( "sse4.2" ); const std::string tag = "nehalem (SSE4.2)"; +#else // AV FIXME! Added by OM for Mac, should identify the correct __xxx__ flag that should be targeted + bool known = false; // __builtin_cpu_supports is not supported + // See https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions-Available-on-all-Configurations.html + // See https://stackoverflow.com/q/62783908 + // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu + bool ok = true; // this is just an assumption! + const std::string tag = "arm neon (128bit as in SSE4.2)"; #endif #else bool known = true; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index ef66c43396..13429436af 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -239,25 +239,18 @@ namespace mg5amcCpu // *** DIAGRAM 1 OF 2 *** // Wavefunction(s) for diagram number 1 -#if not( defined MGONGPUCPP_GPUIMPL and defined MGONGPU_TEST_DIVERGENCE ) - opzxxx( momenta, cHel[ihel][0], -1, w_fp[0], 0 ); // NB: opzxxx only uses pz -#else - if( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 == 0 ) - opzxxx( momenta, cHel[ihel][0], -1, w_fp[0], 0 ); // NB: opzxxx only uses pz - else - oxxxxx( momenta, 0, cHel[ihel][0], -1, w_fp[0], 0 ); -#endif + oxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - imzxxx( momenta, cHel[ihel][1], +1, w_fp[1], 1 ); // NB: imzxxx only uses pz + ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); - ixzxxx( momenta, cHel[ihel][2], -1, w_fp[2], 2 ); + ixxxxx( momenta, 0., cHel[ihel][2], -1, w_fp[2], 2 ); - oxzxxx( momenta, cHel[ihel][3], +1, w_fp[3], 3 ); + oxxxxx( momenta, 0., cHel[ihel][3], +1, w_fp[3], 3 ); - FFV1P0_3( w_fp[1], w_fp[0], COUPs[0], 0., 0., w_fp[4] ); + FFV1P0_3( w_fp[1], w_fp[0], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[4] ); // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[2], w_fp[3], w_fp[4], COUPs[0], &_fp[0] ); + FFV1_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -266,10 +259,10 @@ namespace mg5amcCpu // *** DIAGRAM 2 OF 2 *** // Wavefunction(s) for diagram number 2 - FFV2_4_3( w_fp[1], w_fp[0], COUPs[1], COUPs[2], cIPD[0], cIPD[1], w_fp[4] ); + FFV2_4_3( w_fp[1], w_fp[0], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); // Amplitude(s) for diagram number 2 - FFV2_4_0( w_fp[2], w_fp[3], w_fp[4], COUPs[1], COUPs[2], &_fp[0] ); + FFV2_4_0( w_fp[2], 
w_fp[3], w_fp[4], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -636,12 +629,12 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - fptype allMEsLast = 0; + { /* clang-format on */ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid - allMEs[ievt] = 0; for( int ihel = 0; ihel < ncomb; ihel++ ) { + // NEW IMPLEMENTATION OF GETGOODHEL (#630): RESET THE RUNNING SUM OVER HELICITIES TO 0 BEFORE ADDING A NEW HELICITY + allMEs[ievt] = 0; // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -650,12 +643,11 @@ namespace mg5amcCpu #else calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); #endif - if( allMEs[ievt] != allMEsLast ) + if( allMEs[ievt] != 0 ) // NEW IMPLEMENTATION OF GETGOODHEL (#630): COMPARE EACH HELICITY CONTRIBUTION TO 0 { //if ( !isGoodHel[ihel] ) std::cout << "sigmaKin_getGoodHel ihel=" << ihel << " TRUE" << std::endl; isGoodHel[ihel] = true; } - allMEsLast = allMEs[ievt]; // running sum up to helicity ihel for event ievt } } #else @@ -674,19 +666,11 @@ namespace mg5amcCpu //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] // Allocate arrays at build time to contain at least 16 events (or at least neppV events if neppV>16, e.g. 
in future VPUs) constexpr int maxtry0 = std::max( 16, neppV ); // 16, but at least neppV (otherwise the npagV loop does not even start) - fptype allMEsLast[maxtry0] = { 0 }; // allocated at build time: maxtry0 must be a constexpr // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt1 was last observed for "mirror processes" in uux_ttx in the 270 branch (see issue #343 and PRs #360 and #396) + // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; - static_assert( nprocesses == 1, "Assume nprocesses == 1" ); - // process_id corresponds to the index of DSIG1 Fortran functions (must be 1 because cudacpp is unable to handle DSIG2) + static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } @@ -880,23 +869,26 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 - fptype targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + const unsigned int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + fptype targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } } #endif @@ -991,57 +983,60 @@ namespace mg5amcCpu #endif } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 // Event-by-event random choice of color #402 - fptype_sv targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } + const unsigned int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + fptype_sv targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = fptype_sv{ 0 }; + else + targetamp[icolC] = 
targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + fptype_sv targetamp2[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp2[icolC] = fptype_sv{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + } +#endif + for( int ieppV = 0; ieppV < neppV; ++ieppV ) + { + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { #if defined MGONGPU_CPPSIMD - const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); #endif - if( okcol ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( okcol ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + const int ievt2 = ievt00 + ieppV + neppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + { + allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #endif + } } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h index 5b8fdd4347..0b29ffb3ff 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc index 1bad694d1c..7cac5ab47b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc @@ -29,7 +29,9 @@ #include #include +#include <cfenv> // for feenableexcept #include +#include <csignal> // for signal and SIGFPE #include #include #include @@ -74,6 +76,23 @@ usage( char* argv0, int ret = 1 ) return ret; } +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + inline void FPEhandler( int sig ) + { +#ifdef __CUDACC__ + std::cerr << "Floating Point Exception (GPU)" << std::endl; +#else + std::cerr << "Floating Point Exception (CPU)" << std::endl; +#endif + exit( 0 ); + } +} + int main( int argc, char** argv ) { @@ -84,6 +103,18 @@ main( int argc, char** argv ) using namespace mg5amcCpu; #endif + // Enable FPEs (test #701 and #733 - except on MacOS where feenableexcept is not defined #730) +#ifndef __APPLE__ + const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" ); + const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" ); + if( enableFPE ) + { + std::cout << "WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions" << std::endl; + feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 + signal( SIGFPE, FPEhandler ); + } +#endif + // DEFAULTS FOR COMMAND LINE ARGUMENTS bool verbose = false; bool debug = false; @@ -103,12 +134,14 @@ main( int argc, char** argv ) CurandHost = 1, CurandDevice = 2 }; -#ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU -#elif not defined MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#ifdef MGONGPU_HAS_NO_CURAND + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random +#elif defined __CUDACC__ + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
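//--------------------------------------------------------------------------
// [Illustrative sketch, not from the patch] The CUDACPP_RUNTIME_ENABLEFPE
// opt-in added a few hunks above condenses to this standalone program
// (glibc only: feenableexcept does not exist on MacOS, see #730; fpeHandler
// is an invented analogue of the FPEhandler above):
#include <cfenv>   // for feenableexcept (glibc extension)
#include <csignal> // for signal and SIGFPE
#include <cstdlib> // for getenv and exit
#include <iostream>
#include <string>
static void fpeHandler( int ) { std::cerr << "Floating Point Exception" << std::endl; std::exit( 0 ); }
int main()
{
  const char* envc = std::getenv( "CUDACPP_RUNTIME_ENABLEFPE" );
  if( envc != nullptr && std::string( envc ) != "" ) // opt-in: set and non-empty, as in the patch
  {
    feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW );
    std::signal( SIGFPE, fpeHandler );
  }
  volatile double zero = 0.;
  volatile double bad = 1. / zero; // raises SIGFPE here only when traps are enabled
  std::cout << "no trap enabled: 1/0 = " << bad << std::endl;
  return 0;
}
//--------------------------------------------------------------------------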
enum class RamboSamplingMode @@ -146,18 +179,20 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef __CUDACC__ - rndgen = RandomNumberMode::CurandDevice; +#ifndef __CUDACC__ + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); +#elif defined MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + rndgen = RandomNumberMode::CurandDevice; #endif } else if( arg == "--curhst" ) { -#ifndef MGONGPU_HAS_NO_CURAND - rndgen = RandomNumberMode::CurandHost; -#else +#ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); +#else + rndgen = RandomNumberMode::CurandHost; #endif } else if( arg == "--common" ) @@ -278,10 +313,10 @@ main( int argc, char** argv ) const std::string procKey = "0a ProcInit"; timermap.start( procKey ); - // Create a process object + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? (for instance, in bridge mode this will be called twice here?) CPPProcess process( verbose ); - - // Read param_card and set parameters process.initProc( "../../Cards/param_card.dat" ); const fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) //const fptype energy = 91.2; // Ecms = 91.2 GeV (Z peak) @@ -389,30 +424,26 @@ main( int argc, char** argv ) { prnk.reset( new CommonRandomNumberKernel( hstRndmom ) ); } -#ifndef MGONGPU_HAS_NO_CURAND else if( rndgen == RandomNumberMode::CurandHost ) { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! CurandHost is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#else const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif } -#ifdef __CUDACC__ else { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __CUDACC__ const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); - } #else - else - { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) - } + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif -#else - else - { - throw std::logic_error( "This application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) } -#endif // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -747,7 +778,7 @@ main( int argc, char** argv ) wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? 
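//--------------------------------------------------------------------------
// [Illustrative sketch, not from the patch] The prnk selection restructured
// above keeps a single if/else chain and moves the compile-time guards inside
// each branch, so every unsupported combination fails with an explicit
// internal error. IRandKernel, CommonRandKernel and makeRandKernel are
// invented names; check_sa.cc inlines this chain with the real kernels:
#include <memory>
#include <stdexcept>
enum class Mode { Common, CurandHost, CurandDevice };
struct IRandKernel { virtual ~IRandKernel() = default; };
struct CommonRandKernel : IRandKernel {};
std::unique_ptr<IRandKernel> makeRandKernel( Mode m )
{
  if( m == Mode::Common ) return std::make_unique<CommonRandKernel>();
  if( m == Mode::CurandHost )
  {
#ifdef MGONGPU_HAS_NO_CURAND
    throw std::runtime_error( "INTERNAL ERROR! CurandHost without curand" ); // no path here
#else
    return std::make_unique<CommonRandKernel>(); // really: a curand host kernel
#endif
  }
#ifdef MGONGPU_HAS_NO_CURAND
  throw std::runtime_error( "INTERNAL ERROR! CurandDevice without curand" ); // no path here
#elif defined __CUDACC__
  return std::make_unique<CommonRandKernel>(); // really: a curand device kernel
#else
  throw std::logic_error( "INTERNAL ERROR! CurandDevice needs an NVidia GPU build" ); // no path here
#endif
}
//--------------------------------------------------------------------------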
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -757,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 59a2c906eb..f2cfa349da 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -4,10 +4,13 @@ # Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) -#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories +#=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts +#=== NB: use 'override' to ensure that the value can not be modified from the outside +override CUDACPP_MAKEFILE := $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) +###$(info CUDACPP_MAKEFILE='$(CUDACPP_MAKEFILE)') -CUDACPP_MAKEFILE = $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) -CUDACPP_SRC_MAKEFILE = cudacpp_src.mk +#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories +override CUDACPP_SRC_MAKEFILE = cudacpp_src.mk #------------------------------------------------------------------------------- @@ -29,7 +32,17 @@ UNAME_P := $(shell uname -p) #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Include the common MG5aMC Makefile options + +# OM: this is crucial for MG5aMC flag consistency/documentation +# AV: temporarely comment this out because it breaks cudacpp builds +ifneq ($(wildcard ../../Source/make_opts),) +include ../../Source/make_opts +endif + +#------------------------------------------------------------------------------- + +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here @@ -101,68 +114,85 @@ endif # Note: AR, CXX and FC are implicitly defined if not set externally # See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html -#------------------------------------------------------------------------------- - -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +CXXFLAGS += -mmacosx-version-min=11.3 +endif - # If CUDA_HOME is not set, try to set it from the location of nvcc - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif +#------------------------------------------------------------------------------- - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
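//--------------------------------------------------------------------------
// [Illustrative C++ sketch, not from the patch] For reference while in the
// makefiles: the event-by-event colour choice reworked earlier in this patch
// (CPPProcess.cc, #402, FPE fix #783) builds a cumulative sum of the allowed
// |jamp|^2 and picks the first colour whose normalised cumulative weight
// exceeds the random number; the channelId != 0 guard keeps the total
// non-zero. pickColor is an invented standalone name:
#include <cassert>
constexpr int ncolor = 3;
int pickColor( const double jamp2[ncolor], const bool icolamp[ncolor], double rndcol )
{
  double targetamp[ncolor];
  for( int icolC = 0; icolC < ncolor; icolC++ )
  {
    targetamp[icolC] = ( icolC == 0 ? 0. : targetamp[icolC - 1] );
    if( icolamp[icolC] ) targetamp[icolC] += jamp2[icolC];
  }
  assert( targetamp[ncolor - 1] > 0. ); // guaranteed by the channelId != 0 guard
  for( int icolC = 0; icolC < ncolor; icolC++ )
    if( rndcol < targetamp[icolC] / targetamp[ncolor - 1] ) return icolC + 1; // Fortran convention [1,ncolor]
  return ncolor; // unreachable for rndcol in [0,1)
}
//--------------------------------------------------------------------------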
- CUOPTFLAGS = -lineinfo - GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - CUDATESTFLAGS = -lcuda - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + override HIP_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the path to nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). 
+ # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + ifeq ($(RNDGEN),hasNoCurand) + CURANDLIBFLAGS= else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! endif + CUOPTFLAGS = -lineinfo + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -173,71 +203,55 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) GPUFLAGS += -allow-unsupported-compiler endif -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler +else ifneq ($(origin REQUIRE_CUDA),undefined) - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning HIP builds are not supported for multi-word CXX "$(CXX)") - override HIP_HOME=disabled - endif + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif +#--- Option 2: CUDA does not exist, HIP exists -> use HIP - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + +else + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) export GPUCC export GPUFLAGS #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -254,7 +268,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -270,7 +284,7 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) GPUFLAGS+= -Xcompiler -mno-float128 endif @@ -285,12 +299,14 @@ override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) -else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) -override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) # AV for Mac (Apple clang compiler) +else ifeq ($(UNAME_S),Darwin) # OM for Mac (any compiler) +override OMPFLAGS = # AV disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = -fopenmp # OM reenable OpenMP MT on Apple clang? 
(AV Oct 2023: this still fails in the CI) else -override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT (default before #575) +override OMPFLAGS = -fopenmp # enable OpenMP MT by default on all other platforms +###override OMPFLAGS = # disable OpenMP MT on all other platforms (default before #575) endif # Set the default AVX (vectorization) choice @@ -356,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -573,8 +589,9 @@ $(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) # Apply special build flags only to check_sa and CurandRandomNumberKernel (curand headers, #679) $(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gcheck_sa.o: CUFLAGS += $(CXXFLAGSCURAND) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gCurandRandomNumberKernel.o: CUFLAGS += $(CXXFLAGSCURAND) ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif @@ -772,12 +789,18 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif +# Use target gtestlibs to build only googletest +ifneq ($(GTESTLIBS),) +gtestlibs: $(GTESTLIBS) +endif + # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 $(GTESTLIBS): ifneq ($(shell which flock 2>/dev/null),) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi flock $(BUILDDIR)/.make_test.lock $(MAKE) -C $(TESTDIR) else - $(MAKE) -C $(TESTDIR) + if [ -d $(TESTDIR) ]; then $(MAKE) -C $(TESTDIR); fi endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc index 2b956730d4..22ce3f5115 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc @@ -49,11 +49,7 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL GpuRuntime::setUp(); #endif - // Create a process object, read parm card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
- CPPProcess process( /*verbose=*/false ); - process.initProc( "../../Cards/param_card.dat" ); + // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran *ppbridge = new Bridge( *pnevtF, *pnparF, *pnp4F ); } diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc index 0ed26180ca..de327f2321 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc @@ -71,6 +71,8 @@ struct CPUTest : public CUDA_CPU_TestBase , hstSelCol( nevt ) , hstIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? process.initProc( "../../Cards/param_card.dat" ); } @@ -183,6 +185,8 @@ struct CUDATest : public CUDA_CPU_TestBase , devSelCol( nevt ) , devIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? process.initProc( "../../Cards/param_card.dat" ); } diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc index 016bc0f472..e5167de00c 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc @@ -59,7 +59,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) using namespace mg5amcCpu; #endif #ifndef __APPLE__ // test #701 (except on MacOS where feenableexcept is not defined #730) - const bool enableFPE = !getenv( "CUDACPP_RUNTIME_DISABLEFPE" ); + const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" ); + const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" ); if( enableFPE ) { feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 diff --git a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h index 4eab78944d..e878fcd28e 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -863,6 +863,7 @@ namespace mg5amcCpu const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) ALWAYS_INLINE; //-------------------------------------------------------------------------- @@ -873,6 +874,7 @@ namespace mg5amcCpu FFV1P0_3( const fptype allF1[], const fptype allF2[], const fptype allCOUP[], + const double Ccoeff, const fptype M3, const fptype W3, fptype allV3[] ) ALWAYS_INLINE; @@ -886,6 +888,7 @@ namespace mg5amcCpu const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) ALWAYS_INLINE; //-------------------------------------------------------------------------- @@ -896,6 +899,7 @@ namespace mg5amcCpu FFV2_3( const fptype allF1[], const fptype allF2[], const fptype allCOUP[], + const double Ccoeff, const fptype M3, const fptype W3, fptype allV3[] ) ALWAYS_INLINE; @@ -909,6 +913,7 @@ namespace mg5amcCpu const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) ALWAYS_INLINE; //-------------------------------------------------------------------------- @@ -919,6 +924,7 @@ namespace mg5amcCpu FFV4_3( const fptype allF1[], const fptype allF2[], const fptype allCOUP[], + const double Ccoeff, const fptype M3, const fptype W3, fptype allV3[] ) ALWAYS_INLINE; @@ -932,7 +938,9 @@ namespace mg5amcCpu const fptype allF2[], const fptype allV3[], const fptype allCOUP1[], + const double Ccoeff1, const fptype allCOUP2[], + const double Ccoeff2, fptype allvertexes[] ) ALWAYS_INLINE; //-------------------------------------------------------------------------- @@ -943,7 +951,9 @@ namespace mg5amcCpu FFV2_4_3( const fptype allF1[], const fptype allF2[], const fptype allCOUP1[], + const double Ccoeff1, const fptype allCOUP2[], + const double Ccoeff2, const fptype M3, const fptype W3, fptype allV3[] ) ALWAYS_INLINE; @@ -957,6 +967,7 @@ namespace mg5amcCpu const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) { mgDebug( 0, __FUNCTION__ ); @@ -980,6 +991,7 @@ namespace mg5amcCpu FFV1P0_3( const fptype allF1[], const fptype allF2[], const fptype allCOUP[], + const double Ccoeff, const fptype M3, const fptype W3, fptype allV3[] ) @@ -1011,6 +1023,7 @@ namespace mg5amcCpu const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) { mgDebug( 0, __FUNCTION__ ); @@ -1034,6 +1047,7 @@ namespace mg5amcCpu FFV2_3( const fptype allF1[], const fptype allF2[], const fptype allCOUP[], + const double Ccoeff, const fptype M3, const fptype W3, fptype allV3[] ) @@ -1067,6 +1081,7 @@ namespace mg5amcCpu const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) { mgDebug( 0, __FUNCTION__ ); @@ -1093,6 +1108,7 @@ namespace mg5amcCpu FFV4_3( const fptype allF1[], const fptype allF2[], const fptype allCOUP[], + const double Ccoeff, const fptype M3, const fptype W3, fptype allV3[] ) @@ -1129,7 +1145,9 @@ namespace mg5amcCpu const fptype allF2[], const fptype allV3[], const fptype allCOUP1[], + const double Ccoeff1, const fptype allCOUP2[], + const double Ccoeff2, fptype allvertexes[] ) { mgDebug( 0, __FUNCTION__ ); @@ -1144,7 +1162,7 @@ namespace mg5amcCpu constexpr fptype two( 2. 
); const cxtype_sv TMP1 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) ); const cxtype_sv TMP3 = ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ); - ( *vertex ) = ( -one ) * ( COUP2 * ( +cI * TMP1 + ( two * cI ) * TMP3 ) + cI * ( TMP1 * COUP1 ) ); + ( *vertex ) = ( -one ) * ( Ccoeff2 * COUP2 * ( +cI * TMP1 + ( two * cI ) * TMP3 ) + cI * ( TMP1 * Ccoeff1 * COUP1 ) ); mgDebug( 1, __FUNCTION__ ); return; } @@ -1157,7 +1175,9 @@ namespace mg5amcCpu FFV2_4_3( const fptype allF1[], const fptype allF2[], const fptype allCOUP1[], + const double Ccoeff1, const fptype allCOUP2[], + const double Ccoeff2, const fptype M3, const fptype W3, fptype allV3[] ) @@ -1179,10 +1199,10 @@ namespace mg5amcCpu const cxtype_sv TMP2 = ( F1[2] * ( F2[4] * ( P3[0] + P3[3] ) + F2[5] * ( P3[1] + cI * P3[2] ) ) + F1[3] * ( F2[4] * ( P3[1] - cI * P3[2] ) + F2[5] * ( P3[0] - P3[3] ) ) ); const cxtype_sv TMP4 = ( F1[4] * ( F2[2] * ( P3[0] - P3[3] ) - F2[3] * ( P3[1] + cI * P3[2] ) ) + F1[5] * ( F2[2] * ( -P3[1] + cI * P3[2] ) + F2[3] * ( P3[0] + P3[3] ) ) ); const cxtype_sv denom = one / ( ( P3[0] * P3[0] ) - ( P3[1] * P3[1] ) - ( P3[2] * P3[2] ) - ( P3[3] * P3[3] ) - M3 * ( M3 - cI * W3 ) ); - V3[2] = denom * ( -two * cI ) * ( COUP2 * ( OM3 * -half * P3[0] * ( TMP2 + two * TMP4 ) + ( +half * ( F1[2] * F2[4] + F1[3] * F2[5] ) + F1[4] * F2[2] + F1[5] * F2[3] ) ) + half * ( COUP1 * ( F1[2] * F2[4] + F1[3] * F2[5] - P3[0] * OM3 * TMP2 ) ) ); - V3[3] = denom * ( -two * cI ) * ( COUP2 * ( OM3 * -half * P3[1] * ( TMP2 + two * TMP4 ) + ( -half * ( F1[2] * F2[5] + F1[3] * F2[4] ) + F1[4] * F2[3] + F1[5] * F2[2] ) ) - half * ( COUP1 * ( F1[2] * F2[5] + F1[3] * F2[4] + P3[1] * OM3 * TMP2 ) ) ); - V3[4] = denom * cI * ( COUP2 * ( OM3 * P3[2] * ( TMP2 + two * TMP4 ) + ( +cI * ( F1[2] * F2[5] ) - cI * ( F1[3] * F2[4] ) + ( -two * cI ) * ( F1[4] * F2[3] ) + ( two * cI ) * ( F1[5] * F2[2] ) ) ) + COUP1 * ( +cI * ( F1[2] * F2[5] ) - cI * ( F1[3] * F2[4] ) + P3[2] * OM3 * TMP2 ) ); - V3[5] = denom * ( two * cI ) * ( COUP2 * ( OM3 * half * P3[3] * ( TMP2 + two * TMP4 ) + ( +half * ( F1[2] * F2[4] ) - half * ( F1[3] * F2[5] ) - F1[4] * F2[2] + F1[5] * F2[3] ) ) + half * ( COUP1 * ( F1[2] * F2[4] + P3[3] * OM3 * TMP2 - F1[3] * F2[5] ) ) ); + V3[2] = denom * ( -two * cI ) * ( Ccoeff2 * COUP2 * ( OM3 * -half * P3[0] * ( TMP2 + two * TMP4 ) + ( +half * ( F1[2] * F2[4] + F1[3] * F2[5] ) + F1[4] * F2[2] + F1[5] * F2[3] ) ) + half * ( Ccoeff1 * COUP1 * ( F1[2] * F2[4] + F1[3] * F2[5] - P3[0] * OM3 * TMP2 ) ) ); + V3[3] = denom * ( -two * cI ) * ( Ccoeff2 * COUP2 * ( OM3 * -half * P3[1] * ( TMP2 + two * TMP4 ) + ( -half * ( F1[2] * F2[5] + F1[3] * F2[4] ) + F1[4] * F2[3] + F1[5] * F2[2] ) ) - half * ( Ccoeff1 * COUP1 * ( F1[2] * F2[5] + F1[3] * F2[4] + P3[1] * OM3 * TMP2 ) ) ); + V3[4] = denom * cI * ( Ccoeff2 * COUP2 * ( OM3 * P3[2] * ( TMP2 + two * TMP4 ) + ( +cI * ( F1[2] * F2[5] ) - cI * ( F1[3] * F2[4] ) + ( -two * cI ) * ( F1[4] * F2[3] ) + ( two * cI ) * ( F1[5] * F2[2] ) ) ) + Ccoeff1 * COUP1 * ( +cI * ( F1[2] * F2[5] ) - cI * ( F1[3] * F2[4] ) + P3[2] * OM3 * TMP2 ) ); + V3[5] = denom * ( two * cI ) * ( Ccoeff2 * COUP2 * ( OM3 * half * P3[3] * ( TMP2 + two * TMP4 ) + ( +half * ( F1[2] * F2[4] ) - half * ( F1[3] * F2[5] ) - F1[4] * F2[2] + F1[5] * F2[3] ) ) + half * ( Ccoeff1 * COUP1 * ( F1[2] * F2[4] + P3[3] * OM3 * TMP2 - F1[3] * F2[5] ) ) ); 
mgDebug( 1, __FUNCTION__ ); return; } diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc index 0f03d17601..cffc5d3bff 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h index e3ab5916c0..2a6d960581 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk index f2804ffb85..159e19a46d 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk @@ -36,6 +36,13 @@ endif # See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html ###RANLIB = ranlib +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +LDFLAGS = +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +CXXFLAGS += -mmacosx-version-min=11.3 +LDFLAGS += -mmacosx-version-min=11.3 +endif + #------------------------------------------------------------------------------- #=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) @@ -266,11 +273,11 @@ endif ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(CXX) -shared -o $@ $(cxx_objects) + $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h index 205accb85b..da4ba36ad8 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h @@ -15,7 +15,6 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip -#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif @@ -24,16 +23,19 @@ // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For CUDA, by default, it is supported -// For HIP, by default, it is not supported -// For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -#ifdef __CUDACC__ -#undef MGONGPU_HAS_NO_CURAND -#elif defined __HIPCC__ +// For HIP, by default, do not use curand (common random numbers will be used instead) +// For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ #define MGONGPU_HAS_NO_CURAND 1 #else +//#ifdef __CUDACC__ +//#undef MGONGPU_HAS_NO_CURAND // default +////#define MGONGPU_HAS_NO_CURAND 1 +//#else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 +//#endif #endif // Choose floating point precision (for everything but color algebra #537) diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h index 46d9f02733..5532e22fa1 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h @@ -159,6 +159,12 @@ namespace mg5amcCpu return cxsmpl( a, 0 ) * b; } + inline __host__ __device__ constexpr cxsmpl + operator*( const cxsmpl& a, const double& b ) + { + return a * cxsmpl( b, 0 ); + } + template inline __host__ __device__ constexpr cxsmpl operator/( const cxsmpl& a, const cxsmpl& b ) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 25807a1217..e35d15e679 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00539088249206543  +DEBUG: model prefixing takes 0.00555729866027832  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,7 +191,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.100 s +Wrote files for 10 helas calls in 0.101 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines @@ -200,7 +200,7 @@ ALOHA: aloha creates 2 routines in 0.143 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.132 s +ALOHA: aloha creates 4 routines in 0.130 s VVV1 FFV1 FFV1 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.694s -user 0m1.467s -sys 0m0.223s +real 0m1.692s +user 0m1.443s +sys 0m0.239s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index ed49bfc186..d541a897ed 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt.mg +import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0055599212646484375  +DEBUG: model prefixing takes 0.0053708553314208984  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -157,76 +157,45 @@ INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams 1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams -output standalone_cudacpp CODEGEN_cudacpp_gg_tt -Load PLUGIN.CUDACPP_SA_OUTPUT -Output will be done with PLUGIN: CUDACPP_SA_OUTPUT +output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt +Load PLUGIN.CUDACPP_OUTPUT +Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 186]  -DEBUG: type(subproc_group)= [output.py at line 187]  -DEBUG: type(fortran_model)= [output.py at line 188]  -DEBUG: type(me)= me=0 [output.py at line 189]  -DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  -DEBUG: proc_id =  0 [model_handling.py at line 1045]  -INFO: Creating files in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx -DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  -DEBUG: self.include_multi_channel is not yet defined: this is standalone_cudacpp mode [model_handling.py at line 1301]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h -DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc -DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]  -DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]  -DEBUG: self.include_multi_channel =  False [model_handling.py at line 1144]  -DEBUG: self.support_multichannel =  True [model_handling.py at line 1145]  -DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1162]  -DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1163]  -DEBUG: multi_channel_map =  None [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1711]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 1, -1, 1, 1) 
[model_handling.py at line 1824]  -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. -DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_processidfile [model_handling.py at line 1389]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttx.txt [model_handling.py at line 1335]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  +INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.142 s +ALOHA: aloha creates 2 routines in 0.144 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , keys size = 2 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , keys size = 2 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , keys size = 2 [model_handling.py at line 729]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. -DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  +INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. quit -real 0m0.594s -user 0m0.537s -sys 0m0.048s +real 0m0.545s +user 0m0.478s +sys 0m0.059s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h index f37c972b24..89437b4c42 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -244,14 +245,21 @@ namespace mg5amcCpu } std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL - process.initProc( "../../Cards/param_card.dat" ); + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? 
what if fbridgecreate is called from several Fortran threads? + CPPProcess process( /*verbose=*/false ); + std::string paramCard = "../../Cards/param_card.dat"; + if( !std::filesystem::exists( paramCard ) ) + { + paramCard = "../" + paramCard; + } + process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h index 9c467b1e04..6a7d9c05c0 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -39,6 +39,8 @@ #elif defined __HIPCC__ +#include "hip/hip_runtime.h" + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h index 176338151a..a64c05c26a 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -215,19 +216,16 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) #endif constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) // Dump events to a new reference file? - constexpr bool dumpEvents = false; - std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; - while( dumpFileName.find( '/' ) != std::string::npos ) - { - dumpFileName.replace( dumpFileName.find( '/' ), 1, "_" ); - } + const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); + const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); + const std::string refFileName = testDriver->getRefFileName(); + const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); std::ofstream dumpFile; if( dumpEvents ) { dumpFile.open( dumpFileName, std::ios::trunc ); } // Read reference data - const std::string refFileName = testDriver->getRefFileName(); std::map referenceData; if( !dumpEvents ) { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..81699dfea9 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -112,10 +112,17 @@ namespace mg5amcCpu // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu bool ok = true; // this is just an assumption! const std::string tag = "arm neon (128bit as in SSE4.2)"; -#else +#elif defined( __x86_64__ ) || defined( __i386__ ) bool known = true; bool ok = __builtin_cpu_supports( "sse4.2" ); const std::string tag = "nehalem (SSE4.2)"; +#else // AV FIXME! Added by OM for Mac, should identify the correct __xxx__ flag that should be targeted + bool known = false; // __builtin_cpu_supports is not supported + // See https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions-Available-on-all-Configurations.html + // See https://stackoverflow.com/q/62783908 + // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu + bool ok = true; // this is just an assumption! 
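Note on the Bridge.h hunk in this patch: the Bridge constructor now probes for param_card.dat with a single "../" fallback, presumably because the Bridge can be created from different working directories. A minimal standalone sketch of that probing idea, generalized to a few parent levels (the helper name findParamCard and the level count are illustrative assumptions, not part of this patch):

    // C++17 sketch: resolve a relative path by retrying a few directories higher,
    // as Bridge.h does with one "../" retry for "../../Cards/param_card.dat".
    // NB: findParamCard and maxLevels are illustrative names, not from this patch.
    #include <filesystem>
    #include <stdexcept>
    #include <string>

    inline std::string findParamCard( const std::string& relPath, int maxLevels = 2 )
    {
      std::string candidate = relPath;
      for( int level = 0; level <= maxLevels; ++level )
      {
        if( std::filesystem::exists( candidate ) ) return candidate;
        candidate = "../" + candidate; // retry one directory higher
      }
      throw std::runtime_error( "param card not found: " + relPath );
    }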
+ const std::string tag = "arm neon (128bit as in SSE4.2)"; #endif #else bool known = true; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index 960d7ef518..e7dbb05570 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -247,10 +247,10 @@ namespace mg5amcCpu ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 0., 0., w_fp[4] ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -260,10 +260,10 @@ namespace mg5amcCpu // *** DIAGRAM 2 OF 3 *** // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[4] ); + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -272,10 +272,10 @@ namespace mg5amcCpu // *** DIAGRAM 3 OF 3 *** // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[4] ); + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -643,12 +643,12 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - fptype allMEsLast = 0; + { /* clang-format on */ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid - allMEs[ievt] = 0; for( int ihel = 0; ihel < ncomb; ihel++ ) { + // NEW IMPLEMENTATION OF GETGOODHEL (#630): RESET THE RUNNING SUM OVER HELICITIES TO 0 BEFORE ADDING A NEW HELICITY + allMEs[ievt] = 0; // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -657,12 +657,11 @@ 
namespace mg5amcCpu #else calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); #endif - if( allMEs[ievt] != allMEsLast ) + if( allMEs[ievt] != 0 ) // NEW IMPLEMENTATION OF GETGOODHEL (#630): COMPARE EACH HELICITY CONTRIBUTION TO 0 { //if ( !isGoodHel[ihel] ) std::cout << "sigmaKin_getGoodHel ihel=" << ihel << " TRUE" << std::endl; isGoodHel[ihel] = true; } - allMEsLast = allMEs[ievt]; // running sum up to helicity ihel for event ievt } } #else @@ -681,19 +680,11 @@ namespace mg5amcCpu //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] // Allocate arrays at build time to contain at least 16 events (or at least neppV events if neppV>16, e.g. in future VPUs) constexpr int maxtry0 = std::max( 16, neppV ); // 16, but at least neppV (otherwise the npagV loop does not even start) - fptype allMEsLast[maxtry0] = { 0 }; // allocated at build time: maxtry0 must be a constexpr // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt1 was last observed for "mirror processes" in uux_ttx in the 270 branch (see issue #343 and PRs #360 and #396) + // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; - static_assert( nprocesses == 1, "Assume nprocesses == 1" ); - // process_id corresponds to the index of DSIG1 Fortran functions (must be 1 because cudacpp is unable to handle DSIG2) + static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } @@ -887,23 +883,26 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 - fptype targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + const unsigned int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + fptype targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } } #endif @@ -998,57 +997,60 @@ namespace mg5amcCpu #endif } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - const int channelIdC = 
channelId - 1; // coloramps.h uses the C array indexing starting at 0 // Event-by-event random choice of color #402 - fptype_sv targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } + const unsigned int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + fptype_sv targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = fptype_sv{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + fptype_sv targetamp2[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp2[icolC] = fptype_sv{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + } +#endif + for( int ieppV = 0; ieppV < neppV; ++ieppV ) + { + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { #if defined MGONGPU_CPPSIMD - const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); #endif - if( okcol ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( okcol ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + const int ievt2 = ievt00 + ieppV + neppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + { + allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #endif + } } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h 
b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index 26a8ecb9f5..4a88a07226 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc index 1bad694d1c..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc @@ -29,7 +29,9 @@ #include #include +#include // for feenableexcept #include +#include // for signal and SIGFPE #include #include #include @@ -74,6 +76,23 @@ usage( char* argv0, int ret = 1 ) return ret; } +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + inline void FPEhandler( int sig ) + { +#ifdef __CUDACC__ + std::cerr << "Floating Point Exception (GPU)" << std::endl; +#else + std::cerr << "Floating Point Exception (CPU)" << std::endl; +#endif + exit( 0 ); + } +} + int main( int argc, char** argv ) { @@ -84,6 +103,18 @@ main( int argc, char** argv ) using namespace mg5amcCpu; #endif + // Enable FPEs (test #701 and #733 - except on MacOS where feenableexcept is not defined #730) +#ifndef __APPLE__ + const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" ); + const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" ); + if( enableFPE ) + { + std::cout << "WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions" << std::endl; + feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 + signal( SIGFPE, FPEhandler ); + } +#endif + // DEFAULTS FOR COMMAND LINE ARGUMENTS bool verbose = false; bool debug = false; @@ -103,12 +134,14 @@ main( int argc, char** argv ) CurandHost = 1, CurandDevice = 2 }; -#ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU -#elif not defined MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#ifdef MGONGPU_HAS_NO_CURAND + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random +#elif defined __CUDACC__ + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) 
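Note on the check_sa.cc hunks in this patch: floating point exception trapping becomes opt-in, enabled only when the CUDACPP_RUNTIME_ENABLEFPE environment variable is set to a non-empty value, with a SIGFPE handler reporting the trap (tests #701 and #733, except on MacOS where feenableexcept is not defined, #730). A self-contained sketch of the same pattern (glibc only; the terminating division at the end is an illustrative demo, not from the patch):

    #include <cfenv>   // FE_* macros; feenableexcept is a glibc extension (g++ defines _GNU_SOURCE)
    #include <csignal> // std::signal and SIGFPE
    #include <cstdlib> // std::getenv and std::exit
    #include <iostream>
    #include <string>

    int main()
    {
      const char* enableFPEc = std::getenv( "CUDACPP_RUNTIME_ENABLEFPE" );
      const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" );
      if( enableFPE )
      {
        feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW );
        std::signal( SIGFPE, []( int ) { std::cerr << "Floating Point Exception" << std::endl; std::exit( 0 ); } );
      }
      volatile double zero = 0;
      volatile double inf = 1 / zero; // FE_DIVBYZERO: traps via SIGFPE only if enabled above
      return inf > 0 ? 0 : 1;
    }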
enum class RamboSamplingMode @@ -146,18 +179,20 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef __CUDACC__ - rndgen = RandomNumberMode::CurandDevice; +#ifndef __CUDACC__ + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); +#elif defined MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + rndgen = RandomNumberMode::CurandDevice; #endif } else if( arg == "--curhst" ) { -#ifndef MGONGPU_HAS_NO_CURAND - rndgen = RandomNumberMode::CurandHost; -#else +#ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); +#else + rndgen = RandomNumberMode::CurandHost; #endif } else if( arg == "--common" ) @@ -278,10 +313,10 @@ main( int argc, char** argv ) const std::string procKey = "0a ProcInit"; timermap.start( procKey ); - // Create a process object + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? (for instance, in bridge mode this will be called twice here?) CPPProcess process( verbose ); - - // Read param_card and set parameters process.initProc( "../../Cards/param_card.dat" ); const fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) //const fptype energy = 91.2; // Ecms = 91.2 GeV (Z peak) @@ -389,30 +424,26 @@ main( int argc, char** argv ) { prnk.reset( new CommonRandomNumberKernel( hstRndmom ) ); } -#ifndef MGONGPU_HAS_NO_CURAND else if( rndgen == RandomNumberMode::CurandHost ) { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! CurandHost is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#else const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif } -#ifdef __CUDACC__ else { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __CUDACC__ const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); - } #else - else - { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) - } + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif -#else - else - { - throw std::logic_error( "This application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) } -#endif // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -747,7 +778,7 @@ main( int argc, char** argv ) wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? 
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -757,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 59a2c906eb..f2cfa349da 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -4,10 +4,13 @@ # Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) -#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories +#=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts +#=== NB: use 'override' to ensure that the value can not be modified from the outside +override CUDACPP_MAKEFILE := $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) +###$(info CUDACPP_MAKEFILE='$(CUDACPP_MAKEFILE)') -CUDACPP_MAKEFILE = $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) -CUDACPP_SRC_MAKEFILE = cudacpp_src.mk +#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories +override CUDACPP_SRC_MAKEFILE = cudacpp_src.mk #------------------------------------------------------------------------------- @@ -29,7 +32,17 @@ UNAME_P := $(shell uname -p) #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Include the common MG5aMC Makefile options + +# OM: this is crucial for MG5aMC flag consistency/documentation +# AV: temporarely comment this out because it breaks cudacpp builds +ifneq ($(wildcard ../../Source/make_opts),) +include ../../Source/make_opts +endif + +#------------------------------------------------------------------------------- + +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here @@ -101,68 +114,85 @@ endif # Note: AR, CXX and FC are implicitly defined if not set externally # See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html -#------------------------------------------------------------------------------- - -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +CXXFLAGS += -mmacosx-version-min=11.3 +endif - # If CUDA_HOME is not set, try to set it from the location of nvcc - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif +#------------------------------------------------------------------------------- - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - CUDATESTFLAGS = -lcuda - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + override HIP_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the path to nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). 
+ # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + ifeq ($(RNDGEN),hasNoCurand) + CURANDLIBFLAGS= else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! endif + CUOPTFLAGS = -lineinfo + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -173,71 +203,55 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) GPUFLAGS += -allow-unsupported-compiler endif -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler +else ifneq ($(origin REQUIRE_CUDA),undefined) - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning HIP builds are not supported for multi-word CXX "$(CXX)") - override HIP_HOME=disabled - endif + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif +#--- Option 2: CUDA does not exist, HIP exists -> use HIP - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + +else + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) export GPUCC export GPUFLAGS #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -254,7 +268,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -270,7 +284,7 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) GPUFLAGS+= -Xcompiler -mno-float128 endif @@ -285,12 +299,14 @@ override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) -else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) -override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) # AV for Mac (Apple clang compiler) +else ifeq ($(UNAME_S),Darwin) # OM for Mac (any compiler) +override OMPFLAGS = # AV disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = -fopenmp # OM reenable OpenMP MT on Apple clang? 
(AV Oct 2023: this still fails in the CI) else -override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT (default before #575) +override OMPFLAGS = -fopenmp # enable OpenMP MT by default on all other platforms +###override OMPFLAGS = # disable OpenMP MT on all other platforms (default before #575) endif # Set the default AVX (vectorization) choice @@ -356,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -573,8 +589,9 @@ $(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) # Apply special build flags only to check_sa and CurandRandomNumberKernel (curand headers, #679) $(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gcheck_sa.o: CUFLAGS += $(CXXFLAGSCURAND) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gCurandRandomNumberKernel.o: CUFLAGS += $(CXXFLAGSCURAND) ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif @@ -772,12 +789,18 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif +# Use target gtestlibs to build only googletest +ifneq ($(GTESTLIBS),) +gtestlibs: $(GTESTLIBS) +endif + # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 $(GTESTLIBS): ifneq ($(shell which flock 2>/dev/null),) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi flock $(BUILDDIR)/.make_test.lock $(MAKE) -C $(TESTDIR) else - $(MAKE) -C $(TESTDIR) + if [ -d $(TESTDIR) ]; then $(MAKE) -C $(TESTDIR); fi endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc index 2b956730d4..22ce3f5115 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc @@ -49,11 +49,7 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL GpuRuntime::setUp(); #endif - // Create a process object, read parm card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
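Note on the fbridge.cc lines removed here (and the FIXMEs re-added in Bridge.h and runTest.cc): they keep raising the same question, namely that CPPProcess instances only exist to call initProc, so should that initialization be a singleton, and what happens if fbridgecreate is called from several Fortran threads? One possible answer, sketched purely as an illustration (initProcessOnce is a hypothetical name; nothing like it is introduced by this patch), is a function-local static, which C++11 guarantees is initialized exactly once even under concurrent calls:

    #include <string>

    struct CPPProcess // stand-in with the two members that matter for this sketch
    {
      explicit CPPProcess( bool verbose ) { (void)verbose; }
      void initProc( const std::string& paramCard ) { (void)paramCard; /* read the param card */ }
    };

    inline void initProcessOnce() // hypothetical helper, not part of this patch
    {
      static const bool once = []()
      {
        CPPProcess process( /*verbose=*/false ); // may go out of scope: only needed to read parameters
        process.initProc( "../../Cards/param_card.dat" );
        return true;
      }();
      (void)once; // thread-safe: C++11 initializes function-local statics exactly once
    }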
- CPPProcess process( /*verbose=*/false ); - process.initProc( "../../Cards/param_card.dat" ); + // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran *ppbridge = new Bridge( *pnevtF, *pnparF, *pnp4F ); } diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc index 0ed26180ca..de327f2321 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc @@ -71,6 +71,8 @@ struct CPUTest : public CUDA_CPU_TestBase , hstSelCol( nevt ) , hstIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? process.initProc( "../../Cards/param_card.dat" ); } @@ -183,6 +185,8 @@ struct CUDATest : public CUDA_CPU_TestBase , devSelCol( nevt ) , devIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? process.initProc( "../../Cards/param_card.dat" ); } diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc index 016bc0f472..e5167de00c 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc @@ -59,7 +59,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) using namespace mg5amcCpu; #endif #ifndef __APPLE__ // test #701 (except on MacOS where feenableexcept is not defined #730) - const bool enableFPE = !getenv( "CUDACPP_RUNTIME_DISABLEFPE" ); + const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" ); + const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" ); if( enableFPE ) { feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 diff --git a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h index 57ad0974c3..add8fce575 100644 --- a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -862,6 +862,7 @@ namespace mg5amcCpu VVV1P0_1( const fptype allV2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) ALWAYS_INLINE; @@ -875,6 +876,7 @@ namespace mg5amcCpu const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) ALWAYS_INLINE; //-------------------------------------------------------------------------- @@ -885,6 +887,7 @@ namespace mg5amcCpu FFV1_1( const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allF1[] ) ALWAYS_INLINE; @@ -897,6 +900,7 @@ namespace mg5amcCpu FFV1_2( const fptype allF1[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M2, const fptype W2, fptype allF2[] ) ALWAYS_INLINE; @@ -909,6 +913,7 @@ namespace mg5amcCpu VVV1P0_1( const fptype allV2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) @@ -947,6 +952,7 @@ namespace mg5amcCpu const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) { mgDebug( 0, __FUNCTION__ ); @@ -970,6 +976,7 @@ namespace mg5amcCpu FFV1_1( const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allF1[] ) @@ -1001,6 +1008,7 @@ namespace mg5amcCpu FFV1_2( const fptype allF1[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M2, const fptype W2, fptype allF2[] ) diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc index 0e29798b23..c5dd6e7e4c 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h index f629e8cadb..5f2f4391b9 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk index f2804ffb85..159e19a46d 100644 --- a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk @@ -36,6 +36,13 @@ endif # See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html ###RANLIB = ranlib +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +LDFLAGS = +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +CXXFLAGS += -mmacosx-version-min=11.3 +LDFLAGS += -mmacosx-version-min=11.3 +endif + #------------------------------------------------------------------------------- #=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) @@ -266,11 +273,11 @@ endif ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(CXX) -shared -o $@ $(cxx_objects) + $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h index 205accb85b..da4ba36ad8 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h @@ -15,7 +15,6 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip -#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif @@ -24,16 +23,19 @@ // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For CUDA, by default, it is supported -// For HIP, by default, it is not supported -// For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -#ifdef __CUDACC__ -#undef MGONGPU_HAS_NO_CURAND -#elif defined __HIPCC__ +// For HIP, by default, do not use curand (common random numbers will be used instead) +// For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND +// (there exist CUDA installations, e.g. 
using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ #define MGONGPU_HAS_NO_CURAND 1 #else +//#ifdef __CUDACC__ +//#undef MGONGPU_HAS_NO_CURAND // default +////#define MGONGPU_HAS_NO_CURAND 1 +//#else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 +//#endif #endif // Choose floating point precision (for everything but color algebra #537) diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h index 46d9f02733..5532e22fa1 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h @@ -159,6 +159,12 @@ namespace mg5amcCpu return cxsmpl( a, 0 ) * b; } + inline __host__ __device__ constexpr cxsmpl + operator*( const cxsmpl& a, const double& b ) + { + return a * cxsmpl( b, 0 ); + } + template inline __host__ __device__ constexpr cxsmpl operator/( const cxsmpl& a, const cxsmpl& b ) diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index b3d319e039..0e50ba9321 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005671977996826172  +DEBUG: model prefixing takes 0.005617380142211914  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -163,7 +163,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.020 s +1 processes with 16 diagrams generated in 0.019 s Total: 2 processes with 19 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -185,7 +185,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
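The HelAmps signature changes above thread a new real coefficient (Ccoeff) through each vertex routine, and the mgOnGpuCxtypes.h hunk adds the matching mixed-type product so a complex coupling can be rescaled by a plain double at the call site. A minimal self-contained sketch of that pattern (illustrative names, not the generated code):

#include <iostream>

// Stand-in for the cxsmpl complex type (illustrative only).
template<typename FP>
struct cx { FP r, i; };

template<typename FP>
constexpr cx<FP> operator*( const cx<FP>& a, const cx<FP>& b )
{
  return { a.r * b.r - a.i * b.i, a.r * b.i + a.i * b.r };
}

// The pattern added by the patch: promote the real coefficient to a complex
// with zero imaginary part, then reuse the complex-complex product.
template<typename FP>
constexpr cx<FP> operator*( const cx<FP>& a, const double& b )
{
  return a * cx<FP>{ static_cast<FP>( b ), 0 };
}

int main()
{
  constexpr cx<double> coup{ 1.5, 2.0 };    // a complex coupling
  constexpr cx<double> scaled = coup * 2.0; // rescaled by a real coefficient
  std::cout << scaled.r << " " << scaled.i << std::endl; // prints "3 4"
  return 0;
}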
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -202,7 +202,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 2 subprocesses (19 diagrams) in 0.044 s -Wrote files for 46 helas calls in 0.247 s +Generated helas calls for 2 subprocesses (19 diagrams) in 0.043 s +Wrote files for 46 helas calls in 0.239 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.330 s +ALOHA: aloha creates 5 routines in 0.329 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.316 s +ALOHA: aloha creates 10 routines in 0.312 s VVV1 VVV1 FFV1 @@ -283,10 +283,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.334s -user 0m2.083s -sys 0m0.238s -Code generation completed in 2 seconds +real 0m2.298s +user 0m2.019s +sys 0m0.276s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -312,7 +312,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -342,7 +342,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * 
m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
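The gpu_sequence change above also moves dev_transposeMomentaF2C from the <<<...>>> syntax to gpuLaunchKernel. The kernel itself implements a simple idea: Fortran fills the momenta buffer event-major, while the cudacpp kernels want it element-major for coalesced reads, so the device permutes the layout with one element per thread. A simplified sketch (the real buffer is an AOSOA with neppM events per page, omitted here):

__global__ void transposeF2CSketch( const double* in, double* out, const unsigned int nevt, const unsigned int nelem )
{
  // in:  [ievt][ielem], event-major as filled on the Fortran side
  // out: [ielem][ievt], element-major as read by the cudacpp kernels
  const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if( tid < nevt * nelem )
  {
    const unsigned int ievt = tid / nelem;
    const unsigned int ielem = tid % nelem;
    out[ielem * nevt + ievt] = in[ievt * nelem + ielem];
  }
}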
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h index 9c467b1e04..6a7d9c05c0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h @@ -39,6 +39,8 @@ #elif defined __HIPCC__ +#include "hip/hip_runtime.h" + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
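The GpuAbstraction.h hunk above (pulling in hip/hip_runtime.h on HIP builds) belongs to a header that is essentially a table of #define mappings: every gpu* name resolves to the cuda* or hip* runtime symbol depending on which compiler is active. A trimmed sketch of the idea with only a few of the mappings (the launch-macro shape is an assumption; the real header defines many more wrappers):

#if defined __CUDACC__
#define gpuError_t cudaError_t
#define gpuMemcpy cudaMemcpy
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<blocks, threads>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#include "hip/hip_runtime.h"
#define gpuError_t hipError_t
#define gpuMemcpy hipMemcpy
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<blocks, threads>>>( __VA_ARGS__ )
#endif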
#ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
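The computeMatrixElements rewrite above keeps the standard asynchronous-launch discipline while switching to the portable wrappers: launch the kernel, peek for launch errors, then synchronize so execution errors also surface before the MEs are consumed. A condensed sketch of that call sequence (hypothetical kernel; assumes the gpu*/checkGpu wrappers introduced by this patch):

// Hypothetical kernel scaling one matrix element per event.
__global__ void scaleMEsSketch( double* allMEs, const int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt < nevt ) allMEs[ievt] *= 2.;
}

void computeSketch( double* devMEs, const int gpublocks, const int gputhreads )
{
  gpuLaunchKernel( scaleMEsSketch, gpublocks, gputhreads, devMEs, gpublocks * gputhreads );
  checkGpu( gpuPeekAtLastError() );   // catches invalid launch configurations
  checkGpu( gpuDeviceSynchronize() ); // catches errors raised during execution
}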
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h 
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
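The KernelAccessHelper change above touches the one place where the event index is computed differently per backend: in GPU builds each thread derives its own event from the grid coordinates, while in C++ builds the caller passes an explicit event number and loops over SIMD pages. A simplified sketch of that split (the real helpers go through ieventAccessRecord and support vector types):

#ifdef MGONGPUCPP_GPUIMPL
__device__ inline double& kernelAccessSketch( double* buffer )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one thread, one event
  return buffer[ievt];
}
#else
inline double& eventAccessSketch( double* buffer, const int ievt )
{
  return buffer[ievt]; // the caller loops over events explicitly
}
#endif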
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
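For context on the neppM comment above: on GPUs the number of events per AOSOA page is chosen so that one page of one momentum component fills a whole 32-byte cacheline, which is what makes neighbouring threads read contiguous, coalesced memory. The arithmetic, assuming double precision:

// 32-byte cachelines and 8-byte doubles give 4 events per page (or any
// power-of-2 multiple of it); single precision would give 8.
constexpr int cachelineBytes = 32;
constexpr int neppMSketch = cachelineBytes / sizeof( double );
static_assert( neppMSketch == 4, "expect 4 doubles per 32-byte cacheline" );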
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events 
template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
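The buffer hierarchy being converted above is plain RAII: each concrete buffer typedef (momenta, Gs, couplings, MEs, ...) wraps a base class that acquires memory in its constructor and releases it in its destructor, now through the portable gpuMalloc/gpuFree wrappers. A condensed sketch of the device case (illustrative class name; error checking and the NumberOfEvents mixin omitted; assumes the wrappers accept a typed pointer, as in the patched MemoryBuffers.h):

#include <cstddef>

template<typename T>
class DeviceBufferSketch
{
public:
  explicit DeviceBufferSketch( const std::size_t size ) : m_size( size ), m_data( nullptr )
  {
    gpuMalloc( &m_data, m_size * sizeof( T ) ); // acquire on construction
  }
  ~DeviceBufferSketch() { gpuFree( m_data ); } // release on destruction
  DeviceBufferSketch( const DeviceBufferSketch& ) = delete; // no accidental double frees
  DeviceBufferSketch& operator=( const DeviceBufferSketch& ) = delete;
  T* data() { return m_data; }
  std::size_t size() const { return m_size; }
private:
  const std::size_t m_size;
  T* m_data;
};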
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 18052b6676..f20c229897 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < 
nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -302,7 +303,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -359,7 +360,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -418,7 +419,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -465,8 +466,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -506,9 +507,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -544,7 +545,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -609,12 +610,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -635,7 +636,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -761,9 +762,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -787,7 +788,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -807,7 +808,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -821,9 +822,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -851,7 +855,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1061,7 +1065,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 3ebd92c038..4a88a07226 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? 
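// [Sketch of the kind of SIMD guard meant here; the real check sits outside this hunk and may differ:
//    #if defined __AVX512VL__
//    if( !__builtin_cpu_supports( "avx512vl" ) ) { std::cout << "ERROR! The host does not support avx512vl" << std::endl; return 1; }
//    #endif
//  i.e. exit cleanly before any SIMD matrix-element code is reached, instead of crashing on an illegal instruction.]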
// [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
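// [Reading aid for the cascade below, with assumptions flagged: nvcc builds choose between the
//  cuComplex and thrust::complex types, hipcc builds only tag the simple cxsmpl class ("CXS:",
//  since neither cuComplex nor thrust is assumed usable under HIP), and C++ builds tag std::complex
//  ("STX:"); cxsmpl is the hand-written complex type that MGONGPU_CUCXTYPE_CXSMPL is assumed to
//  select in mgOnGpuCxtypes.h.]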
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index bfab81142d..3c7715b235 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -505,7 +506,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -562,7 +563,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -621,7 +622,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -684,8 +685,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -726,9 +727,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -765,7 +766,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -830,12 +831,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -856,7 +857,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -982,9 +983,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1008,7 +1009,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1028,7 +1029,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1042,9 +1043,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1072,7 +1076,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1282,7 +1286,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h index 3901ddcb20..d4b3c0445c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? 
// [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
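# [Illustration, not part of this patch: OPTFLAGS reaches the GPU compiler by two routes below.
#  nvcc forwards host-compiler options one by one, so the CUDA branch expands
#  "$(foreach opt, $(OPTFLAGS), -Xcompiler $(opt))" to "-Xcompiler -O3", while hipcc is a clang
#  driver and takes "$(OPTFLAGS)" verbatim in its GPUFLAGS.]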
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
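# [Side note on the question above, as an untested assumption rather than a recipe: the closest
#  HIP/ROCm analogue of NVTX ranges is the rocTX API shipped with roctracer (header roctx.h),
#  which would need its own hypothetical switch here, e.g. "#USE_ROCTX ?=-DUSE_ROCTX".]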
+  HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+  HIPINC = -I$(HIP_HOME)/include/
+  # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP
+  # (but only for single precision; see line 208 of https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+  GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+  ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+  GPUFLAGS += -std=c++17
+  ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?)
+  CUBUILDRULEFLAGS = -fPIC -c
+  CCBUILDRULEFLAGS = -fPIC -c
+
+else ifneq ($(origin REQUIRE_HIP),undefined)
+
+  # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443)
+  $(error No hip installation found (set HIP_HOME or add hipcc to PATH))
+
+#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP
+
 else
 
-  # No cuda. Switch cuda compilation off and go to common random numbers in C++
+
+  # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++
   $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
-  override NVCC=
+  $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
+  override GPUCC=
   override USE_NVTX=
   override CUINC=
   override CURANDLIBFLAGS=
-endif
-export NVCC
-export CUFLAGS
-
-# Set the host C++ compiler for nvcc via "-ccbin <host-compiler>"
-# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
-# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-CUFLAGS += -allow-unsupported-compiler
endif
 
+# Export GPUCC (so that it can also be used in cudacpp_src.mk?)
+export GPUCC
+export GPUFLAGS
+
 #-------------------------------------------------------------------------------
 
-#=== Configure ccache for C++ and CUDA builds
+#=== Configure ccache for C++ and CUDA/HIP builds
 
 # Enable ccache if USECCACHE=1
ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -201,15 +260,15 @@ endif
#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
#  override AR:=ccache $(AR)
#endif
-ifneq ($(NVCC),)
-  ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
-    override NVCC:=ccache $(NVCC)
+ifneq ($(GPUCC),)
+  ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
+    override GPUCC:=ccache $(GPUCC)
   endif
endif
 
 #-------------------------------------------------------------------------------
 
-#=== Configure PowerPC-specific compiler flags for C++ and CUDA
+#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP
 
 # PowerPC-specific CXX compiler flags (being reviewed)
ifeq ($(UNAME_P),ppc64le)
@@ -225,9 +284,9 @@ else
 ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto)
endif
 
-# PowerPC-specific CUDA compiler flags (to be reviewed!)
+# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac)
@@ -461,7 +523,7 @@ override RUNTIME =
 
cxx_main=$(BUILDDIR)/check.exe
fcxx_main=$(BUILDDIR)/fcheck.exe
 
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_main=$(BUILDDIR)/gcheck.exe
fcu_main=$(BUILDDIR)/fgcheck.exe
else
@@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG):
	@touch $(BUILDDIR)/.build.$(TAG)
 
# Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@
 
$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
endif
+# (NB: for nvcc, the '-x cu' needed to compile .cc files as CUDA is included via $(CCBUILDRULEFLAGS) in the rule above)
 
# Generic target and build rules: objects from C++ compilation
# (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@
 
# Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516)
+# Edge case added for HIP compilation (hipcc takes -fno-fast-math directly, without -Xcompiler)
ifeq ($(shell $(CXX) --version | grep ^nvc++),)
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS))
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
endif
endif
 
@@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand)
$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
endif
 
-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592)
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592)
ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),)
-ifneq ($(NVCC),)
-CUFLAGS += -Xcompiler -Wno-deprecated-builtins
+ifneq ($(GPUCC),)
+GPUFLAGS += -Wno-deprecated-builtins
endif
endif
 
@@ -541,8 +607,8 @@ endif
# This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
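The three-way cascade above leaves GPUCC pointing at nvcc, at hipcc, or empty, and every later target branches on that value. A minimal standalone C++ probe (a hypothetical helper file, not part of this patch) makes the underlying compiler-defined macros visible; building it with nvcc, hipcc or a plain C++ compiler prints which backend the cascade would select:

#include <cstdio>
// __CUDACC__ is defined by nvcc and __HIPCC__ by hipcc; plain C++ compilers define neither.
int main()
{
#if defined __CUDACC__
  std::printf( "GPU backend: CUDA (nvcc)\n" );
#elif defined __HIPCC__
  std::printf( "GPU backend: HIP (hipcc)\n" );
#else
  std::printf( "no GPU backend (plain C++ build)\n" );
#endif
  return 0;
}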
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
endif
 
# Use target gtestlibs to build only googletest
@@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1)
	ccache --version | head -1
endif
	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
endif
	@echo ""
	@echo CXX=$(CXX)
@@ -850,7 +916,7 @@ endif
 
# Target: check (run the C++ test executable)
# [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
check: runTest cmpFcheck cmpFGcheck
else
check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
 
 extern "C"
 {
@@ -22,7 +22,7 @@ extern "C"
  * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
  * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
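All of the source files touched below repeat one dispatch idiom: a single translation unit declares its symbols in mg5amcGpu for GPU builds and in mg5amcCpu for C++ builds, with MGONGPUCPP_GPUIMPL (defined in mgOnGpuConfig.h further down in this patch) selecting the namespace. A self-contained sketch of the pattern (simplified: here the macro is derived directly from the compiler macros instead of being included from mgOnGpuConfig.h, which defines it to 'cuda' or 'hip'):

#include <iostream>
// Simplified stand-in for mgOnGpuConfig.h
#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1
#endif
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  // The same symbol lives in a different namespace depending on the build
  inline const char* backend()
  {
#ifdef MGONGPUCPP_GPUIMPL
    return "mg5amcGpu (CUDA or HIP build)";
#else
    return "mg5amcCpu (C++ build)";
#endif
  }
}
#ifdef MGONGPUCPP_GPUIMPL
using namespace mg5amcGpu;
#else
using namespace mg5amcCpu;
#endif
int main()
{
  std::cout << backend() << std::endl;
  return 0;
}

Because the two namespaces never coexist in one compilation, CPU and GPU object files can define the same function names without clashing when both libraries are later linked into a single test executable (see the runTest changes below).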
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h index 361b488401..0dd0f3ebba 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. 
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
 
 #-------------------------------------------------------------------------------
 
@@ -45,13 +45,13 @@ endif
 
 #-------------------------------------------------------------------------------
 
-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)
 
-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))
 
 #-------------------------------------------------------------------------------
 
-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)
 
 # Enable ccache if USECCACHE=1
 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
 ###$(info OMPFLAGS=$(OMPFLAGS))
 CXXFLAGS += $(OMPFLAGS)
 
+# Add the correct GPU object-build flags for the chosen compiler (nvcc needs '-x cu' to compile .cc files as CUDA; hipcc needs only -fPIC -c)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
 # Set the build flags appropriate to each AVX choice (example: "make AVX=none")
 # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
 # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
 
 # Generic target and build rules: objects from CUDA compilation
 $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@
 
 #-------------------------------------------------------------------------------
 
 cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
 endif
 
 # Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
 	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
 else
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
 	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h
index 80032e528b..55d03f1252 100644
--- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 #ifndef MGONGPUCONFIG_H
 #define MGONGPUCONFIG_H 1
 
@@ -10,12 +10,25 @@
 // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel)
 #define MGONGPU_SUPPORTS_MULTICHANNEL 1
 
+// Is this a GPU (CUDA, HIP) or CPU implementation?
+#ifdef __CUDACC__
+#define MGONGPUCPP_GPUIMPL cuda
+#elif defined __HIPCC__
+#define MGONGPUCPP_GPUIMPL hip
+#else
+#undef MGONGPUCPP_GPUIMPL
+#endif
+
 // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12"
 // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs)
 
 // Choose if curand is supported for generating random numbers
+// For HIP, by default, do not use curand (common random numbers will be used instead)
 // For both CUDA and C++, by default, do use curand, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND
-// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784)
+// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785)
+#if defined __HIPCC__
+#define MGONGPU_HAS_NO_CURAND 1
+#else
 //#ifdef __CUDACC__
 //#undef MGONGPU_HAS_NO_CURAND // default
 ////#define MGONGPU_HAS_NO_CURAND 1
@@ -23,6 +36,7 @@
 //#undef MGONGPU_HAS_NO_CURAND // default
 ////#define MGONGPU_HAS_NO_CURAND 1
 //#endif
+#endif
 
 // Choose floating point precision (for everything but color algebra #537)
 // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167)
@@ -54,23 +68,28 @@
 //#undef MGONGPU_HARDCODE_PARAM // default
 ////#define MGONGPU_HARDCODE_PARAM 1
 
-// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE)
-#ifndef __CUDACC__
-//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float)
-#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
-#endif
-
-// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE)
+// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE)
 #ifdef __CUDACC__
 #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float)
 //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float)
 //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
+
+// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE)
+#elif defined __HIPCC__
+#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
+
+// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE)
+#else
+//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float)
+#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
 #endif
 
-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 #ifdef __CUDACC__
-#undef MGONGPU_NSIGHT_DEBUG // default
+#undef MGONGPU_NSIGHT_DEBUG // default in CUDA
 //#define MGONGPU_NSIGHT_DEBUG 1
+#else
+#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++
 #endif
 
 // SANITY CHECKS (floating point precision for everything but color algebra #537)
@@ -86,17 +105,21 @@
 #error You cannot use double precision for color algebra and single precision elsewhere
 #endif
 
-// SANITY CHECKS (c++ complex number implementation)
-#ifndef __CUDACC__
-#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL
+// SANITY CHECKS (CUDA complex number implementation)
+#ifdef __CUDACC__
+#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA
+#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA
+#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA
 #endif
 #endif
 
-// SANITY CHECKS (cuda complex number implementation)
-#ifdef __CUDACC__
-#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
+// SANITY CHECKS (C++ complex number implementation)
+#ifndef MGONGPUCPP_GPUIMPL
+#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++
 #endif
 #endif
 
@@ -134,7 +157,7 @@ namespace mgOnGpu
 
   // Alignment requirement for using reinterpret_cast with SIMD vectorized code
   // (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
   // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
 #endif
 
@@ -145,7 +168,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;
 
 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -175,9 +198,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif
 
-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
 #if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -189,8 +212,8 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */
 
-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
 #define __global__
 #define __host__
 #define __device__
diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h
index ca9a9f00c0..5532e22fa1 100644
--- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
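The empty declaration-specifier trick at the end of mgOnGpuConfig.h is what lets CUDA/HIP-decorated functions compile unchanged in plain C++ builds. A minimal sketch of the same idea (standalone and hypothetical, compilable with g++ as well as with nvcc or hipcc):

#include <cstdio>
// In a CPU-only build the GPU qualifiers are defined away, so the decorated
// function below is plain C++; under nvcc or hipcc they keep their real meaning.
#if !defined __CUDACC__ && !defined __HIPCC__
#define __global__
#define __host__
#define __device__
#endif
__host__ __device__ double square( double x ) { return x * x; }
int main()
{
  std::printf( "square( 3.0 ) = %f\n", square( 3.0 ) );
  return 0;
}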
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu 
return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt01g.mad/src/rambo.h b/epochX/cudacpp/gg_tt01g.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/rambo.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index d22ecfb1e2..527b74cf99 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg.mg +import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005263328552246094  +DEBUG: model prefixing takes 0.005304574966430664  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,83 +155,58 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams -output madevent CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp -Load PLUGIN.CUDACPP_SA_OUTPUT -Addition matrix-element will be done with PLUGIN: CUDACPP_SA_OUTPUT -Output will be done with PLUGIN: CUDACPP_SA_OUTPUT +output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp +Load PLUGIN.CUDACPP_OUTPUT +Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT +Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/Cards  -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg +WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  +WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg -DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  -DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6179]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . -DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  -DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  -FileWriter for ././CPPProcess.h -DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]  -FileWriter for ././CPPProcess.cc -DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]  -DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]  -DEBUG: self.include_multi_channel =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [model_handling.py at line 1144]  -DEBUG: self.support_multichannel =  True [model_handling.py at line 1145]  -DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1162]  -DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [model_handling.py at line 1163]  -DEBUG: multi_channel =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4], 6: [5], 7: [6], 8: [7], 9: [8], 10: [9], 11: [10], 12: [11], 13: [12], 14: [13], 15: [14]} [model_handling.py at line 1169]  -DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [2], 4: [3], 5: [4], 6: [5], 7: [6], 8: [7], 9: [8], 10: [9], 11: [10], 12: [11], 13: [12], 14: [13], 15: [14]} [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1711]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1824]  +FileWriter for ././CPPProcess.h +FileWriter for ././CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
-DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_processidfile [model_handling.py at line 1389]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_coloramps [model_handling.py at line 1401]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxg.txt [model_handling.py at line 1335]  DEBUG: proc_id =  1 [export_cpp.py at line 710]  DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s -Wrote files for 36 helas calls in 0.162 s +Wrote files for 36 helas calls in 0.149 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.324 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  +ALOHA: aloha creates 5 routines in 0.325 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.310 s +ALOHA: aloha creates 10 routines in 0.308 s VVV1 VVV1 FFV1 @@ -241,32 +216,103 @@ ALOHA: aloha creates 10 routines in 0.310 s VVVV1 VVVV3 VVVV4 -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/src/. +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/src/. +INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -Output to directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg done. +DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +patching file Source/genps.inc +patching file Source/makefile +patching file SubProcesses/makefile +patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). +patching file bin/internal/madevent_interface.py +DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +patching file auto_dsig1.f +patching file driver.f +patching file matrix1.f +Hunk #2 succeeded at 159 (offset 16 lines). +Hunk #3 succeeded at 237 (offset 16 lines). +Hunk #4 succeeded at 265 (offset 16 lines). +Hunk #5 succeeded at 310 (offset 16 lines). 
+DEBUG: p.returncode =  0 [output.py at line 237] 
+Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done.
Type "launch" to generate events from this process, or see
-/data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg/README
+/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README
Run "open index.html" to see more information about this process.
quit
-real 0m2.202s
-user 0m1.971s
-sys 0m0.182s
+real 0m2.265s
+user 0m1.926s
+sys 0m0.245s
+Code generation completed in 2 seconds
+************************************************************
+* *
+* W E L C O M E to *
+* M A D G R A P H 5 _ a M C @ N L O *
+* M A D E V E N T *
+* *
+* * * *
+* * * * * *
+* * * * * 5 * * * * *
+* * * * * *
+* * * *
+* *
+* VERSION 3.5.2_lo_vect *
+* *
+* The MadGraph5_aMC@NLO Development Team - Find us at *
+* https://server06.fynu.ucl.ac.be/projects/madgraph *
+* *
+* Type 'help' for in-line help. *
+* *
+************************************************************
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt
+Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
+Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
+Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
+treatcards run
+quit
+INFO: 
+launch in debug mode
+************************************************************
+* *
+* W E L C O M E to *
+* M A D G R A P H 5 _ a M C @ N L O *
+* M A D E V E N T *
+* *
+* * * *
+* * * * * *
+* * * * * 5 * * * * *
+* * * * * *
+* * * *
+* *
+* VERSION 3.5.2_lo_vect *
+* *
+* The MadGraph5_aMC@NLO Development Team - Find us at *
+* https://server06.fynu.ucl.ac.be/projects/madgraph *
+* *
+* Type 'help' for in-line help. *
+* *
+************************************************************
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt
+Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
+Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
+Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
+treatcards param
+quit
+INFO: 
+launch in debug mode
diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt
index 00d7c6f8d6..cdeedc7863 100644
--- a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt
+++ b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt
@@ -234,7 +234,7 @@
 #  pineappl = pineappl
 
-#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo
+#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo
 
 # MG5 MAIN DIRECTORY
-#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo
+#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo
 
diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat
index bd12dbd0f8..9d09090869 100644
--- a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat
@@ -8,7 +8,7 @@
 #* * * *
 #* *
 #* *
-#* VERSION 3.5.1_lo_vect 2023-08-08 *
+#* VERSION 3.5.2_lo_vect 2023-11-08 *
 #* *
 #* WARNING: UNKNOWN DEVELOPMENT VERSION. *
 #* WARNING: DO NOT USE FOR PRODUCTION *
@@ -45,5 +45,5 @@ define l+ = e+ mu+
 define l- = e- mu-
 define vl = ve vm vt
 define vl~ = ve~ vm~ vt~
-output madevent CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size\
-=16384 --me_exporter=standalone_cudacpp
+output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --v\
+ector_size=32 --me_exporter=standalone_cudacpp
diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat
index a9fbb5a212..f119b5a1c7 100644
--- a/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat
+++ b/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat
@@ -80,7 +80,23 @@
 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773)
 ! 1 is old strategy (using amp square)
 ! 2 is new strategy (using only the denominator)
-# To see advanced option for Phase-Space optimization: type "update psoptim"
+#*********************************************************************
+# Phase-Space Optim (advanced)
+#*********************************************************************
+ 0 = job_strategy ! see appendix of 1507.00020 (page 26)
+ 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference
+ -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact
+ -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration.
+ 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore
+ -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs)
+#*********************************************************************
+# Compilation flag.
+#*********************************************************************
+ -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O)
+ --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math'
+ -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
+ 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
+
 #*********************************************************************
 # Customization (custom cuts/scale/bias/...) *
 # list of files containing fortran function that overwrite default *
@@ -143,12 +159,10 @@
 systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++]
 ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule
-#*********************************************************************
-# Options for the cudacpp plugin
-#*********************************************************************
-
-# Set cudacpp-specific values of non-cudacpp-specific options
--O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp)
+#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
-# New cudacpp-specific options (default values are defined in banner.py)
-CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA
diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat
index 1464f610ba..e89c78702d 100644
--- a/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat
+++ b/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat
@@ -80,7 +80,23 @@
 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773)
 ! 1 is old strategy (using amp square)
 ! 2 is new strategy (using only the denominator)
-# To see advanced option for Phase-Space optimization: type "update psoptim"
+#*********************************************************************
+# Phase-Space Optim (advanced)
+#*********************************************************************
+ 0 = job_strategy ! see appendix of 1507.00020 (page 26)
+ 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference
+ -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact
+ -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration.
+ 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore
+ -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs)
+#*********************************************************************
+# Compilation flag.
+#*********************************************************************
+ -O = global_flag ! fortran optimization flag use for the all code.
+ --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math'
+ -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
+ 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
+
 #*********************************************************************
 # Customization (custom cuts/scale/bias/...) *
 # list of files containing fortran function that overwrite default *
@@ -142,3 +158,11 @@
 #  systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++]
 ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule
+
+#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
+
diff --git a/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt
index 1c1a95761b..85c67c3554 100644
--- a/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt
+++ b/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt
@@ -1 +1 @@
-3.5.1_lo_vect
\ No newline at end of file
+3.5.2_lo_vect
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc
index 50c12b0804..7639734c1c 100644
--- a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc
+++ b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc
@@ -1 +1 @@
-ALOHARoutine = FFV1_1.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o
+ALOHARoutine = FFV1P0_3.o FFV1_0.o FFV1_1.o FFV1_2.o VVV1P0_1.o VVV1_0.o VVVV1P0_1.o VVVV3P0_1.o VVVV4P0_1.o
diff --git a/epochX/cudacpp/gg_ttg.mad/Source/PDF/pdfwrap_lhapdf.f b/epochX/cudacpp/gg_ttg.mad/Source/PDF/pdfwrap_lhapdf.f
index 0be926e6cd..3f36905346 100644
--- a/epochX/cudacpp/gg_ttg.mad/Source/PDF/pdfwrap_lhapdf.f
+++ b/epochX/cudacpp/gg_ttg.mad/Source/PDF/pdfwrap_lhapdf.f
@@ -5,6 +5,7 @@ SUBROUTINE PDFWRAP
 C
 INCLUDE 'pdf.inc'
 INCLUDE '../alfas.inc'
+ INCLUDE '../vector.inc'
 INCLUDE '../coupl.inc'
 REAL*8 ZMASS
 DATA ZMASS/91.188D0/
diff --git a/epochX/cudacpp/gg_ttg.mad/Source/make_opts b/epochX/cudacpp/gg_ttg.mad/Source/make_opts
index bd3c24228d..e4b87ee6ad 100644
--- a/epochX/cudacpp/gg_ttg.mad/Source/make_opts
+++ b/epochX/cudacpp/gg_ttg.mad/Source/make_opts
@@ -1,17 +1,12 @@
-pdlabel1=
-pdlabel2=
-lhapdf=
-PYTHIA8_PATH=NotInstalled
-MG5AMC_VERSION=3.5.0_lo_vect
-GLOBAL_FLAG=-O3 -ffast-math -fbounds-check
-ALOHA_FLAG=
-MATRIX_FLAG=
 DEFAULT_CPP_COMPILER=g++
+DEFAULT_F2PY_COMPILER=f2py3
+DEFAULT_F_COMPILER=gfortran
+GLOBAL_FLAG=-O3 -ffast-math -fbounds-check
 MACFLAG=
-STDLIB=-lstdc++
+MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime
+PYTHIA8_PATH=NotInstalled
 STDLIB_FLAG=
-DEFAULT_F_COMPILER=gfortran
-DEFAULT_F2PY_COMPILER=f2py3
+STDLIB=-lstdc++
 #end_of_make_opts_variables
 
 BIASLIBDIR=../../../lib/
diff --git a/epochX/cudacpp/gg_ttg.mad/Source/makefile b/epochX/cudacpp/gg_ttg.mad/Source/makefile
index dbe08b846e..00c73099a0 100644
--- a/epochX/cudacpp/gg_ttg.mad/Source/makefile
+++ b/epochX/cudacpp/gg_ttg.mad/Source/makefile
@@ -136,5 +136,7 @@ cleanSource:
 clean: cleanSource
 	for i in `ls -d ../SubProcesses/P*`; do cd $$i; make clean; cd -; done;
 
-cleanall: cleanSource
+cleanavx:
+	for i in `ls -d ../SubProcesses/P*`; do cd $$i; make cleanavxs; cd -; done;
+cleanall: cleanSource # THIS IS THE ONE for i in `ls -d ../SubProcesses/P*`; do cd $$i; make cleanavxs; cd -; done; diff --git a/epochX/cudacpp/gg_ttg.mad/Source/param_card.inc b/epochX/cudacpp/gg_ttg.mad/Source/param_card.inc index 1fcfce55bb..081365c16b 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/param_card.inc +++ b/epochX/cudacpp/gg_ttg.mad/Source/param_card.inc @@ -1,15 +1,15 @@ - MDL_WZ = 2.441404D+00 - MDL_WW = 2.047600D+00 - MDL_WH = 6.382339D-03 - MDL_WT = 1.491500D+00 + MDL_MB = 4.700000D+00 + MDL_MT = 1.730000D+02 MDL_MTA = 1.777000D+00 MDL_MZ = 9.118800D+01 MDL_MH = 1.250000D+02 - MDL_MB = 4.700000D+00 - MDL_MT = 1.730000D+02 AEWM1 = 1.325070D+02 MDL_GF = 1.166390D-05 AS = 1.180000D-01 - MDL_YMTAU = 1.777000D+00 MDL_YMB = 4.700000D+00 MDL_YMT = 1.730000D+02 + MDL_YMTAU = 1.777000D+00 + MDL_WT = 1.491500D+00 + MDL_WZ = 2.441404D+00 + MDL_WW = 2.047600D+00 + MDL_WH = 6.382339D-03 diff --git a/epochX/cudacpp/gg_ttg.mad/Source/vector.inc b/epochX/cudacpp/gg_ttg.mad/Source/vector.inc index 92254c0f2a..863eebbc70 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/vector.inc +++ b/epochX/cudacpp/gg_ttg.mad/Source/vector.inc @@ -28,5 +28,4 @@ C BECAUSE IT DOES NOT GO THROUGH THE CPP PREPROCESSOR C (see https://github.com/madgraph5/madgraph4gpu/issues/458). C INTEGER VECSIZE_MEMMAX - PARAMETER (VECSIZE_MEMMAX=16384) ! NB: 16k events per GPU grid is the minimum required to fill a V100 GPU -c PARAMETER (VECSIZE_MEMMAX=32) ! NB: workaround for out-of-memory on Juwels: 32 is enough for no-CUDA builds (issue #498) + PARAMETER (VECSIZE_MEMMAX=16384) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h index f37c972b24..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -244,14 +245,21 @@ namespace mg5amcCpu } std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL - process.initProc( "../../Cards/param_card.dat" ); + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
+ CPPProcess process( /*verbose=*/false ); + std::string paramCard = "../../Cards/param_card.dat"; + if( !std::filesystem::exists( paramCard ) ) + { + paramCard = "../" + paramCard; + } + process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h index 9c467b1e04..6a7d9c05c0 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h @@ -39,6 +39,8 @@ #elif defined __HIPCC__ +#include "hip/hip_runtime.h" + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h index 176338151a..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -215,19 +216,16 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) #endif constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) // Dump events to a new reference file? - constexpr bool dumpEvents = false; - std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; - while( dumpFileName.find( '/' ) != std::string::npos ) - { - dumpFileName.replace( dumpFileName.find( '/' ), 1, "_" ); - } + const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); + const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); + const std::string refFileName = testDriver->getRefFileName(); + const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); std::ofstream dumpFile; if( dumpEvents ) { dumpFile.open( dumpFileName, std::ios::trunc ); } // Read reference data - const std::string refFileName = testDriver->getRefFileName(); std::map referenceData; if( !dumpEvents ) { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc @@ -112,10 +112,17 @@ namespace mg5amcCpu // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu bool ok = true; // this is just an assumption! const std::string tag = "arm neon (128bit as in SSE4.2)"; -#else +#elif defined( __x86_64__ ) || defined( __i386__ ) bool known = true; bool ok = __builtin_cpu_supports( "sse4.2" ); const std::string tag = "nehalem (SSE4.2)"; +#else // AV FIXME! 
Added by OM for Mac, should identify the correct __xxx__ flag that should be targeted + bool known = false; // __builtin_cpu_supports is not supported + // See https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions-Available-on-all-Configurations.html + // See https://stackoverflow.com/q/62783908 + // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu + bool ok = true; // this is just an assumption! + const std::string tag = "arm neon (128bit as in SSE4.2)"; #endif #else bool known = true; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index ecd334bb0f..0e4d5d1157 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -249,11 +249,11 @@ namespace mg5amcCpu vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 0., 0., w_fp[5] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 0., 0., w_fp[6] ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); // Amplitude(s) for diagram number 1 - VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -266,10 +266,10 @@ namespace mg5amcCpu // *** DIAGRAM 2 OF 16 *** // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[7] ); + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -280,10 +280,10 @@ namespace mg5amcCpu // *** DIAGRAM 3 OF 16 *** // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[8] ); + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -294,11 +294,11 @@ namespace mg5amcCpu // *** DIAGRAM 4 OF 16 *** // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[5] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[9] ); + FFV1_1( w_fp[2], w_fp[0], 
COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -308,10 +308,10 @@ namespace mg5amcCpu // *** DIAGRAM 5 OF 16 *** // Wavefunction(s) for diagram number 5 - VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 0., 0., w_fp[10] ); + VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[10] ); // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -325,7 +325,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -335,11 +335,11 @@ namespace mg5amcCpu // *** DIAGRAM 7 OF 16 *** // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[5] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[11] ); + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -352,7 +352,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -366,7 +366,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -376,10 +376,10 @@ namespace mg5amcCpu // *** DIAGRAM 10 OF 16 *** // Wavefunction(s) for diagram number 10 - VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 0., 0., w_fp[5] ); + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[5] ); // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -393,7 +393,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -407,7 +407,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 12 - VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -423,7 +423,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -436,7 +436,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -449,7 +449,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 15 - VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -462,22 +462,22 @@ namespace mg5amcCpu // *** DIAGRAM 16 OF 16 *** // Wavefunction(s) for diagram number 16 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[10] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[6] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[9] ); + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[10] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[6] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[9] ); // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[3], w_fp[2], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; jamp_sv[3] -= amp_sv[0]; jamp_sv[5] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[2], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); jamp_sv[1] -= amp_sv[0]; jamp_sv[2] += amp_sv[0]; jamp_sv[3] -= amp_sv[0]; jamp_sv[4] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[2], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] -= amp_sv[0]; jamp_sv[2] += amp_sv[0]; jamp_sv[4] += amp_sv[0]; @@ -867,12 +867,12 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - fptype allMEsLast = 0; + { /* clang-format on */ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid - allMEs[ievt] = 0; for( int ihel = 0; ihel < ncomb; ihel++ ) { + // NEW IMPLEMENTATION OF GETGOODHEL (#630): RESET THE RUNNING SUM OVER HELICITIES TO 0 BEFORE ADDING A NEW HELICITY + allMEs[ievt] = 0; // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the 
given event(s) constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -881,12 +881,11 @@ namespace mg5amcCpu #else calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); #endif - if( allMEs[ievt] != allMEsLast ) + if( allMEs[ievt] != 0 ) // NEW IMPLEMENTATION OF GETGOODHEL (#630): COMPARE EACH HELICITY CONTRIBUTION TO 0 { //if ( !isGoodHel[ihel] ) std::cout << "sigmaKin_getGoodHel ihel=" << ihel << " TRUE" << std::endl; isGoodHel[ihel] = true; } - allMEsLast = allMEs[ievt]; // running sum up to helicity ihel for event ievt } } #else @@ -905,19 +904,11 @@ namespace mg5amcCpu //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] // Allocate arrays at build time to contain at least 16 events (or at least neppV events if neppV>16, e.g. in future VPUs) constexpr int maxtry0 = std::max( 16, neppV ); // 16, but at least neppV (otherwise the npagV loop does not even start) - fptype allMEsLast[maxtry0] = { 0 }; // allocated at build time: maxtry0 must be a constexpr // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt1 was last observed for "mirror processes" in uux_ttx in the 270 branch (see issue #343 and PRs #360 and #396) + // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; - static_assert( nprocesses == 1, "Assume nprocesses == 1" ); - // process_id corresponds to the index of DSIG1 Fortran functions (must be 1 because cudacpp is unable to handle DSIG2) + static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter static_assert( process_id == 1, "Assume process_id == 1" ); } @@ -1111,23 +1107,26 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 - fptype targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + const unsigned int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + fptype targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + 
break; + } } } #endif @@ -1222,57 +1221,60 @@ namespace mg5amcCpu #endif } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 // Event-by-event random choice of color #402 - fptype_sv targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } + const unsigned int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + fptype_sv targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = fptype_sv{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + fptype_sv targetamp2[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp2[icolC] = fptype_sv{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + } +#endif + for( int ieppV = 0; ieppV < neppV; ++ieppV ) + { + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { #if defined MGONGPU_CPPSIMD - const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); #endif - if( okcol ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( okcol ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + const int ievt2 = ievt00 + ieppV + neppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + { + allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } 
#endif
+ }
 }
 #endif // multichannel enabled (random color choice)
 }
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h
index 2731db9bfd..11f562273e 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f
index d528b1d2f0..dd4cd3a0c2 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f
@@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP
 DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE)
 C ****************************************************
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f
index 668cc26192..e28575ead8 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f
@@ -1,7 +1,7 @@
 DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE)
 C ****************************************************
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -39,6 +39,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE)
 C LOCAL VARIABLES
 C
 INTEGER I,ITYPE,LP,IPROC
+ DOUBLE PRECISION QSCALE
 DOUBLE PRECISION G1
 DOUBLE PRECISION G2
 DOUBLE PRECISION XPQ(-7:7),PD(0:MAXPROC)
@@ -126,11 +127,24 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE)
 IF (ABS(LPP(IB(1))).GE.1) THEN
 !LP=SIGN(1,LPP(IB(1)))
- G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)),DSQRT(Q2FACT(IB(1))))
+ IF (DSQRT(Q2FACT(IB(1))).EQ.0D0) THEN
+ QSCALE=0D0
+ DO I=3,NEXTERNAL
+ QSCALE=QSCALE+DSQRT(MAX(0D0,(PP(0,I)+PP(3,I))*(PP(0,I)
+ $ -PP(3,I))))
+ ENDDO
+ QSCALE=QSCALE/2D0
+ ELSE
+ QSCALE=DSQRT(Q2FACT(IB(1)))
+ ENDIF
+ G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE)
 ENDIF
 IF (ABS(LPP(IB(2))).GE.1) THEN
 !LP=SIGN(1,LPP(IB(2)))
- G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)),DSQRT(Q2FACT(IB(2))))
+ IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN
+ QSCALE=DSQRT(Q2FACT(IB(2)))
+ ENDIF
+ G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE)
 ENDIF
 PD(0) = 0D0
 IPROC = 0
@@ -202,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT,
 $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED)
 C ****************************************************
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -249,6 +263,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT,
 C
 C LOCAL VARIABLES
 C
+ DOUBLE PRECISION QSCALE
 INTEGER I,ITYPE,LP,IPROC
 DOUBLE PRECISION G1(VECSIZE_MEMMAX)
 DOUBLE PRECISION G2(VECSIZE_MEMMAX)
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc
index 1bad694d1c..7cac5ab47b 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc
@@ -29,7 +29,9 @@
 #include
 #include
+#include <cfenv> // for feenableexcept
 #include
+#include <csignal> // for signal and SIGFPE
 #include
 #include
 #include
@@ -74,6 +76,23 @@ usage( char* argv0, int ret = 1 )
 return ret;
 }
 
+#ifdef __CUDACC__
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+ inline void FPEhandler( int sig )
+ {
+#ifdef __CUDACC__
+ std::cerr << "Floating Point Exception (GPU)" << std::endl;
+#else
+ std::cerr << "Floating Point Exception (CPU)" << std::endl;
+#endif
+ exit( 0 );
+ }
+}
+
 int
 main( int argc, char** argv )
 {
@@ -84,6 +103,18 @@ main( int argc, char** argv )
 using namespace mg5amcCpu;
 #endif
 
+ // Enable FPEs (test #701 and #733 - except on MacOS where feenableexcept is not defined #730)
+#ifndef __APPLE__
+ const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" );
+ const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" );
+ if( enableFPE )
+ {
+ std::cout << "WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions" << std::endl;
+ feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701
+ signal( SIGFPE, FPEhandler );
+ }
+#endif
+
 // DEFAULTS FOR COMMAND LINE ARGUMENTS
 bool verbose = false;
 bool debug = false;
@@ -103,12 +134,14 @@ main( int argc, char** argv )
 CurandHost = 1,
 CurandDevice = 2
 };
-#ifdef __CUDACC__
- RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU
-#elif not defined MGONGPU_HAS_NO_CURAND
- RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
+#ifdef MGONGPU_HAS_NO_CURAND
+ RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785)
+#elif defined __HIPCC__
+#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random
+#elif defined __CUDACC__
+ RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand
 #else
- RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU
+ RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
 #endif
 
 // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
enum class RamboSamplingMode @@ -146,18 +179,20 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef __CUDACC__ - rndgen = RandomNumberMode::CurandDevice; +#ifndef __CUDACC__ + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); +#elif defined MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + rndgen = RandomNumberMode::CurandDevice; #endif } else if( arg == "--curhst" ) { -#ifndef MGONGPU_HAS_NO_CURAND - rndgen = RandomNumberMode::CurandHost; -#else +#ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); +#else + rndgen = RandomNumberMode::CurandHost; #endif } else if( arg == "--common" ) @@ -278,10 +313,10 @@ main( int argc, char** argv ) const std::string procKey = "0a ProcInit"; timermap.start( procKey ); - // Create a process object + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? (for instance, in bridge mode this will be called twice here?) CPPProcess process( verbose ); - - // Read param_card and set parameters process.initProc( "../../Cards/param_card.dat" ); const fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) //const fptype energy = 91.2; // Ecms = 91.2 GeV (Z peak) @@ -389,30 +424,26 @@ main( int argc, char** argv ) { prnk.reset( new CommonRandomNumberKernel( hstRndmom ) ); } -#ifndef MGONGPU_HAS_NO_CURAND else if( rndgen == RandomNumberMode::CurandHost ) { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! CurandHost is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#else const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif } -#ifdef __CUDACC__ else { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __CUDACC__ const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); - } #else - else - { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) - } + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif -#else - else - { - throw std::logic_error( "This application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) } -#endif // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -747,7 +778,7 @@ main( int argc, char** argv ) wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? 
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -757,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/counters.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/counters.cc index 71fa817036..3bbdec9387 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/counters.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/counters.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "timer.h" #define TIMERTYPE std::chrono::high_resolution_clock @@ -36,13 +36,10 @@ extern "C" static mgOnGpu::Timer program_timer; static float program_totaltime = 0; - static mgOnGpu::Timer matrix1_timer; - static float matrix1_totaltime = 0; static mgOnGpu::Timer smatrix1_timer; static float smatrix1_totaltime = 0; static mgOnGpu::Timer smatrix1multi_timer[nimplC]; static float smatrix1multi_totaltime[nimplC] = { 0 }; - static int matrix1_counter = 0; static int smatrix1_counter = 0; static int smatrix1multi_counter[nimplC] = { 0 }; @@ -52,19 +49,6 @@ extern "C" return; } - void counters_matrix1_start_() - { - matrix1_counter++; - matrix1_timer.Start(); - return; - } - - void counters_matrix1_stop_() - { - matrix1_totaltime += matrix1_timer.GetDuration(); - return; - } - void counters_smatrix1_start_() { smatrix1_counter++; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f index 520966d7b7..a885b7fde3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -317,7 +317,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -359,7 +359,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C LOCAL VARIABLES C INTEGER I,J,M,N - COMPLEX*16 ZTEMP, TMP_JAMP(10) + COMPLEX*16 ZTEMP, TMP_JAMP(9) REAL*8 CF(NCOLOR,NCOLOR) COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) @@ -434,7 +434,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C ---------- C BEGIN CODE C ---------- - call counters_matrix1_start() IF (FIRST) THEN FIRST=.FALSE. 
IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO @@ -509,33 +508,30 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) TMP_JAMP(3) = AMP(15) + AMP(16) ! used 4 times TMP_JAMP(2) = AMP(1) + AMP(18) ! used 4 times TMP_JAMP(1) = AMP(12) - AMP(17) ! used 4 times - TMP_JAMP(10) = TMP_JAMP(3) - TMP_JAMP(2) ! used 2 times - TMP_JAMP(9) = TMP_JAMP(1) + ((-0.000000000000000D+00 + TMP_JAMP(9) = TMP_JAMP(3) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(8) ! used 2 times + TMP_JAMP(8) = TMP_JAMP(3) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(5) ! used 2 times + TMP_JAMP(7) = TMP_JAMP(2) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(2) ! used 2 times + TMP_JAMP(6) = TMP_JAMP(2) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(3) ! used 2 times + TMP_JAMP(5) = TMP_JAMP(1) + ((-0.000000000000000D+00 $ ,1.000000000000000D+00)) * AMP(11) ! used 2 times - TMP_JAMP(8) = TMP_JAMP(2) - TMP_JAMP(1) ! used 2 times - TMP_JAMP(7) = TMP_JAMP(1) + ((0.000000000000000D+00, + TMP_JAMP(4) = TMP_JAMP(1) + ((0.000000000000000D+00, $ -1.000000000000000D+00)) * AMP(10) ! used 2 times - TMP_JAMP(6) = TMP_JAMP(3) - TMP_JAMP(1) ! used 2 times - TMP_JAMP(5) = TMP_JAMP(2) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(3) ! used 2 times - TMP_JAMP(4) = TMP_JAMP(3) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(5) ! used 2 times - JAMP(1,1) = (-1.000000000000000D+00)*AMP(6)+TMP_JAMP(4)+( - $ -1.000000000000000D+00)*TMP_JAMP(5) - JAMP(2,1) = (-1.000000000000000D+00)*AMP(4)+(-1.000000000000000D - $ +00)*TMP_JAMP(4)+TMP_JAMP(9) - JAMP(3,1) = (-1.000000000000000D+00)*AMP(13)+TMP_JAMP(5)+( - $ -1.000000000000000D+00)*TMP_JAMP(7) - JAMP(4,1) = (-1.000000000000000D+00)*AMP(7)+((0.000000000000000D - $ +00,1.000000000000000D+00))*AMP(8)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*AMP(10)+(-1.000000000000000D+00) - $ *TMP_JAMP(6) - JAMP(5,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(2)+((0.000000000000000D+00,-1.000000000000000D+00))*AMP(11) - $ +(-1.000000000000000D+00)*AMP(14)+TMP_JAMP(8) - JAMP(6,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(2)+((0.000000000000000D+00,-1.000000000000000D+00))*AMP(8) - $ +(-1.000000000000000D+00)*AMP(9)+TMP_JAMP(10) + JAMP(1,1) = (-1.000000000000000D+00)*AMP(6)+(-1.000000000000000D + $ +00)*TMP_JAMP(6)+TMP_JAMP(8) + JAMP(2,1) = (-1.000000000000000D+00)*AMP(4)+TMP_JAMP(5)+( + $ -1.000000000000000D+00)*TMP_JAMP(8) + JAMP(3,1) = (-1.000000000000000D+00)*AMP(13)+( + $ -1.000000000000000D+00)*TMP_JAMP(4)+TMP_JAMP(6) + JAMP(4,1) = (-1.000000000000000D+00)*AMP(7)+TMP_JAMP(4)+( + $ -1.000000000000000D+00)*TMP_JAMP(9) + JAMP(5,1) = (-1.000000000000000D+00)*AMP(14)+( + $ -1.000000000000000D+00)*TMP_JAMP(5)+TMP_JAMP(7) + JAMP(6,1) = (-1.000000000000000D+00)*AMP(9)+(-1.000000000000000D + $ +00)*TMP_JAMP(7)+TMP_JAMP(9) IF(INIT_MODE)THEN DO I=1, NGRAPHS @@ -588,7 +584,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO - call counters_matrix1_stop() END SUBROUTINE PRINT_ZERO_AMP_1() diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index 59a2c906eb..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -4,10 +4,13 @@ # Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
-#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories
+#=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts
+#=== NB: use 'override' to ensure that the value can not be modified from the outside
+override CUDACPP_MAKEFILE := $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST))
+###$(info CUDACPP_MAKEFILE='$(CUDACPP_MAKEFILE)')

-CUDACPP_MAKEFILE = $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST))
-CUDACPP_SRC_MAKEFILE = cudacpp_src.mk
+#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories
+override CUDACPP_SRC_MAKEFILE = cudacpp_src.mk

 #-------------------------------------------------------------------------------

@@ -29,7 +32,17 @@ UNAME_P := $(shell uname -p)

 #-------------------------------------------------------------------------------

-#=== Configure common compiler flags for C++ and CUDA
+#=== Include the common MG5aMC Makefile options
+
+# OM: this is crucial for MG5aMC flag consistency/documentation
+# AV: temporarily comment this out because it breaks cudacpp builds
+ifneq ($(wildcard ../../Source/make_opts),)
+include ../../Source/make_opts
+endif
+
+#-------------------------------------------------------------------------------
+
+#=== Configure common compiler flags for C++ and CUDA/HIP

 INCFLAGS = -I.
 OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
@@ -101,68 +114,85 @@ endif
 # Note: AR, CXX and FC are implicitly defined if not set externally
 # See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html

-#-------------------------------------------------------------------------------
-
-CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler")
-HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler")
-
-ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc)
-  #=== Configure the CUDA compiler
-
-  # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505)
-  # This is because it is impossible to pass this to "GPUFLAGS += -ccbin <host-compiler>" below
-  ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <host-compiler>" from outside
-    $(warning CUDA builds are not supported for multi-word CXX "$(CXX)")
-    override CUDA_HOME=disabled
-  endif
+# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked"
+ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),)
+CXXFLAGS += -mmacosx-version-min=11.3
+endif

-  # If CUDA_HOME is not set, try to set it from the location of nvcc
-  ifndef CUDA_HOME
-    CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
-    $(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
-  endif
+#-------------------------------------------------------------------------------

-  # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists
-  ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
-    GPUCC = $(CUDA_HOME)/bin/nvcc
-    USE_NVTX ?=-DUSE_NVTX
-    # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
-    # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
-    # Default: use
compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - CUDATESTFLAGS = -lcuda - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
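# As a sketch of what the detection logic described above means in practice
# (hypothetical paths; CUDA_HOME, HIP_HOME and GPUCC are the variables used in
# this makefile), the backend can be steered entirely from the command line:
#   make CUDA_HOME=/usr/local/cuda              # use a specific CUDA installation (GPUCC=nvcc)
#   make CUDA_HOME=none HIP_HOME=/opt/rocm      # any invalid CUDA_HOME disables CUDA, so HIP is picked up
#   make CUDA_HOME=none HIP_HOME=none           # disable both: C++-only build with common random numbers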
+
+# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505)
+# This is because it is impossible to pass this to "GPUFLAGS += -ccbin <host-compiler>" below
+ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <host-compiler>" from outside
+  $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)")
+  override CUDA_HOME=disabled
+  override HIP_HOME=disabled
+endif
+
+# If CUDA_HOME is not set, try to set it from the path to nvcc
+ifndef CUDA_HOME
+  CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
+  $(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
+endif
+
+# If HIP_HOME is not set, try to set it from the path to hipcc
+ifndef HIP_HOME
+  HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null))
+  $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+endif
+
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP),
+# builds are performed for HIP only if CUDA is not found in the path.
+# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME.
+# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+#--- Option 1: CUDA exists -> use CUDA
+
+# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists
+ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
+
+  GPUCC = $(CUDA_HOME)/bin/nvcc
+  USE_NVTX ?=-DUSE_NVTX
+  # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
+  # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+  # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster).
+  # Embed device code for 70, and PTX for 70+.
+  # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533).
+  # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity).
+  MADGRAPH_CUDA_ARCHITECTURE ?= 70
+  ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533
+  ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533
+  comma:=,
+  CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch))
+  CUINC = -I$(CUDA_HOME)/include/
+  ifeq ($(RNDGEN),hasNoCurand)
+    CURANDLIBFLAGS=
  else
-    # No cuda. Switch cuda compilation off and go to common random numbers in C++
-    $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
-    override GPUCC=
-    override USE_NVTX=
-    override CUINC=
-    override CURANDLIBFLAGS=
+    CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif
+  CUOPTFLAGS = -lineinfo
+  ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
+  GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
+  ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+  ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
+  GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h
+  # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
+  ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+  ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+  ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+  ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+  CUBUILDRULEFLAGS = -Xcompiler -fPIC -c
+  CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu
+  CUDATESTFLAGS = -lcuda

   # Set the host C++ compiler for GPUCC via "-ccbin <host-compiler>"
   # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
@@ -173,71 +203,55 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc)
     GPUFLAGS += -allow-unsupported-compiler
   endif

-else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc)
-  #=== Configure the HIP compiler
+else ifneq ($(origin REQUIRE_CUDA),undefined)

-  # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505)
-  # This is because it is impossible to pass this to "GPUFLAGS += -ccbin <host-compiler>" below
-  ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <host-compiler>" from outside
-    $(warning HIP builds are not supported for multi-word CXX "$(CXX)")
-    override HIP_HOME=disabled
-  endif
+  # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g.
for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif +#--- Option 2: CUDA does not exist, HIP exists -> use HIP - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + +else + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) export GPUCC export GPUFLAGS #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -254,7 +268,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -270,7 +284,7 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) GPUFLAGS+= -Xcompiler -mno-float128 endif @@ -285,12 +299,14 @@ override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) -else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) -override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) # AV for Mac (Apple clang compiler) +else ifeq ($(UNAME_S),Darwin) # OM for Mac (any compiler) +override OMPFLAGS = # AV disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = -fopenmp # OM reenable OpenMP MT on Apple clang? 
(AV Oct 2023: this still fails in the CI) else -override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT (default before #575) +override OMPFLAGS = -fopenmp # enable OpenMP MT by default on all other platforms +###override OMPFLAGS = # disable OpenMP MT on all other platforms (default before #575) endif # Set the default AVX (vectorization) choice @@ -356,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -573,8 +589,9 @@ $(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) # Apply special build flags only to check_sa and CurandRandomNumberKernel (curand headers, #679) $(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gcheck_sa.o: CUFLAGS += $(CXXFLAGSCURAND) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gCurandRandomNumberKernel.o: CUFLAGS += $(CXXFLAGSCURAND) ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif @@ -772,12 +789,18 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif +# Use target gtestlibs to build only googletest +ifneq ($(GTESTLIBS),) +gtestlibs: $(GTESTLIBS) +endif + # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 $(GTESTLIBS): ifneq ($(shell which flock 2>/dev/null),) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi flock $(BUILDDIR)/.make_test.lock $(MAKE) -C $(TESTDIR) else - $(MAKE) -C $(TESTDIR) + if [ -d $(TESTDIR) ]; then $(MAKE) -C $(TESTDIR); fi endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/dummy_fct.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/dummy_fct.f index 076cf29d67..4f7a204b8f 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/dummy_fct.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/dummy_fct.f @@ -32,7 +32,7 @@ logical FUNCTION dummy_cuts(P) LOGICAL IS_A_NU(NEXTERNAL),IS_HEAVY(NEXTERNAL) logical do_cuts(nexternal) COMMON /TO_SPECISA/IS_A_J,IS_A_A,IS_A_L,IS_A_B,IS_A_NU,IS_HEAVY, - . IS_A_ONIUM, do_cuts + & IS_A_ONIUM, do_cuts dummy_cuts=.true. 
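C     The hunk above swaps the unusual '.' continuation marker for the
C     conventional '&': in fixed-form Fortran any character other than
C     blank or '0' in column 6 continues the previous statement, so both
C     are legal, but '&' is what most compilers and tools expect. A
C     minimal sketch of the idiom (illustrative declarations only, not
C     part of this patch):
      INTEGER NEXTERNAL
      PARAMETER (NEXTERNAL=5)
      LOGICAL IS_A_J(NEXTERNAL), IS_A_ONIUM
      LOGICAL DO_CUTS(NEXTERNAL)
      COMMON /TO_SPECISA/IS_A_J, IS_A_ONIUM,
     & DO_CUTS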
@@ -118,15 +118,16 @@ double precision function user_dynamical_scale(P)

 C ************************************************************
-C default for the library implementing a dummt bias function
+C default for the library implementing a dummy bias function
 C ************************************************************
       subroutine bias_wgt_custom(p, original_weight, bias_weight)
-          implicit none
+      implicit none
 C
 C Parameters
 C
           include 'nexternal.inc'
-C
+
+C
 C Arguments
 C
           double precision p(0:3, nexternal)
@@ -161,3 +162,4 @@ subroutine bias_wgt_custom(p, original_weight, bias_weight)

       return
       end subroutine bias_wgt_custom
+
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc
index 2b956730d4..22ce3f5115 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc
@@ -49,11 +49,7 @@ extern "C"
 #ifdef MGONGPUCPP_GPUIMPL
     GpuRuntime::setUp();
 #endif
-    // Create a process object, read parm card and set parameters
-    // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
-    // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads?
-    CPPProcess process( /*verbose=*/false );
-    process.initProc( "../../Cards/param_card.dat" );
+    // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor)
     // FIXME: disable OMP in Bridge when called from Fortran
     *ppbridge = new Bridge<FORTRANFPTYPE>( *pnevtF, *pnparF, *pnp4F );
   }
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f
index fe9c61504b..c00e33d954 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f
@@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config)
         d1 = iforest(1, -i, config)
         d2 = iforest(2, -i, config)
         do j=0,3
-          if (d1.gt.0.and.d1.le.2) then
+          if (d1.gt.0.and.d1.le.nincoming) then
             ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1)
           else
             ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1)
           endif
-          if (d2.gt.0.and.d2.le.2) then
+          if (d2.gt.0.and.d2.le.nincoming) then
             ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2)
           else
             ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2)
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile
index 74db44d848..d572486c2e 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile
@@ -9,6 +9,12 @@ FFLAGS+= -cpp
 # Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740)
 CXXFLAGS = -O3 -Wall -Wshadow -Wextra

+# Add -std=c++17 explicitly to avoid build errors on macOS
+# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked"
+ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),)
+CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3
+endif
+
 # Enable ccache if USECCACHE=1
 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
 override CXX:=ccache $(CXX)
@@ -51,7 +57,7 @@ CUDACPP_MAKEFILE=cudacpp.mk
 CUDACPP_MAKEENV:=$(shell echo '$(.VARIABLES)' | tr " " "\n" | egrep "(USEBUILDDIR|AVX|FPTYPE|HELINL|HRDCOD)")
 ###$(info CUDACPP_MAKEENV=$(CUDACPP_MAKEENV))
 ###$(info $(foreach v,$(CUDACPP_MAKEENV),$(v)="$($(v))"))
-CUDACPP_BUILDDIR:=$(shell $(MAKE) $(foreach v,$(CUDACPP_MAKEENV),$(v)="$($(v))") -f $(CUDACPP_MAKEFILE) -pn |& awk '/Building/{print $$3}' | sed
s/BUILDDIR=//) +CUDACPP_BUILDDIR:=$(shell $(MAKE) $(foreach v,$(CUDACPP_MAKEENV),$(v)="$($(v))") -f $(CUDACPP_MAKEFILE) -pn 2>&1 | awk '/Building/{print $$3}' | sed s/BUILDDIR=//) ifeq ($(CUDACPP_BUILDDIR),) $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) else @@ -89,7 +95,12 @@ SYMMETRY = symmetry.o idenparts.o # Binaries -LDFLAGS+=-Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 +ifeq ($(UNAME),Darwin) +LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) +LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" +else +LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) +endif all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp # also builds $(PROG)_cuda if $(CUDACPP_CULIB) exists (#503) @@ -100,8 +111,8 @@ LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' else ifneq ($(shell $(CXX) --version | egrep '^clang'),) override OMPFLAGS = -fopenmp $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###override OMPFLAGS = -fopenmp # OMP is not supported yet by cudacpp for Apple clang +else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang else override OMPFLAGS = -fopenmp endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc index 0ed26180ca..de327f2321 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc @@ -71,6 +71,8 @@ struct CPUTest : public CUDA_CPU_TestBase , hstSelCol( nevt ) , hstIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? process.initProc( "../../Cards/param_card.dat" ); } @@ -183,6 +185,8 @@ struct CUDATest : public CUDA_CPU_TestBase , devSelCol( nevt ) , devIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? 
process.initProc( "../../Cards/param_card.dat" ); } diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc index 016bc0f472..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc @@ -59,7 +59,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) using namespace mg5amcCpu; #endif #ifndef __APPLE__ // test #701 (except on MacOS where feenableexcept is not defined #730) - const bool enableFPE = !getenv( "CUDACPP_RUNTIME_DISABLEFPE" ); + const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" ); + const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" ); if( enableFPE ) { feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 diff --git a/epochX/cudacpp/gg_ttg.mad/bin/generate_events b/epochX/cudacpp/gg_ttg.mad/bin/generate_events index 107313b25d..5577cc66a0 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/generate_events +++ b/epochX/cudacpp/gg_ttg.mad/bin/generate_events @@ -46,7 +46,7 @@ if __debug__ and (not os.path.exists(pjoin(root_path,'../..', 'bin','create_rele sys.path.append(pjoin(root_path,'bin','internal')) import madevent_interface as ME - +import misc as misc import logging import logging.config @@ -160,17 +160,31 @@ if '__main__' == __name__: # Check that python version is valid set_configuration() - argument = sys.argv + argument = sys.argv + + # check for plugin customization of the launch command + launch_interface = ME.MadEventCmdShell + if os.path.exists(pjoin(root_path, 'bin','internal', 'launch_plugin.py')): + with misc.TMP_variable(sys, 'path', sys.path + [pjoin(root_path, 'bin', 'internal')]): + from importlib import reload + try: + reload('launch_plugin') + except Exception as error: + import launch_plugin + launch_interface = launch_plugin.MEINTERFACE + + + try: if '-h' in argument or '--help' in argument: - launch = ME.MadEventCmdShell(me_dir=root_path, force_run=True) + launch = launch_interface(me_dir=root_path, force_run=True) launch.exec_cmd('help generate_events') sys.exit() elif len(argument) > 1 and argument[1] in ['0', '1', '2']: argument = treat_old_argument(argument) with ME.MadEventCmdShell.RunWebHandling(root_path, ): - launch = ME.MadEventCmdShell(me_dir=root_path, force_run=True) + launch = launch_interface(me_dir=root_path, force_run=True) launch.run_cmd('generate_events %s' % ' '.join(argument[1:])) launch.run_cmd('quit') except ME.MadEventAlreadyRunning as message: diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/__init__.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py index c1e54d3cb9..bd1517985f 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py @@ -537,7 +537,7 @@ def charge_card(self, tag): self.param_card = param_card_reader.ParamCard(param_card) return self.param_card elif tag == 'mgruncard': - self.run_card = RunCard(self[tag]) + self.run_card = RunCard(self[tag], unknown_warning=False) return 
self.run_card elif tag == 'mg5proccard': proc_card = self[tag].split('\n') @@ -1002,14 +1002,18 @@ def __init__(self, finput=None, **opt): self.allowed_value = {} self.default_setup() + self.plugin_input(finput) # if input is define read that input if isinstance(finput, (file, str, StringIO.StringIO)): self.read(finput, **opt) + + def plugin_input(self, finput=None): + pass def default_setup(self): @@ -2621,7 +2625,28 @@ class RunCard(ConfigFile): default_include_file = 'run_card.inc' default_autodef_file = 'run.inc' donewarning = [] + include_as_parameter = [] + + def plugin_input(self, finput): + if not finput and not MADEVENT: + return + curr_dir = None + if isinstance(finput, file): + # expected path to be like "XXXX/Cards/run_card.dat" + curr_dir = os.path.dirname(os.path.dirname(finput.name)) + elif isinstance(finput, str): + curr_dir = os.path.dirname(os.path.dirname(finput)) + + if curr_dir: + if os.path.exists(pjoin(curr_dir, 'bin', 'internal', 'plugin_run_card')): + # expected format {} passing everything as optional argument + for line in open(pjoin(curr_dir, 'bin', 'internal', 'plugin_run_card')): + if line.startswith('#'): + continue + opts = dict(eval(line)) + self.add_param(**opts) + @classmethod def fill_post_set_from_blocks(cls): """set the post_set function for any parameter defined in a run_block""" @@ -2647,18 +2672,48 @@ def __new__(cls, finput=None, **opt): elif isinstance(finput, cls): target_class = finput.__class__ elif isinstance(finput, str): + path = finput if '\n' not in finput: finput = open(finput).read() if 'req_acc_FO' in finput: target_class = RunCardNLO else: target_class = RunCardLO + if MADEVENT and os.path.exists(pjoin(MEDIR, 'bin','internal', 'launch_plugin.py')): + with misc.TMP_variable(sys, 'path', sys.path + [pjoin(MEDIR, 'bin', 'internal')]): + from importlib import reload + try: + reload('launch_plugin') + except Exception as error: + import launch_plugin + target_class = launch_plugin.RunCard + elif not MADEVENT: + if 'run_card.dat' in path: + launch_plugin_path = path.replace('run_card.dat', '../bin/internal/launch_plugin.py') + elif 'run_card_default.dat' in path: + launch_plugin_path = path.replace('run_card_default.dat', '../bin/internal/launch_plugin.py') + else: + launch_plugin_path = None + if launch_plugin_path and os.path.exists(launch_plugin_path): + misc.sprint('try to use plugin class', path.replace('run_card.dat', '../bin/internal/launch_plugin.py')) + pydir = os.path.dirname(launch_plugin_path) + with misc.TMP_variable(sys, 'path', sys.path + [pydir]): + from importlib import reload + try: + reload('launch_plugin') + except Exception as error: + import launch_plugin + target_class = launch_plugin.RunCard + elif issubclass(finput, RunCard): + target_class = finput else: return None target_class.fill_post_set_from_blocks() - - return super(RunCard, cls).__new__(target_class, finput, **opt) + out = super(RunCard, cls).__new__(target_class, finput, **opt) + if not isinstance(out, RunCard): #should not happen but in presence of missmatch of library loaded. 
+ out.__init__(finput, **opt) + return out else: return super(RunCard, cls).__new__(cls, finput, **opt) @@ -2686,7 +2741,7 @@ def __init__(self, *args, **opts): self.system_default = {} self.display_block = [] # set some block to be displayed - + self.fct_mod = {} # {param: (fct_pointer, *argument, **opts)} self.cut_class = {} self.warned=False @@ -2723,7 +2778,7 @@ def get_lepton_densities(cls): def add_param(self, name, value, fortran_name=None, include=True, hidden=False, legacy=False, cut=False, system=False, sys_default=None, - autodef=False, + autodef=False, fct_mod=None, **opts): """ add a parameter to the card. value is the default value and defines the type (int/float/bool/str) of the input. @@ -2737,6 +2792,7 @@ def add_param(self, name, value, fortran_name=None, include=True, If a path (Source/PDF/pdf.inc) the definition will be added within that file Default is False (does not add the definition) entry added in the run_card will automatically have this on True. + fct_mod: defines a function to run if the parameter is modify in the include file options of **opts: - allowed: list of valid options. '*' means anything else should be allowed. empty list means anything possible as well. @@ -2761,15 +2817,22 @@ def add_param(self, name, value, fortran_name=None, include=True, if autodef: self.definition_path[autodef].append(name) self.user_set.add(name) + # function to trigger if a value is modified in the include file + # main target is action to force correct recompilation (like for compilation flag/...) + if fct_mod: + self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): + self.path = finput finput = open(finput) else: raise Exception("No such file %s" % finput) @@ -2784,7 +2847,7 @@ def read(self, finput, consistency=True): name = name.lower().strip() if name not in self: #looks like an entry added by a user -> add it nicely - self.add_unknown_entry(name, value) + self.add_unknown_entry(name, value, unknown_warning) else: self.set( name, value, user=True) # parameter not set in the run_card can be set to compatiblity value @@ -2796,7 +2859,7 @@ def read(self, finput, consistency=True): logger.warning(str(error)) else: raise - def add_unknown_entry(self, name, value): + def add_unknown_entry(self, name, value, unknow_warning): """function to add an entry to the run_card when the associated parameter does not exists. This is based on the guess_entry_fromname for the various syntax providing input. This then call add_param accordingly. @@ -2835,7 +2898,7 @@ def add_unknown_entry(self, name, value): raise Exception("dictionary need to have at least one entry") default['dict']['__type__'] = default[self.guess_type_from_value(default_value[0])] - if name not in RunCard.donewarning: + if name not in RunCard.donewarning and unknow_warning: logger.warning("Found unexpected entry in run_card: \"%s\" with value \"%s\".\n"+\ " The type was assigned to %s. 
\n"+\
                           " The definition of that variable will %sbe automatically added to fortran file %s\n"+\
@@ -2873,7 +2936,17 @@ def valid_line(self, line, tmp):
                 return False
         else:
             return True
-
+
+
+    def reset_simd(self, old_value, new_value, name, *args, **opts):
+        #return
+        raise Exception('pass in reset simd')
+
+    def make_clean(self,old_value, new_value, name, dir):
+        raise Exception('pass make clean for ', dir)
+
+    def make_Ptouch(self,old_value, new_value, name, reset):
+        raise Exception('pass Ptouch for ', reset)

     def write(self, output_file, template=None, python_template=False,
               write_hidden=False, template_options=None, **opt):
@@ -2898,11 +2971,12 @@ def write(self, output_file, template=None, python_template=False,
         if python_template and not to_write:
             import string
             if self.blocks:
-                text = string.Template(text)
                 mapping = {}
                 for b in self.blocks:
                     mapping[b.name] = b.get_template(self)
-                text = text.substitute(mapping)
+                    if "$%s" % b.name not in text:
+                        text += "\n$%s\n" % b.name
+                text = string.Template(text).substitute(mapping)

             if not self.list_parameter:
                 text = text % self
@@ -3048,6 +3122,77 @@ def write(self, output_file, template=None, python_template=False,
         else:
             output_file.write(text)

+    def get_last_value_include(self, output_dir):
+        """For each parameter in self.fct_mod,
+           parse the associated inc file to get the value of the previous run.
+           We return a dictionary {name: old_value};
+           if the inc file does not exist we return the current value (i.e. assume no change)
+        """
+
+        #remember that
+        # default_include_file is a class variable
+        # self.includepath is on the form include_path : [list of param ]
+        out = {}
+
+        # setup inc_to_parse to be like self.includepath (include_path : [list of param ])
+        # BUT only containing the parameters that need to be tracked for the fct_mod option
+        inc_to_parse = {}
+        for inc_file, params in self.includepath.items():
+            if not inc_file:
+                continue
+            if any(p in params for p in self.fct_mod):
+                inc_to_parse[inc_file] = [name for name in self.includepath[inc_file] if name in self.fct_mod]
+
+        # now loop over the files and ask the associated function
+        for inc_file, params in inc_to_parse.items():
+            if inc_file is True:
+                inc_file = self.default_include_file
+            out.update(self.get_value_from_include(inc_file, params, output_dir))

+        return out
+
+    def get_value_from_include(self, path, list_of_params, output_dir):
+        """for a given include file return the current value of the requested parameter
+           return a dictionary {name: value}
+           if path does not exist return the current value in self for all parameters"""
+
+        #WARNING DOES NOT HANDLE LIST/DICT so far
+
+        # handle case where file is missing
+        if not os.path.exists(pjoin(output_dir,path)):
+            misc.sprint("include file not existing", pjoin(output_dir,path))
+            out = {name: self[name] for name in list_of_params}
+            return out
+
+        with open(pjoin(output_dir,path), 'r') as fsock:
+            text = fsock.read()
+
+        for name in list_of_params:
+            misc.sprint(name, name in self.fortran_name)
+            misc.sprint(self.fortran_name[name] if name in self.fortran_name else name)
+        to_track = [self.fortran_name[name] if name in self.fortran_name else name for name in list_of_params]
+        pattern = re.compile(r"\(?(%(names)s)\s?=\s?([^)]*)\)?"
% {'names':'|'.join(to_track)}, re.I) + out = dict(pattern.findall(text)) + misc.sprint(out) + for name in list_of_params: + if name in self.fortran_name: + value = out[self.fortran_name[name]] + del out[self.fortran_name[name]] + out[name] = value + + for name, value in out.items(): + try: + out[name] = self.format_variable(value, type(self[name])) + except Exception: + continue + + if len(out) != len(list_of_params): + misc.sprint(list_of_params) + misc.sprint(to_track) + misc.sprint(self.fortran_name) + misc.sprint(text) + raise Exception + return out + def get_default(self, name, default=None, log_level=None): """return self[name] if exist otherwise default. log control if we @@ -3338,71 +3483,93 @@ def write_include_file(self, output_dir, output_file=None): #ensusre that system only parameter are correctly set self.update_system_parameter_for_include() + value_in_old_include = self.get_last_value_include(output_dir) + + if output_dir: self.write_autodef(output_dir, output_file=None) # check/fix status of customised functions self.edit_dummy_fct_from_file(self["custom_fcts"], os.path.dirname(output_dir)) for incname in self.includepath: - if incname is True: - pathinc = self.default_include_file - elif incname is False: - continue - else: - pathinc = incname + self.write_one_include_file(output_dir, incname, output_file) + + for name,value in value_in_old_include.items(): + if value != self[name]: + self.fct_mod[name][0](value, self[name], name, *self.fct_mod[name][1],**self.fct_mod[name][2]) - if output_file: - fsock = output_file + def write_one_include_file(self, output_dir, incname, output_file=None): + """write one include file at the time""" + + misc.sprint(incname) + if incname is True: + pathinc = self.default_include_file + elif incname is False: + return + else: + pathinc = incname + + if output_file: + fsock = output_file + else: + fsock = file_writers.FortranWriter(pjoin(output_dir,pathinc+'.tmp')) + + + for key in self.includepath[incname]: + #define the fortran name + if key in self.fortran_name: + fortran_name = self.fortran_name[key] else: - fsock = file_writers.FortranWriter(pjoin(output_dir,pathinc+'.tmp')) - for key in self.includepath[incname]: - #define the fortran name - if key in self.fortran_name: - fortran_name = self.fortran_name[key] + fortran_name = key + + if incname in self.include_as_parameter: + fsock.writelines('INTEGER %s\n' % fortran_name) + #get the value with warning if the user didn't set it + value = self.get_default(key) + if hasattr(self, 'mod_inc_%s' % key): + value = getattr(self, 'mod_inc_%s' % key)(value) + # Special treatment for strings containing a list of + # strings. Convert it to a list of strings + if isinstance(value, list): + # in case of a list, add the length of the list as 0th + # element in fortran. 
Only in case of integer or float + # list (not for bool nor string) + targettype = self.list_parameter[key] + if targettype is bool: + pass + elif targettype is int: + line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(len(value))) + fsock.writelines(line) + elif targettype is float: + line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(float(len(value)))) + fsock.writelines(line) + # output the rest of the list in fortran + for i,v in enumerate(value): + line = '%s(%s) = %s \n' % (fortran_name, i+1, self.f77_formatting(v)) + fsock.writelines(line) + elif isinstance(value, dict): + for fortran_name, onevalue in value.items(): + line = '%s = %s \n' % (fortran_name, self.f77_formatting(onevalue)) + fsock.writelines(line) + elif isinstance(incname,str) and 'compile' in incname: + if incname in self.include_as_parameter: + line = 'PARAMETER (%s=%s)' %( fortran_name, value) else: - fortran_name = key - - #get the value with warning if the user didn't set it - value = self.get_default(key) - if hasattr(self, 'mod_inc_%s' % key): - value = getattr(self, 'mod_inc_%s' % key)(value) - # Special treatment for strings containing a list of - # strings. Convert it to a list of strings - if isinstance(value, list): - # in case of a list, add the length of the list as 0th - # element in fortran. Only in case of integer or float - # list (not for bool nor string) - targettype = self.list_parameter[key] - if targettype is bool: - pass - elif targettype is int: - line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(len(value))) - fsock.writelines(line) - elif targettype is float: - line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(float(len(value)))) - fsock.writelines(line) - # output the rest of the list in fortran - for i,v in enumerate(value): - line = '%s(%s) = %s \n' % (fortran_name, i+1, self.f77_formatting(v)) - fsock.writelines(line) - elif isinstance(value, dict): - for fortran_name, onevalue in value.items(): - line = '%s = %s \n' % (fortran_name, self.f77_formatting(onevalue)) - fsock.writelines(line) - elif isinstance(incname,str) and 'compile' in incname: line = '%s = %s \n' % (fortran_name, value) - fsock.write(line) + fsock.write(line) + else: + if incname in self.include_as_parameter: + line = 'PARAMETER (%s=%s)' %( fortran_name, self.f77_formatting(value)) else: line = '%s = %s \n' % (fortran_name, self.f77_formatting(value)) - fsock.writelines(line) - if not output_file: - fsock.close() - path = pjoin(output_dir,pathinc) - if not os.path.exists(path) or not filecmp.cmp(path, path+'.tmp'): - files.mv(path+'.tmp', path) - else: - os.remove(path+'.tmp') - + fsock.writelines(line) + if not output_file: + fsock.close() + path = pjoin(output_dir,pathinc) + if not os.path.exists(path) or not filecmp.cmp(path, path+'.tmp'): + files.mv(path+'.tmp', path) + else: + os.remove(path+'.tmp') def write_autodef(self, output_dir, output_file=None): """ Add the definition of variable to run.inc if the variable is set with autodef. @@ -3741,13 +3908,14 @@ def remove_all_cut(self): %(tmin_for_channel)s = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact %(survey_splitting)s = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. %(survey_nchannel_per_job)s = survey_nchannel_per_job ! 
control how many Channel are integrated inside a single job on cluster/multicore
-     %(refine_evt_by_job)s = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs)
+     %(refine_evt_by_job)s = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs)
 #*********************************************************************
-# Compilation flag. No automatic re-compilation (need manual "make clean" in Source)
+# Compilation flag.
 #*********************************************************************
     %(global_flag)s = global_flag ! fortran optimization flag use for the all code.
     %(aloha_flag)s  = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math'
     %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
+    %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
 """

     template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"'
@@ -3903,9 +4071,12 @@ class RunCardLO(RunCard):
         "get_dummy_x1_x2": pjoin("SubProcesses","dummy_fct.f"),
         "dummy_boostframe": pjoin("SubProcesses","dummy_fct.f"),
         "user_dynamical_scale": pjoin("SubProcesses","dummy_fct.f"),
+        "bias_wgt_custom": pjoin("SubProcesses","dummy_fct.f"),
         "user_": pjoin("SubProcesses","dummy_fct.f") # all function starting by user will be added to that file
     }

+    include_as_parameter = ['vector.inc']
+
     if MG5DIR:
         default_run_card = pjoin(MG5DIR, "internal", "default_run_card_lo.dat")
@@ -4139,10 +4310,15 @@ def default_setup(self):
         self.add_param('hel_splitamp', True, hidden=True, include=False, comment='decide if amplitude aloha call can be splitted in two or not when doing helicity per helicity optimization.')
         self.add_param('hel_zeroamp', True, hidden=True, include=False, comment='decide if zero amplitude can be removed from the computation when doing helicity per helicity optimization.')
         self.add_param('SDE_strategy', 1, allowed=[1,2], fortran_name="sde_strat", comment="decide how Multi-channel should behaves \"1\" means full single diagram enhanced (hep-ph/0208156), \"2\" use the product of the denominator")
-        self.add_param('global_flag', '-O', include=False, hidden=True, comment='global fortran compilation flag, suggestion -fbound-check')
-        self.add_param('aloha_flag', '', include=False, hidden=True, comment='global fortran compilation flag, suggestion: -ffast-math')
-        self.add_param('matrix_flag', '', include=False, hidden=True, comment='fortran compilation flag for the matrix-element files, suggestion -O3')
-
+        self.add_param('global_flag', '-O', include=False, hidden=True, comment='global fortran compilation flag, suggestion -fbounds-check',
+                       fct_mod=(self.make_clean, ('Source',),{}))
+        self.add_param('aloha_flag', '', include=False, hidden=True, comment='global fortran compilation flag, suggestion: -ffast-math',
+                       fct_mod=(self.make_clean, ('Source/DHELAS',),{}))
+        self.add_param('matrix_flag', '', include=False, hidden=True, comment='fortran compilation flag for the matrix-element files, suggestion -O3',
+                       fct_mod=(self.make_Ptouch, ('matrix',),{}))
+        self.add_param('vector_size', 1, include='vector.inc', hidden=True, comment='lockstep size for parallelism run',
+                       fortran_name='VECSIZE_MEMMAX', fct_mod=(self.reset_simd,(),{}))
+
         # parameter allowing to define simple cut via the pdg
         # Special syntax are related to those.
(can not be edit directly) self.add_param('pt_min_pdg',{'__type__':0.}, include=False, cut=True) @@ -4164,8 +4340,7 @@ def default_setup(self): self.add_param('mxxmin4pdg',[-1.], system=True) self.add_param('mxxpart_antipart', [False], system=True) - # CUDACPP parameters - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + def check_validity(self): """ """ @@ -4704,6 +4879,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5769,7 +5947,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py index fe874a06a4..71089d7480 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py @@ -85,7 +85,7 @@ def load_str(self, text): self.value= ' '.join(data[len(self.lhacode):]) # check that lhacode are the first entry otherwise return invalid param. if ' '.join([str(i) for i in self.lhacode]) != ' '.join(data[:len(self.lhacode)]): - raise InvalidParam + raise InvalidParam("line was %s" % str(data)) else: self.value = data[-1] diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py index 5d0187e3fa..87cb4b88df 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py @@ -749,13 +749,15 @@ def writeRunWeb(me_dir): class RunWebHandling(object): - def __init__(self, me_dir, crashifpresent=True, warnifpresent=True): + def __init__(self, me_dir, crashifpresent=True, warnifpresent=True, force_run=False): """raise error if RunWeb already exists me_dir is the directory where the write RunWeb""" self.remove_run_web = True self.me_dir = me_dir - + if force_run: + self.remove_run_web = False + return if crashifpresent or warnifpresent: if os.path.exists(pjoin(me_dir, 'RunWeb')): pid = open(pjoin(me_dir, 'RunWeb')).read() @@ -4904,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ -6574,7 +6577,7 @@ def reask(self, *args, **opt): fail_due_to_format = 0 #parameter to avoid infinite loop def postcmd(self, stop, line): - if line not in [None, '0', 'done', '']: + if line not in [None, '0', 'done', '',0]: ending_question = cmd.OneLinePathCompletion.postcmd(self,stop,line) else: ending_question = True @@ -6583,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6636,6 
+6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ -6715,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6779,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6907,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' @@ -7533,7 +7546,8 @@ def open_file(self, answer): else: raise if time.time() - start < .5: - self.mother_interface.ask("Are you really that fast? If you are using an editor that returns directly. Please confirm that you have finised to edit the file", 'y') + self.mother_interface.ask("Are you really that fast? If you are using an editor that returns directly. Please confirm that you have finised to edit the file", 'y', + timeout=False) self.reload_card(path) def reload_card(self, path): diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py index a6a8609dce..2f37070580 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py @@ -1108,9 +1108,12 @@ def ask(self, question, default, choices=[], path_msg=None, if alias: choices += list(alias.keys()) + + question_instance = obj(question, allow_arg=choices, default=default, mother_interface=self, **opt) - + if fct_timeout is None: + fct_timeout = lambda x: question_instance.postcmd(x, default) if x and default else False if first_cmd: if isinstance(first_cmd, str): question_instance.onecmd(first_cmd) @@ -2271,6 +2274,9 @@ def postcmd(self, stop, line): if n: self.default(line) return self.postcmd(stop, line) + elif self.value is None and line: + self.default(line) + return self.postcmd(stop, line) if not self.casesensitive: for ans in self.allow_arg: if ans.lower() == self.value.lower(): diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py index 3b8ec31215..5fd170d18d 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py @@ -154,9 +154,18 @@ def get_helicity(self, to_submit=True, clean=True): p = misc.Popen(['./gensym'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, cwd=Pdir) #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts + (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - nb_channel = max([math.floor(float(d)) for d in stdout.split()]) + if stdout: + nb_channel = max([math.floor(float(d)) for d in stdout.split()]) + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + 
files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): @@ -178,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. + # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py index cff8789e38..a6b8582e1a 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py @@ -342,7 +342,12 @@ def next_event(self): text.append(line) if '' in line: - if self.parsing: + if self.parsing == "wgt_only": + out = Event(text, parse_momenta=False) + #if len(out) == 0 and not self.allow_empty_event: + # raise Exception + return out + elif self.parsing: out = Event(text) if len(out) == 0 and not self.allow_empty_event: raise Exception @@ -448,6 +453,8 @@ def unweight(self, outputpath, get_wgt=None, max_wgt=0, trunc_error=0, event_target reweight for that many event with maximal trunc_error. (stop to write event when target is reached) """ + self.parsing = 'wgt_only' + if not get_wgt: def weight(event): return event.wgt @@ -914,6 +921,8 @@ class MultiEventFile(EventFile): The number of events in each file need to be provide in advance (if not provide the file is first read to find that number""" + parsing = True # check if/when we need to parse the event. + def __new__(cls, start_list=[],parse=True): return object.__new__(MultiEventFile) @@ -986,6 +995,7 @@ def next(self): nb_event = random.randint(1, remaining_event) sum_nb=0 for i, obj in enumerate(self.files): + obj.parsing = "wgt_only" sum_nb += self.initial_nb_events[i] - self.curr_nb_events[i] if nb_event <= sum_nb: self.curr_nb_events[i] += 1 @@ -1065,6 +1075,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): # check special case without PDF for one (or both) beam if init_information["idbmup1"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup1"]= event[0].pdg if init_information["idbmup2"] == 0: init_information["idbmup2"]= event[1].pdg @@ -1115,6 +1127,7 @@ def initialize_unweighting(self, getwgt, trunc_error): total_event = 0 sum_cross = collections.defaultdict(int) for i,f in enumerate(self.files): + f.parsing = 'wgt_only' nb_event = 0 # We need to loop over the event file to get some information about the # new cross-section/ wgt of event. 
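The 'wgt_only' mode introduced in the lhe_parser.py hunks above lets unweight() and MultiEventFile scan event weights without decoding particle momenta, which dominates the cost of full Event construction. A minimal self-contained sketch of the idea, assuming a simplified LHE-like layout (the helper names iter_event_blocks and event_weight are illustrative stand-ins, not the MG5aMC API):

# Sketch of "weight-only" event scanning: keep each event block as raw
# text and decode only the weight (third field of the global event line),
# instead of parsing every particle line into floats.
def iter_event_blocks(lines):
    """Yield the raw text lines of each <event>...</event> block."""
    block = None
    for line in lines:
        if '<event' in line:
            block = []
        elif '</event>' in line:
            yield block
            block = None
        elif block is not None:
            block.append(line)

def event_weight(block):
    """Decode only the weight from the global event line.
    Assumed field order: nexternal ievent wgt scale aqed aqcd."""
    fields = block[0].split()
    return float(fields[2])

sample = """<event>
 5  1  +1.3e-03  2.4e+02  7.5e-03  1.2e-01
 21 -1 0 0 ...
</event>
<event>
 5  2  +2.6e-03  2.4e+02  7.5e-03  1.2e-01
 21 -1 0 0 ...
</event>""".splitlines()

print([event_weight(b) for b in iter_event_blocks(sample)])  # [0.0013, 0.0026]

The real implementation goes further: it still builds Event objects (with parse_momenta=False) so that the untouched raw text can be re-emitted as-is when an event is kept.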
@@ -1302,7 +1315,7 @@ class Event(list): warning_order = True # raise a warning if the order of the particle are not in accordance of child/mother - def __init__(self, text=None): + def __init__(self, text=None, parse_momenta=True): """The initialization of an empty Event (or one associate to a text file)""" list.__init__(self) @@ -1322,15 +1335,15 @@ def __init__(self, text=None): self.matched_scale_data = None self.syscalc_data = {} if text: - self.parse(text) + self.parse(text, parse_momenta=parse_momenta) - - def parse(self, text): + event_flag_pattern = re.compile(r"""(\w*)=(?:(?:['"])([^'"]*)(?=['"])|(\S*))""") + def parse(self, text, parse_momenta=True): """Take the input file and create the structured information""" #text = re.sub(r'', '', text) # remove pointless tag status = 'first' - + tags = [] if not isinstance(text, list): text = text.split('\n') @@ -1354,24 +1367,28 @@ def parse(self, text): if '' in line: status = 'tag' else: - self.assign_scale_line(line) + self.assign_scale_line(line, convert=parse_momenta) status = 'part' continue if '<' in line: status = 'tag' if 'part' == status: - part = Particle(line, event=self) - if part.E != 0 or part.status==-1: - self.append(part) - elif self.nexternal: - self.nexternal-=1 + if parse_momenta: + part = Particle(line, event=self) + if part.E != 0 or part.status==-1: + self.append(part) + elif self.nexternal: + self.nexternal-=1 + else: + tags.append(line) else: - if '' in line: + if line.endswith(''): line = line.replace('','',1) - self.tag += '%s\n' % line - - self.assign_mother() + tags.append(line) + self.tag += "\n".join(tags) + if parse_momenta: + self.assign_mother() def assign_mother(self): @@ -1905,19 +1922,27 @@ def check(self): #3. check mass - def assign_scale_line(self, line): + def assign_scale_line(self, line, convert=True): """read the line corresponding to global event line format of the line is: Nexternal IEVENT WEIGHT SCALE AEW AS """ inputs = line.split() assert len(inputs) == 6 - self.nexternal=int(inputs[0]) - self.ievent=int(inputs[1]) - self.wgt=float(inputs[2]) - self.scale=float(inputs[3]) - self.aqed=float(inputs[4]) - self.aqcd=float(inputs[5]) + if convert: + self.nexternal=int(inputs[0]) + self.ievent=int(inputs[1]) + self.wgt=float(inputs[2]) + self.scale=float(inputs[3]) + self.aqed=float(inputs[4]) + self.aqcd=float(inputs[5]) + else: + self.nexternal=inputs[0] + self.ievent=inputs[1] + self.wgt=float(inputs[2]) + self.scale=inputs[3] + self.aqed=inputs[4] + self.aqcd=inputs[5] def get_tag_and_order(self): """Return the unique tag identifying the SubProcesses for the generation. 
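The class-level event_flag_pattern added above is compiled once and pulls key=value attributes out of the <event ...> tag, accepting either quoted or bare values. A standalone illustration of how that pattern behaves (the regex is taken verbatim from the patch; the surrounding parse_event_flags helper is hypothetical):

import re

# Pattern from the patch: capture the key, then either a quoted value
# (second group) or a bare token (third group).
event_flag_pattern = re.compile(r"""(\w*)=(?:(?:['"])([^'"]*)(?=['"])|(\S*))""")

def parse_event_flags(tag_line):
    """Illustrative helper: collect the attributes of an <event> tag."""
    flags = {}
    for key, quoted, bare in event_flag_pattern.findall(tag_line):
        if key:  # skip empty keys from stray '=' characters
            flags[key] = quoted if quoted else bare
    return flags

print(parse_event_flags("<event trials=10 npLO='-1' npNLO='2'>"))
# -> {'trials': '10', 'npLO': '-1', 'npNLO': '2'}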
@@ -2269,7 +2294,11 @@ def __str__(self, event_id=''): else: event_flag = '' - scale_str = "%2d %6d %+13.7e %14.8e %14.8e %14.8e" % \ + try: + scale_str = "%2d %6d %+13.7e %14.8e %14.8e %14.8e" % \ + (self.nexternal,self.ievent,self.wgt,self.scale,self.aqed,self.aqcd) + except: + scale_str = "%s %s %+13.7e %s %s %s" % \ (self.nexternal,self.ievent,self.wgt,self.scale,self.aqed,self.aqcd) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py index b70b548e53..cb6bf4ca57 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,8 +3703,9 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) + start = time.time() # Check argument's validity self.check_combine_events(args) self.update_status('Combining Events', level='parton') @@ -3795,8 +3796,9 @@ def do_combine_events(self, line): if self.run_card['bias_module'].lower() not in ['dummy', 'none'] and nb_event: self.correct_bias() - - + elif self.run_card['custom_fcts']: + self.correct_bias() + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7364,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7407,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ -7415,7 +7423,8 @@ def error(self, msg=''): level = int(options.logging) else: level = eval('logging.' 
+ options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7429,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/misc.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/shower_card.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model.pkl b/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model.pkl deleted file mode 100644 index f71ba45bbc6d4acc8d32bb06662fe900a694009f..0000000000000000000000000000000000000000 GIT binary patch [42822-byte base85-encoded binary delta omitted]
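The shower_card.py hunk above wires four new PYTHIA8 matrix-element-correction switches into both the logical_vars list (so their text values are coerced to booleans) and the per-shower name map. A condensed sketch of that two-step mechanism, simplified from the real ShowerCard class (set_option and the trimmed tables here are illustrative, not the actual API):

# Sketch of the ShowerCard mechanism: text values of "logical" variables
# are coerced to booleans, then the generic option name is translated to
# the shower-specific key (PYTHIA8 in this example).
true_tokens = ['.true.', 't', 'true', '1']
false_tokens = ['.false.', 'f', 'false', '0']

logical_vars = ['space_shower_me_corrections', 'time_shower_me_corrections',
                'time_shower_me_extended', 'time_shower_me_after_first']

names_dict = {v: {'PYTHIA8': v} for v in logical_vars}  # identity map here

def set_option(card, key, value, shower='PYTHIA8'):
    key = key.lower()
    if key not in logical_vars:
        raise KeyError('unknown logical option %s' % key)
    if value.lower() in true_tokens:
        card[names_dict[key][shower]] = True
    elif value.lower() in false_tokens:
        card[names_dict[key][shower]] = False
    else:
        raise ValueError('%s is not a valid logical value' % value)

card = {}
set_option(card, 'time_shower_me_corrections', '.true.')
set_option(card, 'space_shower_me_corrections', 'F')
print(card)
# -> {'time_shower_me_corrections': True, 'space_shower_me_corrections': False}

For these four options the generic name and the PYTHIA8 name coincide, which is why the new names_dict entries look redundant; the indirection matters for options like 'analyse', which map to a different identifier per shower.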
[binary delta continues and is omitted; the header of the following source-file diff was lost in the same span] a, 0 ) * b; } + inline __host__ __device__ constexpr cxsmpl + operator*( const cxsmpl& a, const double& b ) + { + return a * cxsmpl( b, 0 ); + } + template inline __host__ __device__ constexpr cxsmpl operator/( const cxsmpl& a, const cxsmpl& b ) diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index d710d27afd..a4e93bc7e3 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION.
* * WARNING: DO NOT USE FOR PRODUCTION * @@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg.mg +import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005392789840698242  +DEBUG: model prefixing takes 0.005494117736816406  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -157,60 +157,33 @@ INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams 1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams -output standalone_cudacpp CODEGEN_cudacpp_gg_ttg -Load PLUGIN.CUDACPP_SA_OUTPUT -Output will be done with PLUGIN: CUDACPP_SA_OUTPUT +output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg +Load PLUGIN.CUDACPP_OUTPUT +Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 186]  -DEBUG: type(subproc_group)= [output.py at line 187]  -DEBUG: type(fortran_model)= [output.py at line 188]  -DEBUG: type(me)= me=0 [output.py at line 189]  -DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  -DEBUG: proc_id =  0 [model_handling.py at line 1045]  -INFO: Creating files in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg -DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  -DEBUG: self.include_multi_channel is not yet defined: this is standalone_cudacpp mode [model_handling.py at line 1301]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h -DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc -DEBUG: 
Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]  -DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]  -DEBUG: self.include_multi_channel =  False [model_handling.py at line 1144]  -DEBUG: self.support_multichannel =  True [model_handling.py at line 1145]  -DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1162]  -DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1163]  -DEBUG: multi_channel_map =  None [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1711]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1824]  -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_processidfile [model_handling.py at line 1389]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxg.txt [model_handling.py at line 1335]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  +INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. 
Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.324 s +ALOHA: aloha creates 5 routines in 0.320 s VVV1 VVV1 FFV1 @@ -220,23 +193,17 @@ ALOHA: aloha creates 5 routines in 0.324 s VVVV1 VVVV3 VVVV4 -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. -DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  +INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
quit -real 0m0.944s -user 0m0.766s -sys 0m0.068s +real 0m0.790s +user 0m0.706s +sys 0m0.056s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h index f37c972b24..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -244,14 +245,21 @@ namespace mg5amcCpu } std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL - process.initProc( "../../Cards/param_card.dat" ); + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + CPPProcess process( /*verbose=*/false ); + std::string paramCard = "../../Cards/param_card.dat"; + if( !std::filesystem::exists( paramCard ) ) + { + paramCard = "../" + paramCard; + } + process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h index 9c467b1e04..6a7d9c05c0 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h @@ -39,6 +39,8 @@ #elif defined __HIPCC__ +#include "hip/hip_runtime.h" + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h index 176338151a..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -215,19 +216,16 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) #endif constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) // Dump events to a new reference file? - constexpr bool dumpEvents = false; - std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' 
+ testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; - while( dumpFileName.find( '/' ) != std::string::npos ) - { - dumpFileName.replace( dumpFileName.find( '/' ), 1, "_" ); - } + const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); + const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); + const std::string refFileName = testDriver->getRefFileName(); + const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); std::ofstream dumpFile; if( dumpEvents ) { dumpFile.open( dumpFileName, std::ios::trunc ); } // Read reference data - const std::string refFileName = testDriver->getRefFileName(); std::map referenceData; if( !dumpEvents ) { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc @@ -112,10 +112,17 @@ namespace mg5amcCpu // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu bool ok = true; // this is just an assumption! const std::string tag = "arm neon (128bit as in SSE4.2)"; -#else +#elif defined( __x86_64__ ) || defined( __i386__ ) bool known = true; bool ok = __builtin_cpu_supports( "sse4.2" ); const std::string tag = "nehalem (SSE4.2)"; +#else // AV FIXME! Added by OM for Mac, should identify the correct __xxx__ flag that should be targeted + bool known = false; // __builtin_cpu_supports is not supported + // See https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions-Available-on-all-Configurations.html + // See https://stackoverflow.com/q/62783908 + // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu + bool ok = true; // this is just an assumption! + const std::string tag = "arm neon (128bit as in SSE4.2)"; #endif #else bool known = true; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index 3358e6c7e0..2e02593919 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -249,11 +249,11 @@ namespace mg5amcCpu vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 0., 0., w_fp[5] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 0., 0., w_fp[6] ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); // Amplitude(s) for diagram number 1 - VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -265,10 +265,10 @@ namespace mg5amcCpu // *** DIAGRAM 2 OF 16 *** // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[7] ); + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -278,10 +278,10 @@ namespace mg5amcCpu // *** DIAGRAM 3 OF 16 *** // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[8] ); + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -291,11 +291,11 @@ namespace mg5amcCpu // *** DIAGRAM 4 OF 16 *** // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[5] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[9] ); + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -304,10 +304,10 @@ namespace mg5amcCpu // *** DIAGRAM 5 OF 16 *** // Wavefunction(s) for diagram number 5 - VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 0., 0., w_fp[10] ); + VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[10] ); // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -320,7 +320,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) #endif @@ -329,11 +329,11 @@ namespace mg5amcCpu // *** DIAGRAM 7 OF 16 *** // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[5] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[11] ); + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -345,7 +345,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -358,7 +358,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -367,10 +367,10 @@ namespace mg5amcCpu // *** DIAGRAM 10 OF 16 *** // Wavefunction(s) for diagram number 10 - VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 0., 0., w_fp[5] ); + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[5] ); // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -383,7 +383,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -396,7 +396,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 12 - VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -411,7 +411,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -423,7 +423,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -435,7 +435,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 15 - VVV1_0( w_fp[0], w_fp[10], 
w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -447,12 +447,12 @@ namespace mg5amcCpu // *** DIAGRAM 16 OF 16 *** // Wavefunction(s) for diagram number 16 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[10] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[6] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[9] ); + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[10] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[6] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[9] ); // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[3], w_fp[2], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -460,7 +460,7 @@ namespace mg5amcCpu jamp_sv[1] -= amp_sv[0]; jamp_sv[3] -= amp_sv[0]; jamp_sv[5] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[2], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -468,7 +468,7 @@ namespace mg5amcCpu jamp_sv[2] += amp_sv[0]; jamp_sv[3] -= amp_sv[0]; jamp_sv[4] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[2], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -861,12 +861,12 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - fptype allMEsLast = 0; + { /* clang-format on */ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid - allMEs[ievt] = 0; for( int ihel = 0; ihel < ncomb; ihel++ ) { + // NEW IMPLEMENTATION OF GETGOODHEL (#630): RESET THE RUNNING SUM OVER HELICITIES TO 0 BEFORE ADDING A NEW HELICITY + allMEs[ievt] = 0; // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -875,12 +875,11 @@ namespace mg5amcCpu #else calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); #endif - if( allMEs[ievt] != allMEsLast ) + if( allMEs[ievt] != 0 ) // NEW IMPLEMENTATION OF GETGOODHEL (#630): COMPARE EACH HELICITY CONTRIBUTION TO 0 { //if ( !isGoodHel[ihel] ) std::cout << "sigmaKin_getGoodHel ihel=" << ihel << " TRUE" << std::endl; isGoodHel[ihel] = true; } - allMEsLast = allMEs[ievt]; // running sum up to helicity ihel for event ievt } } #else @@ -899,19 +898,11 @@ namespace mg5amcCpu //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] // Allocate arrays at build time to contain at least 16 events (or at least neppV events if neppV>16, e.g. 
in future VPUs) constexpr int maxtry0 = std::max( 16, neppV ); // 16, but at least neppV (otherwise the npagV loop does not even start) - fptype allMEsLast[maxtry0] = { 0 }; // allocated at build time: maxtry0 must be a constexpr // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt1 was last observed for "mirror processes" in uux_ttx in the 270 branch (see issue #343 and PRs #360 and #396) + // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; - static_assert( nprocesses == 1, "Assume nprocesses == 1" ); - // process_id corresponds to the index of DSIG1 Fortran functions (must be 1 because cudacpp is unable to handle DSIG2) + static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } @@ -1105,23 +1101,26 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 - fptype targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + const unsigned int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + fptype targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } } #endif @@ -1216,57 +1215,60 @@ namespace mg5amcCpu #endif } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 // Event-by-event random choice of color #402 - fptype_sv targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } + const unsigned int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + fptype_sv targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = fptype_sv{ 0 }; + else + targetamp[icolC] 
= targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + fptype_sv targetamp2[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp2[icolC] = fptype_sv{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + } +#endif + for( int ieppV = 0; ieppV < neppV; ++ieppV ) + { + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { #if defined MGONGPU_CPPSIMD - const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); #endif - if( okcol ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( okcol ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + const int ievt2 = ievt00 + ieppV + neppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + { + allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #endif + } } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index 2731db9bfd..11f562273e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc index 1bad694d1c..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc @@ -29,7 +29,9 @@ #include #include +#include <cfenv> // for feenableexcept #include +#include <csignal> // for signal and SIGFPE #include #include #include @@ -74,6 +76,23 @@ usage( char* argv0, int ret = 1 ) return ret; } +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + inline void FPEhandler( int sig ) + { +#ifdef __CUDACC__ + std::cerr << "Floating Point Exception (GPU)" << std::endl; +#else + std::cerr << "Floating Point Exception (CPU)" << std::endl; +#endif + exit( 0 ); + } +} + int main( int argc, char** argv ) { @@ -84,6 +103,18 @@ main( int argc, char** argv ) using namespace mg5amcCpu; #endif + // Enable FPEs (test #701 and #733 - except on MacOS where feenableexcept is not defined #730) +#ifndef __APPLE__ + const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" ); + const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" ); + if( enableFPE ) + { + std::cout << "WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions" << std::endl; + feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 + signal( SIGFPE, FPEhandler ); + } +#endif + // DEFAULTS FOR COMMAND LINE ARGUMENTS bool verbose = false; bool debug = false; @@ -103,12 +134,14 @@ main( int argc, char** argv ) CurandHost = 1, CurandDevice = 2 }; -#ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU -#elif not defined MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#ifdef MGONGPU_HAS_NO_CURAND + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random +#elif defined __CUDACC__ + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
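
Earlier in this check_sa.cc hunk, floating point exceptions are enabled only when the CUDACPP_RUNTIME_ENABLEFPE environment variable is set, and a SIGFPE handler reports the trap. A standalone sketch of the same opt-in pattern (assuming Linux/glibc, where the feenableexcept GNU extension exists; on MacOS it does not, see #730):

  #include <cfenv>   // for feenableexcept (GNU extension)
  #include <csignal> // for signal and SIGFPE
  #include <cstdlib> // for getenv and exit
  #include <iostream>
  #include <string>

  void FPEhandler( int /*sig*/ )
  {
    std::cerr << "Floating Point Exception" << std::endl;
    std::exit( 0 );
  }

  int main()
  {
    const char* enableFPEc = std::getenv( "CUDACPP_RUNTIME_ENABLEFPE" );
    if( enableFPEc != nullptr && std::string( enableFPEc ) != "" )
    {
      feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW );
      signal( SIGFPE, FPEhandler );
    }
    volatile double zero = 0;
    std::cout << 1. / zero << std::endl; // prints "inf" if FPEs are off, traps to the handler otherwise
    return 0;
  }
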
enum class RamboSamplingMode @@ -146,18 +179,20 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef __CUDACC__ - rndgen = RandomNumberMode::CurandDevice; +#ifndef __CUDACC__ + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); +#elif defined MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + rndgen = RandomNumberMode::CurandDevice; #endif } else if( arg == "--curhst" ) { -#ifndef MGONGPU_HAS_NO_CURAND - rndgen = RandomNumberMode::CurandHost; -#else +#ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); +#else + rndgen = RandomNumberMode::CurandHost; #endif } else if( arg == "--common" ) @@ -278,10 +313,10 @@ main( int argc, char** argv ) const std::string procKey = "0a ProcInit"; timermap.start( procKey ); - // Create a process object + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? (for instance, in bridge mode this will be called twice here?) CPPProcess process( verbose ); - - // Read param_card and set parameters process.initProc( "../../Cards/param_card.dat" ); const fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) //const fptype energy = 91.2; // Ecms = 91.2 GeV (Z peak) @@ -389,30 +424,26 @@ main( int argc, char** argv ) { prnk.reset( new CommonRandomNumberKernel( hstRndmom ) ); } -#ifndef MGONGPU_HAS_NO_CURAND else if( rndgen == RandomNumberMode::CurandHost ) { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! CurandHost is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#else const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif } -#ifdef __CUDACC__ else { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __CUDACC__ const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); - } #else - else - { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) - } + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif -#else - else - { - throw std::logic_error( "This application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) } -#endif // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -747,7 +778,7 @@ main( int argc, char** argv ) wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? 
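
The '// -- DOUBLE or FLOAT?' tag appended to wrkflwtxt just above is resolved entirely at compile time from the MGONGPU_FPTYPE* macros. A minimal sketch of the same dispatch (the pure-double branch is elided by the hunk; its 'DBL+' label is assumed here by symmetry with the MIX/FLT/??? labels shown):

  #include <string>

  // Map the fptype/fptype2 build macros to the workflow label:
  // MIX = double matrix elements with single precision color algebra (#537).
  std::string fptypeTag()
  {
  #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
    return "MIX+";
  #elif defined MGONGPU_FPTYPE_DOUBLE
    return "DBL+"; // assumed label, not shown in the excerpt above
  #elif defined MGONGPU_FPTYPE_FLOAT
    return "FLT+";
  #else
    return "???+"; // no path to this statement
  #endif
  }
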
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -757,7 +788,7 @@ wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index 59a2c906eb..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -4,10 +4,13 @@ # Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) -#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories +#=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts +#=== NB: use 'override' to ensure that the value cannot be modified from the outside +override CUDACPP_MAKEFILE := $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) +###$(info CUDACPP_MAKEFILE='$(CUDACPP_MAKEFILE)') -CUDACPP_MAKEFILE = $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) -CUDACPP_SRC_MAKEFILE = cudacpp_src.mk +#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories +override CUDACPP_SRC_MAKEFILE = cudacpp_src.mk #------------------------------------------------------------------------------- @@ -29,7 +32,17 @@ UNAME_P := $(shell uname -p) #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Include the common MG5aMC Makefile options + +# OM: this is crucial for MG5aMC flag consistency/documentation +# AV: temporarily comment this out because it breaks cudacpp builds +ifneq ($(wildcard ../../Source/make_opts),) +include ../../Source/make_opts +endif + +#------------------------------------------------------------------------------- + +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I.
OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here @@ -101,68 +114,85 @@ endif # Note: AR, CXX and FC are implicitly defined if not set externally # See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html -#------------------------------------------------------------------------------- - -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +CXXFLAGS += -mmacosx-version-min=11.3 +endif - # If CUDA_HOME is not set, try to set it from the location of nvcc - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif +#------------------------------------------------------------------------------- - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
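
The CURANDLIBFLAGS line above links curand via -L$(CUDA_HOME)/lib64/ -lcurand, and its own comment notes that -lcuda is not needed for this. For reference, a minimal host-side curand program enabled by that link flag (a sketch, not plugin code; it assumes a CUDA installation providing curand.h and libcurand, and an arbitrary seed):

  #include <curand.h>
  #include <cstdio>
  #include <vector>

  int main()
  {
    curandGenerator_t gen;
    curandCreateGeneratorHost( &gen, CURAND_RNG_PSEUDO_MRG32K3A ); // host API, no GPU required
    curandSetPseudoRandomGeneratorSeed( gen, 20231108ULL );        // arbitrary seed
    std::vector<double> rnd( 8 );
    curandGenerateUniformDouble( gen, rnd.data(), rnd.size() );    // uniform doubles in (0,1]
    for( double r : rnd ) std::printf( "%f\n", r );
    curandDestroyGenerator( gen );
    return 0;
  }

Built with e.g. 'g++ gen.cc -I$(CUDA_HOME)/include -L$(CUDA_HOME)/lib64 -lcurand', mirroring the CUINC and CURANDLIBFLAGS settings above.
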
- CUOPTFLAGS = -lineinfo - GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - CUDATESTFLAGS = -lcuda - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + override HIP_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the path to nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). 
+ # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + ifeq ($(RNDGEN),hasNoCurand) + CURANDLIBFLAGS= else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! endif + CUOPTFLAGS = -lineinfo + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -173,71 +203,55 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) GPUFLAGS += -allow-unsupported-compiler endif -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler +else ifneq ($(origin REQUIRE_CUDA),undefined) - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning HIP builds are not supported for multi-word CXX "$(CXX)") - override HIP_HOME=disabled - endif + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif +#--- Option 2: CUDA does not exist, HIP exists -> use HIP - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + +else + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) export GPUCC export GPUFLAGS #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -254,7 +268,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -270,7 +284,7 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) GPUFLAGS+= -Xcompiler -mno-float128 endif @@ -285,12 +299,14 @@ override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) -else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) -override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) # AV for Mac (Apple clang compiler) +else ifeq ($(UNAME_S),Darwin) # OM for Mac (any compiler) +override OMPFLAGS = # AV disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = -fopenmp # OM reenable OpenMP MT on Apple clang? 
(AV Oct 2023: this still fails in the CI) else -override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT (default before #575) +override OMPFLAGS = -fopenmp # enable OpenMP MT by default on all other platforms +###override OMPFLAGS = # disable OpenMP MT on all other platforms (default before #575) endif # Set the default AVX (vectorization) choice @@ -356,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -573,8 +589,9 @@ $(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) # Apply special build flags only to check_sa and CurandRandomNumberKernel (curand headers, #679) $(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gcheck_sa.o: CUFLAGS += $(CXXFLAGSCURAND) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gCurandRandomNumberKernel.o: CUFLAGS += $(CXXFLAGSCURAND) ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif @@ -772,12 +789,18 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif +# Use target gtestlibs to build only googletest +ifneq ($(GTESTLIBS),) +gtestlibs: $(GTESTLIBS) +endif + # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 $(GTESTLIBS): ifneq ($(shell which flock 2>/dev/null),) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi flock $(BUILDDIR)/.make_test.lock $(MAKE) -C $(TESTDIR) else - $(MAKE) -C $(TESTDIR) + if [ -d $(TESTDIR) ]; then $(MAKE) -C $(TESTDIR); fi endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc index 2b956730d4..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc @@ -49,11 +49,7 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL GpuRuntime::setUp(); #endif - // Create a process object, read parm card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
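
The FIXME above asks what happens if fbridgecreate is called from several Fortran threads while each call constructs a CPPProcess just to read parameters. One standard C++ answer to the 'singleton' idea it floats, sketched with a hypothetical ProcessParams stand-in (this is not the plugin's API; the patch itself instead moves initProc into the Bridge constructor), is a function-local static, whose initialization is thread-safe since C++11:

  #include <string>

  struct ProcessParams
  {
    // Hypothetical stand-in for the parameters read by CPPProcess::initProc.
    explicit ProcessParams( const std::string& /*cardPath*/ ) { /* read param_card.dat here */ }
  };

  const ProcessParams& processParamsOnce()
  {
    // Constructed exactly once, even under concurrent first calls ("magic static").
    static const ProcessParams params( "../../Cards/param_card.dat" );
    return params;
  }
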
- CPPProcess process( /*verbose=*/false ); - process.initProc( "../../Cards/param_card.dat" ); + // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran *ppbridge = new Bridge( *pnevtF, *pnparF, *pnp4F ); } diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc index 0ed26180ca..de327f2321 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc @@ -71,6 +71,8 @@ struct CPUTest : public CUDA_CPU_TestBase , hstSelCol( nevt ) , hstIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? process.initProc( "../../Cards/param_card.dat" ); } @@ -183,6 +185,8 @@ struct CUDATest : public CUDA_CPU_TestBase , devSelCol( nevt ) , devIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? process.initProc( "../../Cards/param_card.dat" ); } diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc index 016bc0f472..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc @@ -59,7 +59,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) using namespace mg5amcCpu; #endif #ifndef __APPLE__ // test #701 (except on MacOS where feenableexcept is not defined #730) - const bool enableFPE = !getenv( "CUDACPP_RUNTIME_DISABLEFPE" ); + const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" ); + const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" ); if( enableFPE ) { feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 diff --git a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h index 9f0973d9b0..0dd0f3ebba 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -863,6 +863,7 @@ namespace mg5amcCpu const fptype allV2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) ALWAYS_INLINE; //-------------------------------------------------------------------------- @@ -873,6 +874,7 @@ namespace mg5amcCpu VVV1P0_1( const fptype allV2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) ALWAYS_INLINE; @@ -886,6 +888,7 @@ namespace mg5amcCpu const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) ALWAYS_INLINE; //-------------------------------------------------------------------------- @@ -896,6 +899,7 @@ namespace mg5amcCpu FFV1_1( const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allF1[] ) ALWAYS_INLINE; @@ -908,6 +912,7 @@ namespace mg5amcCpu FFV1_2( const fptype allF1[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M2, const fptype W2, fptype allF2[] ) ALWAYS_INLINE; @@ -920,6 +925,7 @@ namespace mg5amcCpu FFV1P0_3( const fptype allF1[], const fptype allF2[], const fptype allCOUP[], + const double Ccoeff, const fptype M3, const fptype W3, fptype allV3[] ) ALWAYS_INLINE; @@ -933,6 +939,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) ALWAYS_INLINE; @@ -946,6 +953,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) ALWAYS_INLINE; @@ -959,6 +967,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) ALWAYS_INLINE; @@ -972,6 +981,7 @@ namespace mg5amcCpu const fptype allV2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) { mgDebug( 0, __FUNCTION__ ); @@ -1006,6 +1016,7 @@ namespace mg5amcCpu VVV1P0_1( const fptype allV2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) @@ -1044,6 +1055,7 @@ namespace mg5amcCpu const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) { mgDebug( 0, __FUNCTION__ ); @@ -1067,6 +1079,7 @@ namespace mg5amcCpu FFV1_1( const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allF1[] ) @@ -1098,6 +1111,7 @@ namespace mg5amcCpu FFV1_2( const fptype allF1[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M2, const fptype W2, fptype allF2[] ) @@ -1129,6 +1143,7 @@ namespace mg5amcCpu FFV1P0_3( const fptype allF1[], const fptype allF2[], const fptype allCOUP[], + const double Ccoeff, const fptype M3, const fptype W3, fptype allV3[] ) @@ -1160,6 +1175,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) @@ -1194,6 +1210,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + 
const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) @@ -1228,6 +1245,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc index 05eba20217..067445b198 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h index 41830f87ca..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk index f2804ffb85..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk @@ -36,6 +36,13 @@ endif # See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html ###RANLIB = ranlib +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +LDFLAGS = +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +CXXFLAGS += -mmacosx-version-min=11.3 +LDFLAGS += -mmacosx-version-min=11.3 +endif + #------------------------------------------------------------------------------- #=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) @@ -266,11 +273,11 @@ endif ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(CXX) -shared -o $@ $(cxx_objects) + $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h index 205accb85b..da4ba36ad8 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h @@ -15,7 +15,6 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip -#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif @@ -24,16 +23,19 @@ // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For CUDA, by default, it is supported -// For HIP, by default, it is not supported -// For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -#ifdef __CUDACC__ -#undef MGONGPU_HAS_NO_CURAND -#elif defined __HIPCC__ +// For HIP, by default, do not use curand (common random numbers will be used instead) +// For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ #define MGONGPU_HAS_NO_CURAND 1 #else +//#ifdef __CUDACC__ +//#undef MGONGPU_HAS_NO_CURAND // default +////#define MGONGPU_HAS_NO_CURAND 1 +//#else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 +//#endif #endif // Choose floating point precision (for everything but color algebra #537) diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h index 46d9f02733..5532e22fa1 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h @@ -159,6 +159,12 @@ namespace mg5amcCpu return cxsmpl( a, 0 ) * b; } + inline __host__ __device__ constexpr cxsmpl + operator*( const cxsmpl& a, const double& b ) + { + return a * cxsmpl( b, 0 ); + } + template inline __host__ __device__ constexpr cxsmpl operator/( const cxsmpl& a, const cxsmpl& b ) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 24d824ed19..88b6b72cf1 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg.mg +import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005574703216552734  +DEBUG: model prefixing takes 0.0054705142974853516  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -157,76 +157,49 @@ INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams 1 processes with 123 diagrams generated in 0.157 s Total: 1 processes with 123 diagrams -output madevent CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp -Load PLUGIN.CUDACPP_SA_OUTPUT -Addition matrix-element will be done with PLUGIN: CUDACPP_SA_OUTPUT -Output will be done with PLUGIN: CUDACPP_SA_OUTPUT +output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp +Load PLUGIN.CUDACPP_OUTPUT +Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT +Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/Cards  -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg +WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  +WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg -DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  -DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6179]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
-DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  -DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  -FileWriter for ././CPPProcess.h -DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]  -FileWriter for ././CPPProcess.cc -DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]  -DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]  -DEBUG: self.include_multi_channel =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [model_handling.py at line 1144]  -DEBUG: self.support_multichannel =  True [model_handling.py at line 1145]  -DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1162]  -DEBUG: self.support_multichannel, self.include_multi_channel =  True [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [model_handling.py at line 1163]  -DEBUG: multi_channel =  {1: [1], 2: [2], 3: [3], 4: [4], 5: [5], 6: [6], 7: [7], 8: [8], 9: [9], 10: [10], 11: [11], 12: [12], 13: [13], 14: [14], 15: [15], 16: [16], 17: [17], 18: [18], 19: [19], 20: [20], 21: [21], 22: [22], 23: [23], 24: [24], 25: [25], 26: [26], 27: [27], 28: [28], 29: [29], 30: [30], 31: [32], 32: [33], 33: [34], 34: [35], 35: [36], 36: [37], 37: [38], 38: [39], 39: [40], 40: [41], 41: [42], 42: [43], 43: [44], 44: [45], 45: [46], 46: [48], 47: [49], 48: [50], 49: [51], 50: [52], 51: [53], 52: [54], 53: [55], 54: [56], 55: [58], 56: [59], 57: [60], 58: [61], 59: [62], 60: [63], 61: [64], 62: [65], 63: [66], 64: [67], 65: [68], 66: [69], 67: [70], 68: [71], 69: [72], 70: [74], 71: [75], 72: [76], 73: [77], 74: [78], 75: [79], 76: [80], 77: [81], 78: [82], 79: [83], 80: [84], 81: [85], 82: [86], 83: [87], 84: [88], 85: [89], 86: [90], 87: [91], 88: [93], 89: [94], 90: [95], 91: [96], 92: [97], 93: [98], 94: [100], 95: [101], 96: [102], 97: [103], 98: [104], 99: [105], 100: [107], 101: [108], 102: [109], 103: [110], 104: [111], 105: [112]} [model_handling.py at line 1169]  -DEBUG: multi_channel_map =  {1: [1], 2: [2], 3: [3], 4: [4], 5: [5], 6: [6], 7: [7], 8: [8], 9: [9], 10: [10], 11: [11], 12: [12], 13: [13], 14: [14], 15: [15], 16: [16], 17: [17], 18: [18], 19: [19], 20: [20], 21: [21], 22: [22], 23: [23], 24: [24], 25: [25], 26: [26], 27: [27], 28: [28], 29: [29], 30: [30], 31: [32], 32: [33], 33: [34], 34: [35], 35: [36], 36: [37], 37: [38], 38: [39], 39: [40], 40: [41], 41: [42], 42: [43], 43: [44], 44: [45], 45: [46], 46: [48], 47: [49], 48: [50], 49: [51], 50: [52], 51: [53], 52: [54], 53: [55], 54: [56], 55: [58], 56: [59], 57: [60], 58: 
[61], 59: [62], 60: [63], 61: [64], 62: [65], 63: [66], 64: [67], 65: [68], 66: [69], 67: [70], 68: [71], 69: [72], 70: [74], 71: [75], 72: [76], 73: [77], 74: [78], 75: [79], 76: [80], 77: [81], 78: [82], 79: [83], 80: [84], 81: [85], 82: [86], 83: [87], 84: [88], 85: [89], 86: [90], 87: [91], 88: [93], 89: [94], 90: [95], 91: [96], 92: [97], 93: [98], 94: [100], 95: [101], 96: [102], 97: [103], 98: [104], 99: [105], 100: [107], 101: [108], 102: [109], 103: [110], 104: [111], 105: [112]} [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, 11: 8, 12: 9, 13: 10, 14: 11, 15: 12, 16: 13, 17: 14, 18: 15, 19: 16, 20: 17, 21: 18, 22: 19, 23: 20, 24: 21, 25: 22, 26: 23, 27: 24, 28: 25, 29: 26, 30: 27, 31: 28, 32: 29, 33: 30, 37: 31, 38: 32, 39: 33, 40: 34, 41: 35, 42: 36, 43: 37, 44: 38, 45: 39, 46: 40, 47: 41, 48: 42, 49: 43, 50: 44, 51: 45, 55: 46, 56: 47, 57: 48, 58: 49, 59: 50, 60: 51, 61: 52, 62: 53, 63: 54, 67: 55, 68: 56, 69: 57, 70: 58, 71: 59, 72: 60, 73: 61, 74: 62, 75: 63, 76: 64, 77: 65, 78: 66, 79: 67, 80: 68, 81: 69, 85: 70, 86: 71, 87: 72, 88: 73, 89: 74, 90: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 100: 85, 101: 86, 102: 87, 106: 88, 107: 89, 108: 90, 109: 91, 110: 92, 111: 93, 115: 94, 116: 95, 117: 96, 118: 97, 119: 98, 120: 99, 124: 100, 125: 101, 126: 102, 127: 103, 128: 104, 129: 105} [model_handling.py at line 1711]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1824]  +FileWriter for ././CPPProcess.h +FileWriter for ././CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
-DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_processidfile [model_handling.py at line 1389]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_coloramps [model_handling.py at line 1401]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxgg.txt [model_handling.py at line 1335]  DEBUG: proc_id =  1 [export_cpp.py at line 710]  DEBUG: config_map =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.424 s -Wrote files for 222 helas calls in 0.735 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.429 s +Wrote files for 222 helas calls in 0.688 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.332 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  +ALOHA: aloha creates 5 routines in 0.331 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines @@ -246,32 +219,103 @@ ALOHA: aloha creates 10 routines in 0.315 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/src/. 
+FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/src/. +INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -Output to directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg done. +DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +patching file Source/genps.inc +patching file Source/makefile +patching file SubProcesses/makefile +patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). 
+patching file bin/internal/madevent_interface.py +DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses/P1_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +patching file auto_dsig1.f +patching file driver.f +patching file matrix1.f +Hunk #2 succeeded at 191 (offset 48 lines). +Hunk #3 succeeded at 269 (offset 48 lines). +Hunk #4 succeeded at 297 (offset 48 lines). +Hunk #5 succeeded at 342 (offset 48 lines). +DEBUG: p.returncode =  0 [output.py at line 237]  +Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg/README +/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. quit -real 0m3.284s -user 0m3.067s -sys 0m0.205s +real 0m3.278s +user 0m3.005s +sys 0m0.263s +Code generation completed in 4 seconds +************************************************************ +* * +* W E L C O M E to * +* M A D G R A P H 5 _ a M C @ N L O * +* M A D E V E N T * +* * +* * * * +* * * * * * +* * * * * 5 * * * * * +* * * * * * +* * * * +* * +* VERSION 3.5.2_lo_vect * +* * +* The MadGraph5_aMC@NLO Development Team - Find us at * +* https://server06.fynu.ucl.ac.be/projects/madgraph * +* * +* Type 'help' for in-line help. * +* * +************************************************************ +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +treatcards run +quit +INFO: +launch in debug mode +************************************************************ +* * +* W E L C O M E to * +* M A D G R A P H 5 _ a M C @ N L O * +* M A D E V E N T * +* * +* * * * +* * * * * * +* * * * * 5 * * * * * +* * * * * * +* * * * +* * +* VERSION 3.5.2_lo_vect * +* * +* The MadGraph5_aMC@NLO Development Team - Find us at * +* https://server06.fynu.ucl.ac.be/projects/madgraph * +* * +* Type 'help' for in-line help. * +* * +************************************************************ +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt +treatcards param +quit +INFO: +launch in debug mode diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt index 00d7c6f8d6..cdeedc7863 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt @@ -234,7 +234,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo +#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo +#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat index 19b71e686a..ea9cfcde68 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * @@ -45,5 +45,5 @@ define l+ = e+ mu+ define l- = e- mu- define vl = ve vm vt define vl~ = ve~ vm~ vt~ -output madevent CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_siz\ -e=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --\ +vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat index 88d4377d71..42728a48f3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat @@ -80,7 +80,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! 
size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -168,12 +184,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat index 26fa7f39dd..0374d7ba60 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat @@ -80,7 +80,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! 
size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -167,3 +183,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc index ec923afd6d..fa0f3d86f5 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV1_1.o VVVV4_0.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3_0.o VVVV1_0.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o +ALOHARoutine = FFV1P0_3.o FFV1_0.o FFV1_1.o FFV1_2.o VVV1P0_1.o VVV1_0.o VVVV1P0_1.o VVVV1_0.o VVVV3P0_1.o VVVV3_0.o VVVV4P0_1.o VVVV4_0.o diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/PDF/pdfwrap_lhapdf.f b/epochX/cudacpp/gg_ttgg.mad/Source/PDF/pdfwrap_lhapdf.f index 0be926e6cd..3f36905346 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/PDF/pdfwrap_lhapdf.f +++ b/epochX/cudacpp/gg_ttgg.mad/Source/PDF/pdfwrap_lhapdf.f @@ -5,6 +5,7 @@ SUBROUTINE PDFWRAP C INCLUDE 'pdf.inc' INCLUDE '../alfas.inc' + INCLUDE '../vector.inc' INCLUDE '../coupl.inc' REAL*8 ZMASS DATA ZMASS/91.188D0/ diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts index bd3c24228d..e4b87ee6ad 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts @@ -1,17 +1,12 @@ -pdlabel1= -pdlabel2= -lhapdf= -PYTHIA8_PATH=NotInstalled -MG5AMC_VERSION=3.5.0_lo_vect -GLOBAL_FLAG=-O3 -ffast-math -fbounds-check -ALOHA_FLAG= -MATRIX_FLAG= DEFAULT_CPP_COMPILER=g++ +DEFAULT_F2PY_COMPILER=f2py3 +DEFAULT_F_COMPILER=gfortran +GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= -STDLIB=-lstdc++ +MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime +PYTHIA8_PATH=NotInstalled STDLIB_FLAG= -DEFAULT_F_COMPILER=gfortran -DEFAULT_F2PY_COMPILER=f2py3 +STDLIB=-lstdc++ #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/makefile b/epochX/cudacpp/gg_ttgg.mad/Source/makefile index dbe08b846e..00c73099a0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/makefile +++ b/epochX/cudacpp/gg_ttgg.mad/Source/makefile @@ -136,5 +136,7 @@ cleanSource: clean: cleanSource for i in `ls -d ../SubProcesses/P*`; do cd $$i; make clean; cd -; done; -cleanall: cleanSource 
+cleanavx: + for i in `ls -d ../SubProcesses/P*`; do cd $$i; make cleanavxs; cd -; done; +cleanall: cleanSource # THIS IS THE ONE for i in `ls -d ../SubProcesses/P*`; do cd $$i; make cleanavxs; cd -; done; diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/param_card.inc b/epochX/cudacpp/gg_ttgg.mad/Source/param_card.inc index 1fcfce55bb..081365c16b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/param_card.inc +++ b/epochX/cudacpp/gg_ttgg.mad/Source/param_card.inc @@ -1,15 +1,15 @@ - MDL_WZ = 2.441404D+00 - MDL_WW = 2.047600D+00 - MDL_WH = 6.382339D-03 - MDL_WT = 1.491500D+00 + MDL_MB = 4.700000D+00 + MDL_MT = 1.730000D+02 MDL_MTA = 1.777000D+00 MDL_MZ = 9.118800D+01 MDL_MH = 1.250000D+02 - MDL_MB = 4.700000D+00 - MDL_MT = 1.730000D+02 AEWM1 = 1.325070D+02 MDL_GF = 1.166390D-05 AS = 1.180000D-01 - MDL_YMTAU = 1.777000D+00 MDL_YMB = 4.700000D+00 MDL_YMT = 1.730000D+02 + MDL_YMTAU = 1.777000D+00 + MDL_WT = 1.491500D+00 + MDL_WZ = 2.441404D+00 + MDL_WW = 2.047600D+00 + MDL_WH = 6.382339D-03 diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/vector.inc b/epochX/cudacpp/gg_ttgg.mad/Source/vector.inc index 92254c0f2a..863eebbc70 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/vector.inc +++ b/epochX/cudacpp/gg_ttgg.mad/Source/vector.inc @@ -28,5 +28,4 @@ C BECAUSE IT DOES NOT GO THROUGH THE CPP PREPROCESSOR C (see https://github.com/madgraph5/madgraph4gpu/issues/458). C INTEGER VECSIZE_MEMMAX - PARAMETER (VECSIZE_MEMMAX=16384) ! NB: 16k events per GPU grid is the minimum required to fill a V100 GPU -c PARAMETER (VECSIZE_MEMMAX=32) ! NB: workaround for out-of-memory on Juwels: 32 is enough for no-CUDA builds (issue #498) + PARAMETER (VECSIZE_MEMMAX=16384) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index f37c972b24..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -244,14 +245,21 @@ namespace mg5amcCpu } std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL - process.initProc( "../../Cards/param_card.dat" ); + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
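+    // Note: the param card is looked up relative to the current directory: first ../../Cards/param_card.dat (the P* subprocess level), then with one extra "../" prepended, presumably so the same lookup also works from one directory deeper (e.g. a madevent channel subdirectory).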
+ CPPProcess process( /*verbose=*/false ); + std::string paramCard = "../../Cards/param_card.dat"; + if( !std::filesystem::exists( paramCard ) ) + { + paramCard = "../" + paramCard; + } + process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h index 9c467b1e04..6a7d9c05c0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h @@ -39,6 +39,8 @@ #elif defined __HIPCC__ +#include "hip/hip_runtime.h" + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h index 176338151a..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -215,19 +216,16 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) #endif constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) // Dump events to a new reference file? - constexpr bool dumpEvents = false; - std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; - while( dumpFileName.find( '/' ) != std::string::npos ) - { - dumpFileName.replace( dumpFileName.find( '/' ), 1, "_" ); - } + const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); + const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); + const std::string refFileName = testDriver->getRefFileName(); + const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); std::ofstream dumpFile; if( dumpEvents ) { dumpFile.open( dumpFileName, std::ios::trunc ); } // Read reference data - const std::string refFileName = testDriver->getRefFileName(); std::map referenceData; if( !dumpEvents ) { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -112,10 +112,17 @@ namespace mg5amcCpu // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu bool ok = true; // this is just an assumption! const std::string tag = "arm neon (128bit as in SSE4.2)"; -#else +#elif defined( __x86_64__ ) || defined( __i386__ ) bool known = true; bool ok = __builtin_cpu_supports( "sse4.2" ); const std::string tag = "nehalem (SSE4.2)"; +#else // AV FIXME! 
Added by OM for Mac, should identify the correct __xxx__ flag that should be targeted + bool known = false; // __builtin_cpu_supports is not supported + // See https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions-Available-on-all-Configurations.html + // See https://stackoverflow.com/q/62783908 + // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu + bool ok = true; // this is just an assumption! + const std::string tag = "arm neon (128bit as in SSE4.2)"; #endif #else bool known = true; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index 136e334fcf..2f4b1f9d0e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -251,11 +251,11 @@ namespace mg5amcCpu vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 0., 0., w_fp[7] ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); // Amplitude(s) for diagram number 1 - VVVV1_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -264,7 +264,7 @@ namespace mg5amcCpu jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -273,7 +273,7 @@ namespace mg5amcCpu jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -286,10 +286,10 @@ namespace mg5amcCpu // *** DIAGRAM 2 OF 123 *** // Wavefunction(s) for diagram number 2 - VVV1P0_1( w_fp[6], w_fp[4], COUPs[0], 0., 0., w_fp[8] ); + VVV1P0_1( w_fp[6], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[8] ); // Amplitude(s) for diagram number 2 - VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += 
cxabs2( amp_sv[0] ); @@ -306,10 +306,10 @@ namespace mg5amcCpu // *** DIAGRAM 3 OF 123 *** // Wavefunction(s) for diagram number 3 - VVV1P0_1( w_fp[6], w_fp[5], COUPs[0], 0., 0., w_fp[9] ); + VVV1P0_1( w_fp[6], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[9] ); // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -326,10 +326,10 @@ namespace mg5amcCpu // *** DIAGRAM 4 OF 123 *** // Wavefunction(s) for diagram number 4 - VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 0., 0., w_fp[10] ); + VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); // Amplitude(s) for diagram number 4 - VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -346,11 +346,11 @@ namespace mg5amcCpu // *** DIAGRAM 5 OF 123 *** // Wavefunction(s) for diagram number 5 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[11] ); - FFV1_2( w_fp[3], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[12] ); + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -364,7 +364,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -377,10 +377,10 @@ namespace mg5amcCpu // *** DIAGRAM 7 OF 123 *** // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[3], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[13] ); + FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -391,10 +391,10 @@ namespace mg5amcCpu // *** DIAGRAM 8 OF 123 *** // Wavefunction(s) for diagram number 8 - FFV1_1( w_fp[2], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[14] ); + FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -408,7 +408,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &_fp[0] ); 
+    FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -421,10 +421,10 @@ namespace mg5amcCpu // *** DIAGRAM 10 OF 123 *** // Wavefunction(s) for diagram number 10 - FFV1_2( w_fp[3], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[15] ); + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -435,10 +435,10 @@ namespace mg5amcCpu // *** DIAGRAM 11 OF 123 *** // Wavefunction(s) for diagram number 11 - FFV1_1( w_fp[2], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[16] ); + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -452,7 +452,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -468,7 +468,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -482,7 +482,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -498,7 +498,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 15 - FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -514,7 +514,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -527,12 +527,12 @@ namespace mg5amcCpu // *** DIAGRAM 17 OF 123 *** // Wavefunction(s) for diagram number 17 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[12] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[16] ); - FFV1_1( w_fp[12], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[8] ); + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + FFV1_2( 
w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_1( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); // Amplitude(s) for diagram number 17 - FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -542,10 +542,10 @@ namespace mg5amcCpu // *** DIAGRAM 18 OF 123 *** // Wavefunction(s) for diagram number 18 - FFV1_1( w_fp[12], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[9] ); + FFV1_1( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); // Amplitude(s) for diagram number 18 - FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -558,7 +558,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 19 - FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -569,11 +569,11 @@ namespace mg5amcCpu // *** DIAGRAM 20 OF 123 *** // Wavefunction(s) for diagram number 20 - VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 0., 0., w_fp[17] ); + VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] ); // Amplitude(s) for diagram number 20 - VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -589,7 +589,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 21 - FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -603,7 +603,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 22 - FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -614,10 +614,10 @@ namespace mg5amcCpu // *** DIAGRAM 23 OF 123 *** // Wavefunction(s) for diagram number 23 - VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 0., 0., w_fp[18] ); + VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[18] ); // Amplitude(s) for diagram number 23 - VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -633,7 +633,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 24 - FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[8], 
w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -647,7 +647,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 25 - FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -658,10 +658,10 @@ namespace mg5amcCpu // *** DIAGRAM 26 OF 123 *** // Wavefunction(s) for diagram number 26 - FFV1_1( w_fp[12], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[19] ); + FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[19] ); // Amplitude(s) for diagram number 26 - FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -674,7 +674,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 27 - FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -687,7 +687,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 28 - FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -700,7 +700,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 29 - FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -713,7 +713,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 30 - FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -727,7 +727,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 31 - VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -740,22 +740,22 @@ namespace mg5amcCpu // *** DIAGRAM 32 OF 123 *** // Wavefunction(s) for diagram number 32 - VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[17] ); - VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[19] ); - VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[8] ); + VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] ); + VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[19] ); + VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[8] ); // Amplitude(s) 
for diagram number 32 - FFV1_0( w_fp[3], w_fp[12], w_fp[17], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[12], w_fp[17], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; jamp_sv[3] -= amp_sv[0]; jamp_sv[5] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[12], w_fp[19], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[12], w_fp[19], COUPs[1], 1.0, &_fp[0] ); jamp_sv[1] -= amp_sv[0]; jamp_sv[2] += amp_sv[0]; jamp_sv[3] -= amp_sv[0]; jamp_sv[4] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[12], w_fp[8], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[12], w_fp[8], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] -= amp_sv[0]; jamp_sv[2] += amp_sv[0]; jamp_sv[4] += amp_sv[0]; @@ -764,12 +764,12 @@ namespace mg5amcCpu // *** DIAGRAM 33 OF 123 *** // Wavefunction(s) for diagram number 33 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[12] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[9] ); - FFV1_2( w_fp[12], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[20] ); + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + FFV1_2( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] ); // Amplitude(s) for diagram number 33 - FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -779,10 +779,10 @@ namespace mg5amcCpu // *** DIAGRAM 34 OF 123 *** // Wavefunction(s) for diagram number 34 - FFV1_2( w_fp[12], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[21] ); + FFV1_2( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); // Amplitude(s) for diagram number 34 - FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -795,7 +795,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 35 - FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -806,10 +806,10 @@ namespace mg5amcCpu // *** DIAGRAM 36 OF 123 *** // Wavefunction(s) for diagram number 36 - FFV1P0_3( w_fp[12], w_fp[2], COUPs[1], 0., 0., w_fp[22] ); + FFV1P0_3( w_fp[12], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[22] ); // Amplitude(s) for diagram number 36 - VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -825,7 +825,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 37 - FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -839,7 +839,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 38 - FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( 
w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -853,7 +853,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 39 - VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -869,7 +869,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 40 - FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -883,7 +883,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 41 - FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -894,10 +894,10 @@ namespace mg5amcCpu // *** DIAGRAM 42 OF 123 *** // Wavefunction(s) for diagram number 42 - FFV1_2( w_fp[12], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[23] ); + FFV1_2( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); // Amplitude(s) for diagram number 42 - FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -910,7 +910,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 43 - FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -923,7 +923,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 44 - FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -936,7 +936,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 45 - FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -949,7 +949,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 46 - FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -963,7 +963,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 47 - VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], 
w_fp[10], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -979,17 +979,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 48 - FFV1_0( w_fp[12], w_fp[2], w_fp[17], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); jamp_sv[9] += amp_sv[0]; jamp_sv[11] -= amp_sv[0]; jamp_sv[17] -= amp_sv[0]; jamp_sv[23] += amp_sv[0]; - FFV1_0( w_fp[12], w_fp[2], w_fp[19], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); jamp_sv[11] -= amp_sv[0]; jamp_sv[15] += amp_sv[0]; jamp_sv[17] -= amp_sv[0]; jamp_sv[21] += amp_sv[0]; - FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); jamp_sv[9] -= amp_sv[0]; jamp_sv[15] += amp_sv[0]; jamp_sv[21] += amp_sv[0]; @@ -998,11 +998,11 @@ namespace mg5amcCpu // *** DIAGRAM 49 OF 123 *** // Wavefunction(s) for diagram number 49 - VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 0., 0., w_fp[12] ); - FFV1_2( w_fp[3], w_fp[12], COUPs[1], cIPD[0], cIPD[1], w_fp[22] ); + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[12] ); + FFV1_2( w_fp[3], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] ); // Amplitude(s) for diagram number 49 - FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1013,10 +1013,10 @@ namespace mg5amcCpu // *** DIAGRAM 50 OF 123 *** // Wavefunction(s) for diagram number 50 - VVV1P0_1( w_fp[12], w_fp[5], COUPs[0], 0., 0., w_fp[23] ); + VVV1P0_1( w_fp[12], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[23] ); // Amplitude(s) for diagram number 50 - FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1032,7 +1032,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 51 - FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1043,10 +1043,10 @@ namespace mg5amcCpu // *** DIAGRAM 52 OF 123 *** // Wavefunction(s) for diagram number 52 - FFV1_1( w_fp[2], w_fp[12], COUPs[1], cIPD[0], cIPD[1], w_fp[20] ); + FFV1_1( w_fp[2], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] ); // Amplitude(s) for diagram number 52 - FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1060,7 +1060,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 53 - FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1076,7 +1076,7 
@@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 54 - FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1090,7 +1090,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 55 - FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1106,7 +1106,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 56 - FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1122,7 +1122,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 57 - VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1142,7 +1142,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 58 - VVVV1_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; @@ -1151,7 +1151,7 @@ namespace mg5amcCpu jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1160,7 +1160,7 @@ namespace mg5amcCpu jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1173,10 +1173,10 @@ namespace mg5amcCpu // *** DIAGRAM 59 OF 123 *** // Wavefunction(s) for diagram number 59 - VVV1P0_1( w_fp[12], w_fp[1], COUPs[0], 0., 0., w_fp[21] ); + VVV1P0_1( w_fp[12], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] ); // Amplitude(s) for diagram number 59 - VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1196,7 +1196,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 60 - VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 60 ) 
numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1216,7 +1216,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 61 - FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1232,7 +1232,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 62 - FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1246,7 +1246,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 63 - FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1262,7 +1262,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 64 - FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1273,11 +1273,11 @@ namespace mg5amcCpu // *** DIAGRAM 65 OF 123 *** // Wavefunction(s) for diagram number 65 - VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 0., 0., w_fp[20] ); - FFV1_2( w_fp[3], w_fp[20], COUPs[1], cIPD[0], cIPD[1], w_fp[21] ); + VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] ); + FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); // Amplitude(s) for diagram number 65 - FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1288,10 +1288,10 @@ namespace mg5amcCpu // *** DIAGRAM 66 OF 123 *** // Wavefunction(s) for diagram number 66 - VVV1P0_1( w_fp[20], w_fp[4], COUPs[0], 0., 0., w_fp[22] ); + VVV1P0_1( w_fp[20], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[22] ); // Amplitude(s) for diagram number 66 - FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1307,7 +1307,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 67 - FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1318,10 +1318,10 @@ namespace mg5amcCpu // *** DIAGRAM 68 OF 123 *** // Wavefunction(s) for diagram number 68 - FFV1_1( w_fp[2], w_fp[20], COUPs[1], cIPD[0], cIPD[1], w_fp[23] ); + FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); // Amplitude(s) for diagram number 68 - FFV1_0( w_fp[16], w_fp[23], 
w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1335,7 +1335,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 69 - FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1351,7 +1351,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 70 - FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1365,7 +1365,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 71 - FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1381,7 +1381,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 72 - FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1397,7 +1397,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 73 - VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1417,7 +1417,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 74 - VVVV1_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; @@ -1426,7 +1426,7 @@ namespace mg5amcCpu jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1435,7 +1435,7 @@ namespace mg5amcCpu jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1448,10 +1448,10 @@ namespace mg5amcCpu // *** DIAGRAM 75 OF 123 *** // Wavefunction(s) for diagram number 75 - VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 0., 0., w_fp[12] ); + VVV1P0_1( w_fp[20], 
w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[12] ); // Amplitude(s) for diagram number 75 - VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1471,7 +1471,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 76 - VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1491,7 +1491,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 77 - FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1507,7 +1507,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 78 - FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1521,7 +1521,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 79 - FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1537,7 +1537,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 80 - FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 80 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1548,10 +1548,10 @@ namespace mg5amcCpu // *** DIAGRAM 81 OF 123 *** // Wavefunction(s) for diagram number 81 - FFV1_1( w_fp[9], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[23] ); + FFV1_1( w_fp[9], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); // Amplitude(s) for diagram number 81 - FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1561,10 +1561,10 @@ namespace mg5amcCpu // *** DIAGRAM 82 OF 123 *** // Wavefunction(s) for diagram number 82 - FFV1_2( w_fp[15], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[12] ); + FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); // Amplitude(s) for diagram number 82 - FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1577,7 +1577,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 83 - FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[13], 
w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1587,10 +1587,10 @@ namespace mg5amcCpu // *** DIAGRAM 84 OF 123 *** // Wavefunction(s) for diagram number 84 - FFV1_2( w_fp[13], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[21] ); + FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); // Amplitude(s) for diagram number 84 - FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1603,7 +1603,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 85 - FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1614,10 +1614,10 @@ namespace mg5amcCpu // *** DIAGRAM 86 OF 123 *** // Wavefunction(s) for diagram number 86 - VVV1P0_1( w_fp[0], w_fp[10], COUPs[0], 0., 0., w_fp[23] ); + VVV1P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[23] ); // Amplitude(s) for diagram number 86 - FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1630,10 +1630,10 @@ namespace mg5amcCpu // *** DIAGRAM 87 OF 123 *** // Wavefunction(s) for diagram number 87 - FFV1_2( w_fp[16], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[22] ); + FFV1_2( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] ); // Amplitude(s) for diagram number 87 - FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1643,10 +1643,10 @@ namespace mg5amcCpu // *** DIAGRAM 88 OF 123 *** // Wavefunction(s) for diagram number 88 - FFV1_1( w_fp[11], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[20] ); + FFV1_1( w_fp[11], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] ); // Amplitude(s) for diagram number 88 - FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1659,7 +1659,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 89 - FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1669,10 +1669,10 @@ namespace mg5amcCpu // *** DIAGRAM 90 OF 123 *** // Wavefunction(s) for diagram number 90 - FFV1_1( w_fp[14], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[24] ); + FFV1_1( w_fp[14], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] ); // Amplitude(s) for diagram number 90 - FFV1_0( 
w_fp[16], w_fp[24], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 90 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1685,7 +1685,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 91 - FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1699,7 +1699,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 92 - FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1715,7 +1715,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 93 - VVVV1_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1724,7 +1724,7 @@ namespace mg5amcCpu jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1733,7 +1733,7 @@ namespace mg5amcCpu jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; @@ -1746,10 +1746,10 @@ namespace mg5amcCpu // *** DIAGRAM 94 OF 123 *** // Wavefunction(s) for diagram number 94 - VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 0., 0., w_fp[22] ); + VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[22] ); // Amplitude(s) for diagram number 94 - VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1766,10 +1766,10 @@ namespace mg5amcCpu // *** DIAGRAM 95 OF 123 *** // Wavefunction(s) for diagram number 95 - VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 0., 0., w_fp[25] ); + VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[25] ); // Amplitude(s) for diagram number 95 - VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1789,7 +1789,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 96 - FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[14], 
w_fp[22], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1805,7 +1805,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 97 - FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1819,7 +1819,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 98 - FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1835,7 +1835,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 99 - FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1849,7 +1849,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 100 - VVVV1_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1858,7 +1858,7 @@ namespace mg5amcCpu jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; @@ -1867,7 +1867,7 @@ namespace mg5amcCpu jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; @@ -1880,10 +1880,10 @@ namespace mg5amcCpu // *** DIAGRAM 101 OF 123 *** // Wavefunction(s) for diagram number 101 - VVV1P0_1( w_fp[0], w_fp[18], COUPs[0], 0., 0., w_fp[6] ); + VVV1P0_1( w_fp[0], w_fp[18], COUPs[0], 1.0, 0., 0., w_fp[6] ); // Amplitude(s) for diagram number 101 - VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1903,7 +1903,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 102 - VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1923,7 +1923,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) 
for diagram number 103 - FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1939,7 +1939,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 104 - FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1953,7 +1953,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 105 - FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1969,7 +1969,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 106 - FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1983,7 +1983,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 107 - VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1992,7 +1992,7 @@ namespace mg5amcCpu jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2001,7 +2001,7 @@ namespace mg5amcCpu jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &_fp[0] ); jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2017,7 +2017,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 108 - VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2037,7 +2037,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 109 - VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2057,7 +2057,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 110 - FFV1_0( w_fp[13], w_fp[20], w_fp[1], 
COUPs[1], &_fp[0] ); + FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2070,7 +2070,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 111 - FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2083,7 +2083,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 112 - FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2096,7 +2096,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 113 - FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2106,12 +2106,12 @@ namespace mg5amcCpu // *** DIAGRAM 114 OF 123 *** // Wavefunction(s) for diagram number 114 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[12] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[24] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[21] ); + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[12] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[24] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] ); // Amplitude(s) for diagram number 114 - VVV1_0( w_fp[12], w_fp[7], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[12], w_fp[7], w_fp[5], COUPs[0], 1.0, &_fp[0] ); jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2120,7 +2120,7 @@ namespace mg5amcCpu jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[24], w_fp[7], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[24], w_fp[7], w_fp[5], COUPs[0], 1.0, &_fp[0] ); jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2129,7 +2129,7 @@ namespace mg5amcCpu jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[21], w_fp[7], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[21], w_fp[7], w_fp[5], COUPs[0], 1.0, &_fp[0] ); jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2145,17 +2145,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 115 - FFV1_0( w_fp[3], w_fp[14], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[14], w_fp[12], COUPs[1], 1.0, &_fp[0] ); jamp_sv[18] += amp_sv[0]; jamp_sv[19] -= amp_sv[0]; jamp_sv[21] -= amp_sv[0]; jamp_sv[23] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[14], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[14], w_fp[24], COUPs[1], 1.0, &_fp[0] ); jamp_sv[19] -= amp_sv[0]; jamp_sv[20] += amp_sv[0]; 
jamp_sv[21] -= amp_sv[0]; jamp_sv[22] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[18] -= amp_sv[0]; jamp_sv[20] += amp_sv[0]; jamp_sv[22] += amp_sv[0]; @@ -2167,17 +2167,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 116 - FFV1_0( w_fp[13], w_fp[2], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[13], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; jamp_sv[8] -= amp_sv[0]; jamp_sv[14] += amp_sv[0]; - FFV1_0( w_fp[13], w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[13], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); jamp_sv[2] -= amp_sv[0]; jamp_sv[6] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; jamp_sv[12] += amp_sv[0]; - FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] -= amp_sv[0]; jamp_sv[6] += amp_sv[0]; jamp_sv[12] += amp_sv[0]; @@ -2186,12 +2186,12 @@ namespace mg5amcCpu // *** DIAGRAM 117 OF 123 *** // Wavefunction(s) for diagram number 117 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[21] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[13] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[24] ); + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] ); // Amplitude(s) for diagram number 117 - VVV1_0( w_fp[21], w_fp[7], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[21], w_fp[7], w_fp[4], COUPs[0], 1.0, &_fp[0] ); jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2200,7 +2200,7 @@ namespace mg5amcCpu jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[13], w_fp[7], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[13], w_fp[7], w_fp[4], COUPs[0], 1.0, &_fp[0] ); jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2209,7 +2209,7 @@ namespace mg5amcCpu jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[24], w_fp[7], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[24], w_fp[7], w_fp[4], COUPs[0], 1.0, &_fp[0] ); jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2225,17 +2225,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 118 - FFV1_0( w_fp[3], w_fp[11], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[11], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[12] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; jamp_sv[15] -= amp_sv[0]; jamp_sv[17] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[11], w_fp[13], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[11], w_fp[13], COUPs[1], 1.0, &_fp[0] ); jamp_sv[13] -= amp_sv[0]; jamp_sv[14] += amp_sv[0]; jamp_sv[15] -= amp_sv[0]; jamp_sv[16] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[11], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[11], w_fp[24], COUPs[1], 1.0, &_fp[0] ); jamp_sv[12] -= amp_sv[0]; jamp_sv[14] += amp_sv[0]; jamp_sv[16] += amp_sv[0]; @@ -2247,17 +2247,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 119 - FFV1_0( w_fp[15], 
w_fp[2], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; jamp_sv[10] -= amp_sv[0]; jamp_sv[20] += amp_sv[0]; - FFV1_0( w_fp[15], w_fp[2], w_fp[13], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[2], w_fp[13], COUPs[1], 1.0, &_fp[0] ); jamp_sv[4] -= amp_sv[0]; jamp_sv[7] += amp_sv[0]; jamp_sv[10] -= amp_sv[0]; jamp_sv[18] += amp_sv[0]; - FFV1_0( w_fp[15], w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); jamp_sv[1] -= amp_sv[0]; jamp_sv[7] += amp_sv[0]; jamp_sv[18] += amp_sv[0]; @@ -2266,22 +2266,22 @@ namespace mg5amcCpu // *** DIAGRAM 120 OF 123 *** // Wavefunction(s) for diagram number 120 - VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[24] ); - VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[15] ); - VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[13] ); + VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] ); + VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] ); + VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] ); // Amplitude(s) for diagram number 120 - FFV1_0( w_fp[3], w_fp[9], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[9], w_fp[24], COUPs[1], 1.0, &_fp[0] ); jamp_sv[6] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; jamp_sv[9] -= amp_sv[0]; jamp_sv[11] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[9], w_fp[15], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[9], w_fp[15], COUPs[1], 1.0, &_fp[0] ); jamp_sv[7] -= amp_sv[0]; jamp_sv[8] += amp_sv[0]; jamp_sv[9] -= amp_sv[0]; jamp_sv[10] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[9], w_fp[13], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[9], w_fp[13], COUPs[1], 1.0, &_fp[0] ); jamp_sv[6] -= amp_sv[0]; jamp_sv[8] += amp_sv[0]; jamp_sv[10] += amp_sv[0]; @@ -2293,17 +2293,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 121 - FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; jamp_sv[16] -= amp_sv[0]; jamp_sv[22] += amp_sv[0]; - FFV1_0( w_fp[16], w_fp[2], w_fp[15], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[2], w_fp[15], COUPs[1], 1.0, &_fp[0] ); jamp_sv[5] -= amp_sv[0]; jamp_sv[13] += amp_sv[0]; jamp_sv[16] -= amp_sv[0]; jamp_sv[19] += amp_sv[0]; - FFV1_0( w_fp[16], w_fp[2], w_fp[13], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[2], w_fp[13], COUPs[1], 1.0, &_fp[0] ); jamp_sv[3] -= amp_sv[0]; jamp_sv[13] += amp_sv[0]; jamp_sv[19] += amp_sv[0]; @@ -2315,7 +2315,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 122 - VVV1_0( w_fp[24], w_fp[1], w_fp[7], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[24], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2324,7 +2324,7 @@ namespace mg5amcCpu jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[15], w_fp[1], w_fp[7], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[15], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2333,7 +2333,7 @@ namespace mg5amcCpu jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - 
VVV1_0( w_fp[13], w_fp[1], w_fp[7], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[13], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2349,7 +2349,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 123 - VVV1_0( w_fp[0], w_fp[17], w_fp[7], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[17], w_fp[7], COUPs[0], 1.0, &_fp[0] ); jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2358,7 +2358,7 @@ namespace mg5amcCpu jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[0], w_fp[19], w_fp[7], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[19], w_fp[7], COUPs[0], 1.0, &_fp[0] ); jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2367,7 +2367,7 @@ namespace mg5amcCpu jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2813,12 +2813,12 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - fptype allMEsLast = 0; + { /* clang-format on */ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid - allMEs[ievt] = 0; for( int ihel = 0; ihel < ncomb; ihel++ ) { + // NEW IMPLEMENTATION OF GETGOODHEL (#630): RESET THE RUNNING SUM OVER HELICITIES TO 0 BEFORE ADDING A NEW HELICITY + allMEs[ievt] = 0; // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -2827,12 +2827,11 @@ namespace mg5amcCpu #else calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); #endif - if( allMEs[ievt] != allMEsLast ) + if( allMEs[ievt] != 0 ) // NEW IMPLEMENTATION OF GETGOODHEL (#630): COMPARE EACH HELICITY CONTRIBUTION TO 0 { //if ( !isGoodHel[ihel] ) std::cout << "sigmaKin_getGoodHel ihel=" << ihel << " TRUE" << std::endl; isGoodHel[ihel] = true; } - allMEsLast = allMEs[ievt]; // running sum up to helicity ihel for event ievt } } #else @@ -2851,19 +2850,11 @@ namespace mg5amcCpu //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] // Allocate arrays at build time to contain at least 16 events (or at least neppV events if neppV>16, e.g. 
in future VPUs) constexpr int maxtry0 = std::max( 16, neppV ); // 16, but at least neppV (otherwise the npagV loop does not even start) - fptype allMEsLast[maxtry0] = { 0 }; // allocated at build time: maxtry0 must be a constexpr // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) [...] - // nprocesses > 1 was last observed for "mirror processes" in uux_ttx in the 270 branch (see issue #343 and PRs #360 and #396) + // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; - static_assert( nprocesses == 1, "Assume nprocesses == 1" ); - // process_id corresponds to the index of DSIG1 Fortran functions (must be 1 because cudacpp is unable to handle DSIG2) + static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter static_assert( process_id == 1, "Assume process_id == 1" ); } @@ -3057,23 +3053,26 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 - fptype targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + const unsigned int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + fptype targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } } #endif @@ -3168,57 +3167,60 @@ namespace mg5amcCpu #endif } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 // Event-by-event random choice of color #402 - fptype_sv targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else -
targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + fptype_sv targetamp2[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp2[icolC] = fptype_sv{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + } +#endif + for( int ieppV = 0; ieppV < neppV; ++ieppV ) + { + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { #if defined MGONGPU_CPPSIMD - const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); #endif - if( okcol ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( okcol ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + const int ievt2 = ievt00 + ieppV + neppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + { + allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #endif + } } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h index d85e33bfee..deb1358992 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f index 9d747e6dc1..adf0afbe05 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f index d12d34daf6..e4e527260c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -39,6 +39,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C LOCAL VARIABLES C INTEGER I,ITYPE,LP,IPROC + DOUBLE PRECISION QSCALE DOUBLE PRECISION G1 DOUBLE PRECISION G2 DOUBLE PRECISION XPQ(-7:7),PD(0:MAXPROC) @@ -126,11 +127,24 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(1))).GE.1) THEN !LP=SIGN(1,LPP(IB(1))) - G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)),DSQRT(Q2FACT(IB(1)))) + IF (DSQRT(Q2FACT(IB(1))).EQ.0D0) THEN + QSCALE=0D0 + DO I=3,NEXTERNAL + QSCALE=QSCALE+DSQRT(MAX(0D0,(PP(0,I)+PP(3,I))*(PP(0,I) + $ -PP(3,I)))) + ENDDO + QSCALE=QSCALE/2D0 + ELSE + QSCALE=DSQRT(Q2FACT(IB(1))) + ENDIF + G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN !LP=SIGN(1,LPP(IB(2))) - G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)),DSQRT(Q2FACT(IB(2)))) + IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN + QSCALE=DSQRT(Q2FACT(IB(2))) + ENDIF + G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF PD(0) = 0D0 IPROC = 0 @@ -202,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -249,6 +263,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C C LOCAL VARIABLES C + DOUBLE PRECISION QSCALE INTEGER I,ITYPE,LP,IPROC DOUBLE PRECISION G1(VECSIZE_MEMMAX) DOUBLE PRECISION G2(VECSIZE_MEMMAX) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc index 1bad694d1c..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc @@ -29,7 +29,9 @@ #include #include +#include <cfenv> // for feenableexcept #include +#include <csignal> // for signal and SIGFPE #include #include #include @@ -74,6 +76,23 @@ usage( char* argv0, int ret = 1 ) return ret; } +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + inline void FPEhandler( int sig ) + { +#ifdef __CUDACC__ + std::cerr << "Floating Point Exception (GPU)" << std::endl; +#else + std::cerr << "Floating Point Exception (CPU)" << std::endl; +#endif + exit( 0 ); + } +} + int main( int argc, char** argv ) { @@ -84,6 +103,18 @@ main( int argc, char** argv ) using namespace mg5amcCpu; #endif + // Enable FPEs (test #701 and #733 - except on MacOS where feenableexcept is not defined #730) +#ifndef __APPLE__ + const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" ); + const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" ); + if( enableFPE ) + { + std::cout << "WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions" << std::endl; + feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 + signal( SIGFPE, FPEhandler ); + } +#endif + // DEFAULTS FOR COMMAND LINE ARGUMENTS bool verbose = false; bool debug = false; @@ -103,12 +134,14 @@ main( int argc, char** argv ) CurandHost = 1, CurandDevice = 2 }; -#ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU -#elif not defined MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#ifdef MGONGPU_HAS_NO_CURAND + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random +#elif defined __CUDACC__ + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
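// [Editorial note, not part of the generated patch: an illustrative sketch of why the NB
// above holds. RamboHost samples momenta on the host and therefore consumes random numbers
// from a host-side buffer (hstRndmom below), while CurandDevice writes its randoms into
// device memory (devRndmom), so it can only pair with device-side sampling on CUDA builds.
// A hypothetical compatibility table, inferred from the defaults chosen in this hunk:
//   CommonRandom + RamboHost  -> ok (both host-side, works in any build)
//   CurandHost   + RamboHost  -> ok (curand fills a host buffer)
//   CurandDevice + RamboHost  -> not ok (randoms live on the device)
// The table is an editorial inference, not generated code.]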
enum class RamboSamplingMode @@ -146,18 +179,20 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef __CUDACC__ - rndgen = RandomNumberMode::CurandDevice; +#ifndef __CUDACC__ + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); +#elif defined MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + rndgen = RandomNumberMode::CurandDevice; #endif } else if( arg == "--curhst" ) { -#ifndef MGONGPU_HAS_NO_CURAND - rndgen = RandomNumberMode::CurandHost; -#else +#ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); +#else + rndgen = RandomNumberMode::CurandHost; #endif } else if( arg == "--common" ) @@ -278,10 +313,10 @@ main( int argc, char** argv ) const std::string procKey = "0a ProcInit"; timermap.start( procKey ); - // Create a process object + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? (for instance, in bridge mode this will be called twice here?) CPPProcess process( verbose ); - - // Read param_card and set parameters process.initProc( "../../Cards/param_card.dat" ); const fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) //const fptype energy = 91.2; // Ecms = 91.2 GeV (Z peak) @@ -389,30 +424,26 @@ main( int argc, char** argv ) { prnk.reset( new CommonRandomNumberKernel( hstRndmom ) ); } -#ifndef MGONGPU_HAS_NO_CURAND else if( rndgen == RandomNumberMode::CurandHost ) { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! CurandHost is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#else const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif } -#ifdef __CUDACC__ else { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __CUDACC__ const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); - } #else - else - { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) - } + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif -#else - else - { - throw std::logic_error( "This application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) - } + } -#endif // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr<SamplingKernelBase> prsk; @@ -747,7 +778,7 @@ main( int argc, char** argv ) wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT?
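// [Editorial note, not part of the generated patch: the block below encodes the build's
// floating-point configuration in the workflow tag. A minimal sketch of the three cases,
// assuming the MGONGPU_FPTYPE* macros used throughout this file:
//   MGONGPU_FPTYPE_DOUBLE with MGONGPU_FPTYPE2_FLOAT -> "MIX+" (double-precision matrix
//     elements with single-precision color algebra, the mixed mode of #537);
//   MGONGPU_FPTYPE_DOUBLE alone -> "DBL+"; the float-only build -> "FLT+".]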
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -757,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/counters.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/counters.cc index 71fa817036..3bbdec9387 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/counters.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/counters.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "timer.h" #define TIMERTYPE std::chrono::high_resolution_clock @@ -36,13 +36,10 @@ extern "C" static mgOnGpu::Timer<TIMERTYPE> program_timer; static float program_totaltime = 0; - static mgOnGpu::Timer<TIMERTYPE> matrix1_timer; - static float matrix1_totaltime = 0; static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer; static float smatrix1_totaltime = 0; static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC]; static float smatrix1multi_totaltime[nimplC] = { 0 }; - static int matrix1_counter = 0; static int smatrix1_counter = 0; static int smatrix1multi_counter[nimplC] = { 0 }; @@ -52,19 +49,6 @@ extern "C" return; } - void counters_matrix1_start_() - { - matrix1_counter++; - matrix1_timer.Start(); - return; - } - - void counters_matrix1_stop_() - { - matrix1_totaltime += matrix1_timer.GetDuration(); - return; - } - void counters_smatrix1_start_() { smatrix1_counter++; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f index dc6e4b80f3..272c6bd97d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -349,7 +349,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -391,7 +391,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C LOCAL VARIABLES C INTEGER I,J,M,N - COMPLEX*16 ZTEMP, TMP_JAMP(163) + COMPLEX*16 ZTEMP, TMP_JAMP(155) REAL*8 CF(NCOLOR,NCOLOR) COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) @@ -830,7 +830,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C ---------- C BEGIN CODE C ---------- - call counters_matrix1_start() IF (FIRST) THEN FIRST=.FALSE.
IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO @@ -1219,362 +1218,318 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) TMP_JAMP(3) = AMP(84) + AMP(86) ! used 8 times TMP_JAMP(2) = AMP(81) - AMP(83) ! used 8 times TMP_JAMP(1) = AMP(82) + AMP(85) ! used 8 times - TMP_JAMP(30) = TMP_JAMP(11) + AMP(121) ! used 8 times - TMP_JAMP(29) = TMP_JAMP(12) - AMP(132) ! used 8 times + TMP_JAMP(30) = TMP_JAMP(15) - AMP(157) ! used 8 times + TMP_JAMP(29) = TMP_JAMP(14) + AMP(159) ! used 8 times TMP_JAMP(28) = TMP_JAMP(13) + AMP(130) ! used 8 times - TMP_JAMP(27) = TMP_JAMP(14) + AMP(159) ! used 8 times - TMP_JAMP(26) = TMP_JAMP(15) - AMP(157) ! used 8 times - TMP_JAMP(25) = TMP_JAMP(8) - AMP(131) ! used 8 times + TMP_JAMP(27) = TMP_JAMP(12) - AMP(132) ! used 8 times + TMP_JAMP(26) = TMP_JAMP(11) + AMP(121) ! used 8 times + TMP_JAMP(25) = TMP_JAMP(10) + AMP(154) ! used 8 times TMP_JAMP(24) = TMP_JAMP(9) - AMP(156) ! used 8 times - TMP_JAMP(23) = TMP_JAMP(10) + AMP(154) ! used 8 times - TMP_JAMP(22) = TMP_JAMP(6) + AMP(114) ! used 8 times - TMP_JAMP(21) = TMP_JAMP(7) + AMP(158) ! used 8 times - TMP_JAMP(20) = TMP_JAMP(4) - AMP(141) ! used 8 times - TMP_JAMP(19) = TMP_JAMP(5) + AMP(139) ! used 8 times - TMP_JAMP(18) = TMP_JAMP(2) + AMP(105) ! used 8 times - TMP_JAMP(17) = TMP_JAMP(3) - AMP(155) ! used 8 times + TMP_JAMP(23) = TMP_JAMP(8) - AMP(131) ! used 8 times + TMP_JAMP(22) = TMP_JAMP(7) + AMP(158) ! used 8 times + TMP_JAMP(21) = TMP_JAMP(6) + AMP(114) ! used 8 times + TMP_JAMP(20) = TMP_JAMP(5) + AMP(139) ! used 8 times + TMP_JAMP(19) = TMP_JAMP(4) - AMP(141) ! used 8 times + TMP_JAMP(18) = TMP_JAMP(3) - AMP(155) ! used 8 times + TMP_JAMP(17) = TMP_JAMP(2) + AMP(105) ! used 8 times TMP_JAMP(16) = TMP_JAMP(1) - AMP(140) ! used 8 times - TMP_JAMP(90) = AMP(108) + AMP(133) ! used 4 times - TMP_JAMP(89) = AMP(51) + AMP(52) ! used 4 times - TMP_JAMP(88) = AMP(40) - AMP(54) ! used 4 times - TMP_JAMP(87) = AMP(11) - AMP(135) ! used 4 times - TMP_JAMP(86) = TMP_JAMP(26) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(51) ! used 4 times - TMP_JAMP(85) = TMP_JAMP(28) + TMP_JAMP(27) ! used 4 times - TMP_JAMP(84) = TMP_JAMP(29) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(11) ! used 4 times - TMP_JAMP(83) = TMP_JAMP(30) + TMP_JAMP(29) ! used 4 times - TMP_JAMP(82) = AMP(102) + AMP(151) ! used 4 times - TMP_JAMP(81) = AMP(69) - AMP(134) ! used 4 times - TMP_JAMP(80) = AMP(59) - AMP(153) ! used 4 times - TMP_JAMP(79) = TMP_JAMP(23) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(102) ! used 4 times - TMP_JAMP(78) = TMP_JAMP(24) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(59) ! used 4 times - TMP_JAMP(77) = TMP_JAMP(25) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(69) ! used 4 times - TMP_JAMP(76) = TMP_JAMP(29) + TMP_JAMP(25) ! used 4 times - TMP_JAMP(75) = TMP_JAMP(30) - TMP_JAMP(23) ! used 4 times - TMP_JAMP(74) = AMP(43) - AMP(53) ! used 4 times - TMP_JAMP(73) = TMP_JAMP(21) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(43) ! used 4 times - TMP_JAMP(72) = TMP_JAMP(22) + ((0.000000000000000D+00, + TMP_JAMP(80) = TMP_JAMP(30) + TMP_JAMP(29) ! used 4 times + TMP_JAMP(79) = TMP_JAMP(30) - TMP_JAMP(22) ! used 4 times + TMP_JAMP(78) = TMP_JAMP(29) + TMP_JAMP(22) ! used 4 times + TMP_JAMP(77) = TMP_JAMP(28) + TMP_JAMP(27) ! used 4 times + TMP_JAMP(76) = TMP_JAMP(28) - TMP_JAMP(23) ! used 4 times + TMP_JAMP(75) = TMP_JAMP(27) + TMP_JAMP(23) ! used 4 times + TMP_JAMP(74) = TMP_JAMP(27) + TMP_JAMP(19) ! 
used 4 times + TMP_JAMP(73) = TMP_JAMP(26) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(18) ! used 4 times + TMP_JAMP(72) = TMP_JAMP(26) - TMP_JAMP(25) ! used 4 times + TMP_JAMP(71) = TMP_JAMP(26) - TMP_JAMP(19) ! used 4 times + TMP_JAMP(70) = TMP_JAMP(26) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(17) ! used 4 times + TMP_JAMP(69) = TMP_JAMP(25) + TMP_JAMP(24) ! used 4 times + TMP_JAMP(68) = TMP_JAMP(25) - TMP_JAMP(18) ! used 4 times + TMP_JAMP(67) = TMP_JAMP(24) - TMP_JAMP(23) ! used 4 times + TMP_JAMP(66) = TMP_JAMP(24) + TMP_JAMP(18) ! used 4 times + TMP_JAMP(65) = TMP_JAMP(22) + TMP_JAMP(20) ! used 4 times + TMP_JAMP(64) = TMP_JAMP(21) + ((0.000000000000000D+00, $ -1.000000000000000D+00)) * AMP(62) ! used 4 times - TMP_JAMP(71) = TMP_JAMP(22) + TMP_JAMP(21) ! used 4 times - TMP_JAMP(70) = TMP_JAMP(27) + TMP_JAMP(21) ! used 4 times - TMP_JAMP(69) = TMP_JAMP(28) - TMP_JAMP(25) ! used 4 times - TMP_JAMP(68) = AMP(119) + AMP(145) ! used 4 times - TMP_JAMP(67) = AMP(14) - AMP(147) ! used 4 times - TMP_JAMP(66) = TMP_JAMP(20) + TMP_JAMP(19) ! used 4 times - TMP_JAMP(65) = TMP_JAMP(22) - TMP_JAMP(19) ! used 4 times - TMP_JAMP(64) = TMP_JAMP(29) + TMP_JAMP(20) ! used 4 times - TMP_JAMP(63) = AMP(77) - AMP(152) ! used 4 times - TMP_JAMP(62) = TMP_JAMP(17) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(77) ! used 4 times - TMP_JAMP(61) = TMP_JAMP(18) + ((-0.000000000000000D+00 + TMP_JAMP(63) = TMP_JAMP(21) - TMP_JAMP(20) ! used 4 times + TMP_JAMP(62) = TMP_JAMP(21) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(61) ! used 4 times + TMP_JAMP(61) = TMP_JAMP(20) + TMP_JAMP(19) ! used 4 times + TMP_JAMP(60) = TMP_JAMP(20) - TMP_JAMP(16) ! used 4 times + TMP_JAMP(59) = TMP_JAMP(19) + TMP_JAMP(16) ! used 4 times + TMP_JAMP(58) = TMP_JAMP(18) - TMP_JAMP(16) ! used 4 times + TMP_JAMP(57) = TMP_JAMP(17) + ((-0.000000000000000D+00 $ ,1.000000000000000D+00)) * AMP(79) ! used 4 times - TMP_JAMP(60) = TMP_JAMP(24) + TMP_JAMP(17) ! used 4 times - TMP_JAMP(59) = TMP_JAMP(28) - TMP_JAMP(18) ! used 4 times - TMP_JAMP(58) = AMP(89) - AMP(146) ! used 4 times - TMP_JAMP(57) = TMP_JAMP(20) + TMP_JAMP(16) ! used 4 times - TMP_JAMP(56) = AMP(117) + AMP(142) ! used 4 times - TMP_JAMP(55) = AMP(8) - AMP(144) ! used 4 times - TMP_JAMP(54) = TMP_JAMP(19) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(117) ! used 4 times - TMP_JAMP(53) = TMP_JAMP(20) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(8) ! used 4 times - TMP_JAMP(52) = TMP_JAMP(26) - TMP_JAMP(21) ! used 4 times - TMP_JAMP(51) = TMP_JAMP(30) - TMP_JAMP(20) ! used 4 times - TMP_JAMP(50) = AMP(87) - AMP(143) ! used 4 times - TMP_JAMP(49) = TMP_JAMP(16) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(87) ! used 4 times - TMP_JAMP(48) = TMP_JAMP(23) - TMP_JAMP(17) ! used 4 times - TMP_JAMP(47) = TMP_JAMP(18) + ((0.000000000000000D+00, + TMP_JAMP(56) = TMP_JAMP(17) - TMP_JAMP(16) ! used 4 times + TMP_JAMP(55) = TMP_JAMP(17) + ((0.000000000000000D+00, $ -1.000000000000000D+00)) * AMP(80) ! used 4 times - TMP_JAMP(46) = TMP_JAMP(19) - TMP_JAMP(16) ! used 4 times - TMP_JAMP(45) = TMP_JAMP(27) + TMP_JAMP(18) ! used 4 times - TMP_JAMP(44) = TMP_JAMP(28) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(110) ! used 4 times - TMP_JAMP(43) = TMP_JAMP(29) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(16) ! used 4 times - TMP_JAMP(42) = TMP_JAMP(22) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(61) ! 
used 4 times - TMP_JAMP(41) = TMP_JAMP(24) - TMP_JAMP(22) ! used 4 times - TMP_JAMP(40) = TMP_JAMP(25) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(71) ! used 4 times - TMP_JAMP(39) = AMP(96) + AMP(148) ! used 4 times - TMP_JAMP(38) = TMP_JAMP(23) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(96) ! used 4 times - TMP_JAMP(37) = TMP_JAMP(24) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(56) ! used 4 times - TMP_JAMP(36) = TMP_JAMP(26) + TMP_JAMP(23) ! used 4 times - TMP_JAMP(35) = TMP_JAMP(17) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(74) ! used 4 times - TMP_JAMP(34) = TMP_JAMP(30) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(17) ! used 4 times - TMP_JAMP(33) = TMP_JAMP(26) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(33) ! used 4 times - TMP_JAMP(32) = TMP_JAMP(27) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(22) ! used 4 times - TMP_JAMP(31) = TMP_JAMP(21) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(25) ! used 4 times - TMP_JAMP(98) = TMP_JAMP(43) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(138) ! used 4 times - TMP_JAMP(97) = TMP_JAMP(44) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(136) ! used 4 times - TMP_JAMP(96) = TMP_JAMP(40) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(137) ! used 4 times - TMP_JAMP(95) = TMP_JAMP(37) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(150) ! used 4 times - TMP_JAMP(94) = TMP_JAMP(35) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(149) ! used 4 times - TMP_JAMP(93) = TMP_JAMP(32) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(36) ! used 4 times - TMP_JAMP(92) = TMP_JAMP(33) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(34) ! used 4 times - TMP_JAMP(91) = TMP_JAMP(31) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(35) ! used 4 times - TMP_JAMP(151) = AMP(18) + ((0.000000000000000D+00, + TMP_JAMP(54) = AMP(108) + AMP(133) ! used 4 times + TMP_JAMP(53) = AMP(51) + AMP(52) ! used 4 times + TMP_JAMP(52) = AMP(40) - AMP(54) ! used 4 times + TMP_JAMP(51) = AMP(11) - AMP(135) ! used 4 times + TMP_JAMP(50) = AMP(102) + AMP(151) ! used 4 times + TMP_JAMP(49) = AMP(69) - AMP(134) ! used 4 times + TMP_JAMP(48) = AMP(59) - AMP(153) ! used 4 times + TMP_JAMP(47) = AMP(43) - AMP(53) ! used 4 times + TMP_JAMP(46) = AMP(119) + AMP(145) ! used 4 times + TMP_JAMP(45) = AMP(14) - AMP(147) ! used 4 times + TMP_JAMP(44) = AMP(77) - AMP(152) ! used 4 times + TMP_JAMP(43) = AMP(89) - AMP(146) ! used 4 times + TMP_JAMP(42) = AMP(117) + AMP(142) ! used 4 times + TMP_JAMP(41) = AMP(8) - AMP(144) ! used 4 times + TMP_JAMP(40) = AMP(87) - AMP(143) ! used 4 times + TMP_JAMP(39) = AMP(110) + AMP(136) ! used 4 times + TMP_JAMP(38) = AMP(16) - AMP(138) ! used 4 times + TMP_JAMP(37) = AMP(71) - AMP(137) ! used 4 times + TMP_JAMP(36) = AMP(96) + AMP(148) ! used 4 times + TMP_JAMP(35) = AMP(56) - AMP(150) ! used 4 times + TMP_JAMP(34) = AMP(74) - AMP(149) ! used 4 times + TMP_JAMP(33) = AMP(33) + AMP(34) ! used 4 times + TMP_JAMP(32) = AMP(22) - AMP(36) ! used 4 times + TMP_JAMP(31) = AMP(25) - AMP(35) ! used 4 times + TMP_JAMP(142) = TMP_JAMP(80) + TMP_JAMP(77) ! used 2 times + TMP_JAMP(141) = TMP_JAMP(80) + TMP_JAMP(68) ! used 2 times + TMP_JAMP(140) = TMP_JAMP(79) - TMP_JAMP(61) ! used 2 times + TMP_JAMP(139) = TMP_JAMP(79) + TMP_JAMP(69) ! 
used 2 times + TMP_JAMP(138) = TMP_JAMP(78) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(47) ! used 2 times + TMP_JAMP(137) = TMP_JAMP(77) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(39) ! used 2 times + TMP_JAMP(136) = TMP_JAMP(76) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(54) ! used 2 times + TMP_JAMP(135) = TMP_JAMP(76) + TMP_JAMP(66) ! used 2 times + TMP_JAMP(134) = TMP_JAMP(76) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(39) ! used 2 times + TMP_JAMP(133) = TMP_JAMP(75) + TMP_JAMP(61) ! used 2 times + TMP_JAMP(132) = TMP_JAMP(73) + AMP(50) ! used 2 times + TMP_JAMP(131) = TMP_JAMP(70) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(36) ! used 2 times + TMP_JAMP(130) = TMP_JAMP(67) - TMP_JAMP(27) ! used 2 times + TMP_JAMP(129) = TMP_JAMP(61) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(46) ! used 2 times + TMP_JAMP(128) = TMP_JAMP(61) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(41) ! used 2 times + TMP_JAMP(127) = TMP_JAMP(58) - TMP_JAMP(25) ! used 2 times + TMP_JAMP(126) = TMP_JAMP(58) + TMP_JAMP(24) ! used 2 times + TMP_JAMP(125) = TMP_JAMP(58) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(43) ! used 2 times + TMP_JAMP(124) = TMP_JAMP(55) + AMP(111) ! used 2 times + TMP_JAMP(123) = TMP_JAMP(54) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(109) ! used 2 times + TMP_JAMP(122) = TMP_JAMP(53) + ((0.000000000000000D+00, $ -1.000000000000000D+00)) * AMP(50) ! used 2 times - TMP_JAMP(150) = TMP_JAMP(87) + ((0.000000000000000D+00, + TMP_JAMP(121) = TMP_JAMP(53) - TMP_JAMP(47) ! used 2 times + TMP_JAMP(120) = TMP_JAMP(52) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(42) ! used 2 times + TMP_JAMP(119) = TMP_JAMP(52) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(41) ! used 2 times + TMP_JAMP(118) = TMP_JAMP(51) + ((0.000000000000000D+00, $ -1.000000000000000D+00)) * AMP(10) ! used 2 times - TMP_JAMP(149) = TMP_JAMP(90) + TMP_JAMP(88) ! used 2 times - TMP_JAMP(148) = TMP_JAMP(82) - AMP(18) ! used 2 times - TMP_JAMP(147) = TMP_JAMP(74) + ((-0.000000000000000D+00 + TMP_JAMP(117) = TMP_JAMP(51) + TMP_JAMP(49) ! used 2 times + TMP_JAMP(116) = TMP_JAMP(51) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(12) ! used 2 times + TMP_JAMP(115) = TMP_JAMP(50) - TMP_JAMP(44) ! used 2 times + TMP_JAMP(114) = TMP_JAMP(49) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(70) ! used 2 times + TMP_JAMP(113) = TMP_JAMP(48) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(60) ! used 2 times + TMP_JAMP(112) = TMP_JAMP(48) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(58) ! used 2 times + TMP_JAMP(111) = TMP_JAMP(47) + ((-0.000000000000000D+00 $ ,1.000000000000000D+00)) * AMP(44) ! used 2 times - TMP_JAMP(146) = TMP_JAMP(68) + TMP_JAMP(67) ! used 2 times - TMP_JAMP(145) = TMP_JAMP(77) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(134) ! used 2 times - TMP_JAMP(144) = AMP(79) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(109) ! used 2 times - TMP_JAMP(143) = TMP_JAMP(63) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(76) ! used 2 times - TMP_JAMP(142) = TMP_JAMP(90) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(59) ! used 2 times - TMP_JAMP(141) = TMP_JAMP(67) + TMP_JAMP(58) ! 
used 2 times - TMP_JAMP(140) = AMP(7) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(18) ! used 2 times - TMP_JAMP(139) = TMP_JAMP(54) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(142) ! used 2 times - TMP_JAMP(138) = TMP_JAMP(55) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(51) ! used 2 times - TMP_JAMP(137) = TMP_JAMP(89) - TMP_JAMP(74) ! used 2 times - TMP_JAMP(136) = TMP_JAMP(49) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(143) ! used 2 times - TMP_JAMP(135) = TMP_JAMP(82) - TMP_JAMP(63) ! used 2 times - TMP_JAMP(134) = AMP(41) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(80) ! used 2 times - TMP_JAMP(133) = TMP_JAMP(56) - TMP_JAMP(50) ! used 2 times - TMP_JAMP(132) = TMP_JAMP(88) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(45) ! used 2 times - TMP_JAMP(131) = TMP_JAMP(47) + AMP(111) ! used 2 times - TMP_JAMP(130) = TMP_JAMP(53) - AMP(9) ! used 2 times - TMP_JAMP(129) = TMP_JAMP(98) + TMP_JAMP(97) ! used 2 times - TMP_JAMP(128) = AMP(58) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(61) ! used 2 times - TMP_JAMP(127) = TMP_JAMP(80) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(41) ! used 2 times - TMP_JAMP(126) = TMP_JAMP(42) - AMP(72) ! used 2 times - TMP_JAMP(125) = TMP_JAMP(96) - TMP_JAMP(42) ! used 2 times - TMP_JAMP(124) = TMP_JAMP(98) + TMP_JAMP(96) ! used 2 times - TMP_JAMP(123) = TMP_JAMP(36) - AMP(39) ! used 2 times - TMP_JAMP(122) = TMP_JAMP(89) - TMP_JAMP(39) ! used 2 times - TMP_JAMP(121) = TMP_JAMP(95) - AMP(55) ! used 2 times - TMP_JAMP(120) = TMP_JAMP(58) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(46) ! used 2 times - TMP_JAMP(119) = TMP_JAMP(68) - TMP_JAMP(58) ! used 2 times - TMP_JAMP(118) = TMP_JAMP(94) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(58) ! used 2 times - TMP_JAMP(117) = TMP_JAMP(95) + TMP_JAMP(94) ! used 2 times - TMP_JAMP(116) = TMP_JAMP(94) - AMP(73) ! used 2 times - TMP_JAMP(115) = TMP_JAMP(95) + AMP(57) ! used 2 times - TMP_JAMP(114) = TMP_JAMP(96) - TMP_JAMP(95) ! used 2 times - TMP_JAMP(113) = TMP_JAMP(97) - TMP_JAMP(96) ! used 2 times - TMP_JAMP(112) = TMP_JAMP(38) + AMP(95) ! used 2 times - TMP_JAMP(111) = TMP_JAMP(67) + ((-0.000000000000000D+00 + TMP_JAMP(110) = TMP_JAMP(46) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(120) ! used 2 times + TMP_JAMP(109) = TMP_JAMP(46) - TMP_JAMP(43) ! used 2 times + TMP_JAMP(108) = TMP_JAMP(45) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(12) ! used 2 times + TMP_JAMP(107) = TMP_JAMP(45) + TMP_JAMP(43) ! used 2 times + TMP_JAMP(106) = TMP_JAMP(45) + ((-0.000000000000000D+00 $ ,1.000000000000000D+00)) * AMP(13) ! used 2 times - TMP_JAMP(110) = TMP_JAMP(67) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(34) ! used 2 times - TMP_JAMP(109) = TMP_JAMP(98) + AMP(15) ! used 2 times - TMP_JAMP(108) = TMP_JAMP(98) + TMP_JAMP(34) ! used 2 times - TMP_JAMP(107) = TMP_JAMP(61) - AMP(23) ! used 2 times - TMP_JAMP(106) = TMP_JAMP(93) + TMP_JAMP(92) ! used 2 times - TMP_JAMP(105) = TMP_JAMP(68) + ((-0.000000000000000D+00 + TMP_JAMP(105) = TMP_JAMP(44) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(76) ! used 2 times + TMP_JAMP(104) = TMP_JAMP(42) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(45) ! used 2 times + TMP_JAMP(103) = TMP_JAMP(42) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(118) ! 
used 2 times + TMP_JAMP(102) = TMP_JAMP(41) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(7) ! used 2 times + TMP_JAMP(101) = TMP_JAMP(40) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(78) ! used 2 times + TMP_JAMP(100) = TMP_JAMP(40) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(88) ! used 2 times + TMP_JAMP(99) = TMP_JAMP(39) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(111) ! used 2 times + TMP_JAMP(98) = TMP_JAMP(39) - TMP_JAMP(37) ! used 2 times + TMP_JAMP(97) = TMP_JAMP(38) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(9) ! used 2 times + TMP_JAMP(96) = TMP_JAMP(38) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(15) ! used 2 times + TMP_JAMP(95) = TMP_JAMP(37) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(72) ! used 2 times + TMP_JAMP(94) = TMP_JAMP(36) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(39) ! used 2 times + TMP_JAMP(93) = TMP_JAMP(35) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(55) ! used 2 times + TMP_JAMP(92) = TMP_JAMP(35) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(57) ! used 2 times + TMP_JAMP(91) = TMP_JAMP(34) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(75) ! used 2 times + TMP_JAMP(90) = TMP_JAMP(34) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(73) ! used 2 times + TMP_JAMP(89) = TMP_JAMP(33) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(21) ! used 2 times + TMP_JAMP(88) = TMP_JAMP(33) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(32) ! used 2 times + TMP_JAMP(87) = TMP_JAMP(32) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(23) ! used 2 times + TMP_JAMP(86) = TMP_JAMP(32) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(24) ! used 2 times + TMP_JAMP(85) = TMP_JAMP(31) + ((-0.000000000000000D+00 $ ,1.000000000000000D+00)) * AMP(27) ! used 2 times - TMP_JAMP(104) = TMP_JAMP(91) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(68) ! used 2 times - TMP_JAMP(103) = TMP_JAMP(93) + TMP_JAMP(91) ! used 2 times - TMP_JAMP(102) = TMP_JAMP(91) - AMP(26) ! used 2 times - TMP_JAMP(101) = TMP_JAMP(92) - TMP_JAMP(91) ! used 2 times - TMP_JAMP(100) = TMP_JAMP(97) + TMP_JAMP(93) ! used 2 times - TMP_JAMP(99) = TMP_JAMP(92) + TMP_JAMP(34) ! used 2 times - TMP_JAMP(163) = TMP_JAMP(149) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(42) ! used 2 times - TMP_JAMP(162) = TMP_JAMP(144) - TMP_JAMP(142) ! used 2 times - TMP_JAMP(161) = TMP_JAMP(140) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(138) ! used 2 times - TMP_JAMP(160) = TMP_JAMP(135) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(48) ! used 2 times - TMP_JAMP(159) = TMP_JAMP(133) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(46) ! used 2 times - TMP_JAMP(158) = TMP_JAMP(134) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(132) ! used 2 times - TMP_JAMP(157) = TMP_JAMP(130) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(144) ! used 2 times - TMP_JAMP(156) = TMP_JAMP(128) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(127) ! used 2 times - TMP_JAMP(155) = TMP_JAMP(123) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(122) ! used 2 times - TMP_JAMP(154) = TMP_JAMP(112) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(148) ! used 2 times - TMP_JAMP(153) = TMP_JAMP(100) + AMP(24) ! 
used 2 times - TMP_JAMP(152) = TMP_JAMP(99) + AMP(32) ! used 2 times + TMP_JAMP(84) = TMP_JAMP(31) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(26) ! used 2 times + TMP_JAMP(83) = TMP_JAMP(25) + AMP(95) ! used 2 times + TMP_JAMP(82) = AMP(18) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(101) ! used 2 times + TMP_JAMP(81) = AMP(79) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(90) ! used 2 times + TMP_JAMP(155) = TMP_JAMP(131) - TMP_JAMP(83) ! used 2 times + TMP_JAMP(154) = TMP_JAMP(119) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(55) ! used 2 times + TMP_JAMP(153) = TMP_JAMP(114) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(64) ! used 2 times + TMP_JAMP(152) = TMP_JAMP(111) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(64) ! used 2 times + TMP_JAMP(151) = TMP_JAMP(105) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(57) ! used 2 times + TMP_JAMP(150) = TMP_JAMP(103) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(62) ! used 2 times + TMP_JAMP(149) = TMP_JAMP(100) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(55) ! used 2 times + TMP_JAMP(148) = TMP_JAMP(95) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(62) ! used 2 times + TMP_JAMP(147) = TMP_JAMP(94) - TMP_JAMP(53) ! used 2 times + TMP_JAMP(146) = TMP_JAMP(89) - TMP_JAMP(50) ! used 2 times + TMP_JAMP(145) = TMP_JAMP(88) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(70) ! used 2 times + TMP_JAMP(144) = TMP_JAMP(84) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(62) ! used 2 times + TMP_JAMP(143) = TMP_JAMP(81) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(56) ! 
used 2 times JAMP(1,1) = (-1.000000000000000D+00)*AMP(30) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(109) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(152) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(153) - JAMP(2,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(27)+(-1.000000000000000D+00)*AMP(28)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(66)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(104)+TMP_JAMP(111) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(152) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(80) + $ +TMP_JAMP(86)+TMP_JAMP(96)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(137)+TMP_JAMP(145) + JAMP(2,1) = (-1.000000000000000D+00)*AMP(28) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(79) + $ +TMP_JAMP(85)+TMP_JAMP(106)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(129)+(-1.000000000000000D+00) + $ *TMP_JAMP(145) JAMP(3,1) = (-1.000000000000000D+00)*AMP(31) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(72) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(102) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(125) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(153) - JAMP(4,1) = (-1.000000000000000D+00)*AMP(19) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(21) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(26) - $ +AMP(151)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(79)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(101)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(156) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(78) + $ +(-1.000000000000000D+00)*TMP_JAMP(86)+TMP_JAMP(95) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(134) + $ +(-1.000000000000000D+00)*TMP_JAMP(144) + JAMP(4,1) = (-1.000000000000000D+00)*AMP(19)+TMP_JAMP(112) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(139) + $ +TMP_JAMP(144)+(-1.000000000000000D+00)*TMP_JAMP(146) JAMP(5,1) = (-1.000000000000000D+00)*AMP(29) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(90) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(103) - $ +(-1.000000000000000D+00)*TMP_JAMP(105)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(107)+TMP_JAMP(120) - JAMP(6,1) = (-1.000000000000000D+00)*AMP(20) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(21) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(76) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(106) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(107) - $ +(-1.000000000000000D+00)*TMP_JAMP(160) - JAMP(7,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(15)+((0.000000000000000D+00,1.000000000000000D+00))*AMP(57) - $ +(-1.000000000000000D+00)*AMP(93)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(108)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(114)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(154) - JAMP(8,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(13)+((0.000000000000000D+00,1.000000000000000D+00))*AMP(75) - $ +(-1.000000000000000D+00)*AMP(91)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(57)+(-1.000000000000000D+00) - $ *TMP_JAMP(110)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(118)+((0.000000000000000D+00,-1.000000000000000D+00)) - 
$ *TMP_JAMP(154) - JAMP(9,1) = (-1.000000000000000D+00)*AMP(94) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(113) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(115) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(116) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(131) - JAMP(10,1) = (-1.000000000000000D+00)*AMP(38) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(116) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(155) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(158) - JAMP(11,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(55)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(75)+(-1.000000000000000D+00)*AMP(92)+((0.000000000000000D - $ +00,1.000000000000000D+00))*AMP(120)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(46)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(72)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(117)+(-1.000000000000000D+00) - $ *TMP_JAMP(119) - JAMP(12,1) = (-1.000000000000000D+00)*AMP(37)+( - $ -1.000000000000000D+00)*AMP(62)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(71)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(121)+TMP_JAMP(147) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(29) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(65) + $ +(-1.000000000000000D+00)*TMP_JAMP(85)+(-1.000000000000000D+00) + $ *TMP_JAMP(87)+(-1.000000000000000D+00)*TMP_JAMP(109) + $ +TMP_JAMP(143) + JAMP(6,1) = (-1.000000000000000D+00)*AMP(20)+TMP_JAMP(87) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(141) + $ +TMP_JAMP(146)+TMP_JAMP(151) + JAMP(7,1) = (-1.000000000000000D+00)*AMP(93)+( + $ -1.000000000000000D+00)*TMP_JAMP(37)+TMP_JAMP(92)+( + $ -1.000000000000000D+00)*TMP_JAMP(96)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(130)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(155) + JAMP(8,1) = (-1.000000000000000D+00)*AMP(91) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(19) + $ +TMP_JAMP(91)+(-1.000000000000000D+00)*TMP_JAMP(106) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(125) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(155) - JAMP(13,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(118)+(-1.000000000000000D+00)*AMP(126) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(124) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(126) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(139) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(157) - JAMP(14,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(78)+(-1.000000000000000D+00)*AMP(98)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*AMP(118)+AMP(152) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(62) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(156) - $ +(-1.000000000000000D+00)*TMP_JAMP(159) - JAMP(15,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(88)+(-1.000000000000000D+00)*AMP(127)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(129)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*TMP_JAMP(131)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(136)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(157) - JAMP(16,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ 
*AMP(45)+(-1.000000000000000D+00)*AMP(47)+AMP(53) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(88) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(73) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(158) - $ +(-1.000000000000000D+00)*TMP_JAMP(159) - JAMP(17,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(78)+(-1.000000000000000D+00)*AMP(97)+((0.000000000000000D - $ +00,1.000000000000000D+00))*AMP(101)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(136)+(-1.000000000000000D+00) - $ *TMP_JAMP(160)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(161) - JAMP(18,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(45)+(-1.000000000000000D+00)*AMP(46)+((0.000000000000000D - $ +00,1.000000000000000D+00))*AMP(50)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(52)+(-1.000000000000000D+00) - $ *TMP_JAMP(137)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(139)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(161) - JAMP(19,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(12)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(90)+(-1.000000000000000D+00)*AMP(128)+(-1.000000000000000D - $ +00)*AMP(135)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(57)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(84)+(-1.000000000000000D+00)*TMP_JAMP(141)+( - $ -1.000000000000000D+00)*TMP_JAMP(162) - JAMP(20,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(60)+(-1.000000000000000D+00)*AMP(100)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(60)+(-1.000000000000000D - $ +00)*TMP_JAMP(80)+(-1.000000000000000D+00)*TMP_JAMP(143) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(145) - $ +TMP_JAMP(162) - JAMP(21,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(12)+(-1.000000000000000D+00)*AMP(62)+((0.000000000000000D - $ +00,1.000000000000000D+00))*AMP(70)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*AMP(120)+(-1.000000000000000D+00) - $ *AMP(129)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(64)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(65)+(-1.000000000000000D+00)*TMP_JAMP(87) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(145) - $ +TMP_JAMP(146) + JAMP(9,1) = (-1.000000000000000D+00)*AMP(94)+( + $ -1.000000000000000D+00)*TMP_JAMP(90)+(-1.000000000000000D+00) + $ *TMP_JAMP(92)+(-1.000000000000000D+00)*TMP_JAMP(98) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(124) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(135) + JAMP(10,1) = (-1.000000000000000D+00)*AMP(38)+TMP_JAMP(90) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(141) + $ +(-1.000000000000000D+00)*TMP_JAMP(147)+TMP_JAMP(154) + JAMP(11,1) = AMP(62)+(-1.000000000000000D+00)*AMP(92) + $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(120) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(63)+( + $ -1.000000000000000D+00)*TMP_JAMP(91)+(-1.000000000000000D+00) + $ *TMP_JAMP(93)+(-1.000000000000000D+00)*TMP_JAMP(109) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(126) + JAMP(12,1) = (-1.000000000000000D+00)*AMP(37)+TMP_JAMP(93) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(139) + $ +TMP_JAMP(147)+TMP_JAMP(152) + JAMP(13,1) = (-1.000000000000000D+00)*AMP(126) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(75) + $ 
+(-1.000000000000000D+00)*TMP_JAMP(97)+TMP_JAMP(103) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(128) + $ +(-1.000000000000000D+00)*TMP_JAMP(148) + JAMP(14,1) = (-1.000000000000000D+00)*AMP(98) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(20)+( + $ -1.000000000000000D+00)*TMP_JAMP(44)+TMP_JAMP(101)+( + $ -1.000000000000000D+00)*TMP_JAMP(112)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(126)+(-1.000000000000000D+00) + $ *TMP_JAMP(150) + JAMP(15,1) = (-1.000000000000000D+00)*AMP(127)+( + $ -1.000000000000000D+00)*TMP_JAMP(41)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(59)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(77)+TMP_JAMP(97)+TMP_JAMP(99) + $ +(-1.000000000000000D+00)*TMP_JAMP(149) + JAMP(16,1) = (-1.000000000000000D+00)*AMP(47) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(60)+( + $ -1.000000000000000D+00)*TMP_JAMP(104)+(-1.000000000000000D+00) + $ *TMP_JAMP(119)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(138)+TMP_JAMP(149) + JAMP(17,1) = (-1.000000000000000D+00)*AMP(97) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(71) + $ +TMP_JAMP(82)+(-1.000000000000000D+00)*TMP_JAMP(101)+( + $ -1.000000000000000D+00)*TMP_JAMP(102)+(-1.000000000000000D+00) + $ *TMP_JAMP(115)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(127) + JAMP(18,1) = (-1.000000000000000D+00)*AMP(46)+TMP_JAMP(102) + $ +TMP_JAMP(104)+(-1.000000000000000D+00)*TMP_JAMP(121) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(132) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(140) + JAMP(19,1) = (-1.000000000000000D+00)*AMP(128) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(28) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(74) + $ +(-1.000000000000000D+00)*TMP_JAMP(107)+TMP_JAMP(116) + $ +TMP_JAMP(123)+(-1.000000000000000D+00)*TMP_JAMP(143) + JAMP(20,1) = (-1.000000000000000D+00)*AMP(100)+TMP_JAMP(49)+( + $ -1.000000000000000D+00)*TMP_JAMP(113)+(-1.000000000000000D+00) + $ *TMP_JAMP(123)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(135)+(-1.000000000000000D+00)*TMP_JAMP(151) + JAMP(21,1) = (-1.000000000000000D+00)*AMP(129)+( + $ -1.000000000000000D+00)*TMP_JAMP(51)+TMP_JAMP(108)+TMP_JAMP(110) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(133) + $ +(-1.000000000000000D+00)*TMP_JAMP(153) JAMP(22,1) = (-1.000000000000000D+00)*AMP(49) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(70) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(69) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(70) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(72) - $ +TMP_JAMP(81)+(-1.000000000000000D+00)*TMP_JAMP(147)+( - $ -1.000000000000000D+00)*TMP_JAMP(163) - JAMP(23,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(60)+(-1.000000000000000D+00)*AMP(99)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*AMP(101)+(-1.000000000000000D+00) - $ *AMP(153)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(75)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(76)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(78)+(-1.000000000000000D+00)*TMP_JAMP(81) - $ +TMP_JAMP(148)+(-1.000000000000000D+00)*TMP_JAMP(150) - JAMP(24,1) = (-1.000000000000000D+00)*AMP(48)+AMP(52) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(83) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(85) - $ 
+((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(86)
-     $ +TMP_JAMP(150)+TMP_JAMP(151)+TMP_JAMP(163)
+     $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(78)
+     $ +TMP_JAMP(114)+(-1.000000000000000D+00)*TMP_JAMP(120)
+     $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(136)
+     $ +(-1.000000000000000D+00)*TMP_JAMP(152)
+      JAMP(23,1) = ((0.000000000000000D+00,1.000000000000000D+00))
+     $ *AMP(10)+(-1.000000000000000D+00)*AMP(99)+TMP_JAMP(50)
+     $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(72)+(
+     $ -1.000000000000000D+00)*TMP_JAMP(82)+TMP_JAMP(113)+(
+     $ -1.000000000000000D+00)*TMP_JAMP(117)+((0.000000000000000D+00,
+     $ -1.000000000000000D+00))*TMP_JAMP(130)
+      JAMP(24,1) = (-1.000000000000000D+00)*AMP(48)+TMP_JAMP(54)
+     $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(73)
+     $ +TMP_JAMP(118)+TMP_JAMP(120)+TMP_JAMP(122)+((0.000000000000000D
+     $ +00,-1.000000000000000D+00))*TMP_JAMP(142)

       IF(INIT_MODE)THEN
         DO I=1, NGRAPHS
@@ -1717,7 +1672,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
         ENDDO
       ENDDO
-      call counters_matrix1_stop()
       END

       SUBROUTINE PRINT_ZERO_AMP_1()
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
index 59a2c906eb..f2cfa349da 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
@@ -4,10 +4,13 @@
 # Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.

 #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
-#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories
+#=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts
+#=== NB: use 'override' to ensure that the value cannot be modified from the outside
+override CUDACPP_MAKEFILE := $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST))
+###$(info CUDACPP_MAKEFILE='$(CUDACPP_MAKEFILE)')

-CUDACPP_MAKEFILE = $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST))
-CUDACPP_SRC_MAKEFILE = cudacpp_src.mk
+#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories
+override CUDACPP_SRC_MAKEFILE = cudacpp_src.mk

 #-------------------------------------------------------------------------------

@@ -29,7 +32,17 @@ UNAME_P := $(shell uname -p)

 #-------------------------------------------------------------------------------

-#=== Configure common compiler flags for C++ and CUDA
+#=== Include the common MG5aMC Makefile options
+
+# OM: this is crucial for MG5aMC flag consistency/documentation
+# AV: temporarily comment this out because it breaks cudacpp builds
+ifneq ($(wildcard ../../Source/make_opts),)
+include ../../Source/make_opts
+endif
+
+#-------------------------------------------------------------------------------
+
+#=== Configure common compiler flags for C++ and CUDA/HIP

 INCFLAGS = -I.
OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here @@ -101,68 +114,85 @@ endif # Note: AR, CXX and FC are implicitly defined if not set externally # See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html -#------------------------------------------------------------------------------- - -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +CXXFLAGS += -mmacosx-version-min=11.3 +endif - # If CUDA_HOME is not set, try to set it from the location of nvcc - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif +#------------------------------------------------------------------------------- - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo
- GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
- ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
- ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
- GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h
- # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
- ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
- ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
- ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
- ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
- CUBUILDRULEFLAGS = -Xcompiler -fPIC -c
- CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu
- CUDATESTFLAGS = -lcuda
- else ifneq ($(origin REQUIRE_CUDA),undefined)
- # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443)
- $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH))
+#=== Configure the GPU compiler (CUDA or HIP)
+
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME.
+# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc.
+# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths.
+# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505)
+# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below
+ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
+ $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)")
+ override CUDA_HOME=disabled
+ override HIP_HOME=disabled
+endif
+
+# If CUDA_HOME is not set, try to set it from the path to nvcc
+ifndef CUDA_HOME
+ CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
+ $(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
+endif
+
+# If HIP_HOME is not set, try to set it from the path to hipcc
+ifndef HIP_HOME
+ HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null))
+ $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+endif
+
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP),
+# builds are performed for HIP only if CUDA is not found in the path.
+# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME.
+# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+#--- Option 1: CUDA exists -> use CUDA
+
+# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists
+ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
+
+ GPUCC = $(CUDA_HOME)/bin/nvcc
+ USE_NVTX ?=-DUSE_NVTX
+ # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
+ # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+ # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster).
+ # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + ifeq ($(RNDGEN),hasNoCurand) + CURANDLIBFLAGS= else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! endif + CUOPTFLAGS = -lineinfo + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -173,71 +203,55 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) GPUFLAGS += -allow-unsupported-compiler endif -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler +else ifneq ($(origin REQUIRE_CUDA),undefined) - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning HIP builds are not supported for multi-word CXX "$(CXX)") - override HIP_HOME=disabled - endif + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif +#--- Option 2: CUDA does not exist, HIP exists -> use HIP - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + +else + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) export GPUCC export GPUFLAGS #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -254,7 +268,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -270,7 +284,7 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) GPUFLAGS+= -Xcompiler -mno-float128 endif @@ -285,12 +299,14 @@ override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) -else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) -override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) # AV for Mac (Apple clang compiler) +else ifeq ($(UNAME_S),Darwin) # OM for Mac (any compiler) +override OMPFLAGS = # AV disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = -fopenmp # OM reenable OpenMP MT on Apple clang? 
(AV Oct 2023: this still fails in the CI) else -override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT (default before #575) +override OMPFLAGS = -fopenmp # enable OpenMP MT by default on all other platforms +###override OMPFLAGS = # disable OpenMP MT on all other platforms (default before #575) endif # Set the default AVX (vectorization) choice @@ -356,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -573,8 +589,9 @@ $(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) # Apply special build flags only to check_sa and CurandRandomNumberKernel (curand headers, #679) $(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gcheck_sa.o: CUFLAGS += $(CXXFLAGSCURAND) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gCurandRandomNumberKernel.o: CUFLAGS += $(CXXFLAGSCURAND) ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif @@ -772,12 +789,18 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif +# Use target gtestlibs to build only googletest +ifneq ($(GTESTLIBS),) +gtestlibs: $(GTESTLIBS) +endif + # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 $(GTESTLIBS): ifneq ($(shell which flock 2>/dev/null),) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi flock $(BUILDDIR)/.make_test.lock $(MAKE) -C $(TESTDIR) else - $(MAKE) -C $(TESTDIR) + if [ -d $(TESTDIR) ]; then $(MAKE) -C $(TESTDIR); fi endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/dummy_fct.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/dummy_fct.f index 076cf29d67..4f7a204b8f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/dummy_fct.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/dummy_fct.f @@ -32,7 +32,7 @@ logical FUNCTION dummy_cuts(P) LOGICAL IS_A_NU(NEXTERNAL),IS_HEAVY(NEXTERNAL) logical do_cuts(nexternal) COMMON /TO_SPECISA/IS_A_J,IS_A_A,IS_A_L,IS_A_B,IS_A_NU,IS_HEAVY, - . IS_A_ONIUM, do_cuts + & IS_A_ONIUM, do_cuts dummy_cuts=.true. 
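C --- Side note on the hunk above (an explanatory aside, not part of the
C     upstream file): in fixed-form Fortran any non-blank, non-zero
C     character in column 6 marks a continuation line, so the old '.'
C     was legal; '&' is used instead because it is also the continuation
C     marker accepted by free-form sources, e.g.
C       COMMON /TO_SPECISA/IS_A_J,IS_A_A,IS_A_L,IS_A_B,IS_A_NU,IS_HEAVY,
C    &                     IS_A_ONIUM, do_cuts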
@@ -118,15 +118,16 @@ double precision function user_dynamical_scale(P) C ************************************************************ -C default for the library implementing a dummt bias function +C default for the library implementing a dummy bias function C ************************************************************ subroutine bias_wgt_custom(p, original_weight, bias_weight) - implicit none + implicit none C C Parameters C include 'nexternal.inc' -C + +C C Arguments C double precision p(0:3, nexternal) @@ -161,3 +162,4 @@ subroutine bias_wgt_custom(p, original_weight, bias_weight) return end subroutine bias_wgt_custom + diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc index 2b956730d4..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc @@ -49,11 +49,7 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL GpuRuntime::setUp(); #endif - // Create a process object, read parm card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? - CPPProcess process( /*verbose=*/false ); - process.initProc( "../../Cards/param_card.dat" ); + // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran *ppbridge = new Bridge( *pnevtF, *pnparF, *pnp4F ); } diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile index 74db44d848..d572486c2e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile @@ -9,6 +9,12 @@ FFLAGS+= -cpp # Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) CXXFLAGS = -O3 -Wall -Wshadow -Wextra +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) override CXX:=ccache $(CXX) @@ -51,7 +57,7 @@ CUDACPP_MAKEFILE=cudacpp.mk CUDACPP_MAKEENV:=$(shell echo '$(.VARIABLES)' | tr " " "\n" | egrep "(USEBUILDDIR|AVX|FPTYPE|HELINL|HRDCOD)") ###$(info CUDACPP_MAKEENV=$(CUDACPP_MAKEENV)) ###$(info $(foreach v,$(CUDACPP_MAKEENV),$(v)="$($(v))")) -CUDACPP_BUILDDIR:=$(shell $(MAKE) $(foreach v,$(CUDACPP_MAKEENV),$(v)="$($(v))") -f $(CUDACPP_MAKEFILE) -pn |& awk '/Building/{print $$3}' | sed 
s/BUILDDIR=//) +CUDACPP_BUILDDIR:=$(shell $(MAKE) $(foreach v,$(CUDACPP_MAKEENV),$(v)="$($(v))") -f $(CUDACPP_MAKEFILE) -pn 2>&1 | awk '/Building/{print $$3}' | sed s/BUILDDIR=//) ifeq ($(CUDACPP_BUILDDIR),) $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) else @@ -89,7 +95,12 @@ SYMMETRY = symmetry.o idenparts.o # Binaries -LDFLAGS+=-Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 +ifeq ($(UNAME),Darwin) +LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) +LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" +else +LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) +endif all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp # also builds $(PROG)_cuda if $(CUDACPP_CULIB) exists (#503) @@ -100,8 +111,8 @@ LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' else ifneq ($(shell $(CXX) --version | egrep '^clang'),) override OMPFLAGS = -fopenmp $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###override OMPFLAGS = -fopenmp # OMP is not supported yet by cudacpp for Apple clang +else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang else override OMPFLAGS = -fopenmp endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc index 0ed26180ca..de327f2321 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc @@ -71,6 +71,8 @@ struct CPUTest : public CUDA_CPU_TestBase , hstSelCol( nevt ) , hstIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? process.initProc( "../../Cards/param_card.dat" ); } @@ -183,6 +185,8 @@ struct CUDATest : public CUDA_CPU_TestBase , devSelCol( nevt ) , devIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? 
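+      // One possible direction for the singleton FIXME (an illustrative,
+      // untested sketch; the 'sharedProcess' helper below is hypothetical
+      // and not part of this codebase):
+      //   static CPPProcess& sharedProcess() { static CPPProcess p( /*verbose=*/false ); return p; }
+      // A function-local static is initialised once and thread-safely in
+      // C++11, so initProc( "../../Cards/param_card.dat" ) could then be
+      // called exactly once, right after the first construction.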
process.initProc( "../../Cards/param_card.dat" ); } diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc index 016bc0f472..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc @@ -59,7 +59,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) using namespace mg5amcCpu; #endif #ifndef __APPLE__ // test #701 (except on MacOS where feenableexcept is not defined #730) - const bool enableFPE = !getenv( "CUDACPP_RUNTIME_DISABLEFPE" ); + const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" ); + const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" ); if( enableFPE ) { feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/generate_events b/epochX/cudacpp/gg_ttgg.mad/bin/generate_events index 107313b25d..5577cc66a0 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/generate_events +++ b/epochX/cudacpp/gg_ttgg.mad/bin/generate_events @@ -46,7 +46,7 @@ if __debug__ and (not os.path.exists(pjoin(root_path,'../..', 'bin','create_rele sys.path.append(pjoin(root_path,'bin','internal')) import madevent_interface as ME - +import misc as misc import logging import logging.config @@ -160,17 +160,31 @@ if '__main__' == __name__: # Check that python version is valid set_configuration() - argument = sys.argv + argument = sys.argv + + # check for plugin customization of the launch command + launch_interface = ME.MadEventCmdShell + if os.path.exists(pjoin(root_path, 'bin','internal', 'launch_plugin.py')): + with misc.TMP_variable(sys, 'path', sys.path + [pjoin(root_path, 'bin', 'internal')]): + from importlib import reload + try: + reload('launch_plugin') + except Exception as error: + import launch_plugin + launch_interface = launch_plugin.MEINTERFACE + + + try: if '-h' in argument or '--help' in argument: - launch = ME.MadEventCmdShell(me_dir=root_path, force_run=True) + launch = launch_interface(me_dir=root_path, force_run=True) launch.exec_cmd('help generate_events') sys.exit() elif len(argument) > 1 and argument[1] in ['0', '1', '2']: argument = treat_old_argument(argument) with ME.MadEventCmdShell.RunWebHandling(root_path, ): - launch = ME.MadEventCmdShell(me_dir=root_path, force_run=True) + launch = launch_interface(me_dir=root_path, force_run=True) launch.run_cmd('generate_events %s' % ' '.join(argument[1:])) launch.run_cmd('quit') except ME.MadEventAlreadyRunning as message: diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/__init__.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py index c1e54d3cb9..bd1517985f 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py @@ -537,7 +537,7 @@ def charge_card(self, tag): self.param_card = param_card_reader.ParamCard(param_card) return self.param_card elif tag == 'mgruncard': - self.run_card = RunCard(self[tag]) + self.run_card = RunCard(self[tag], 
unknown_warning=False) return self.run_card elif tag == 'mg5proccard': proc_card = self[tag].split('\n') @@ -1002,14 +1002,18 @@ def __init__(self, finput=None, **opt): self.allowed_value = {} self.default_setup() + self.plugin_input(finput) # if input is define read that input if isinstance(finput, (file, str, StringIO.StringIO)): self.read(finput, **opt) + + def plugin_input(self, finput=None): + pass def default_setup(self): @@ -2621,7 +2625,28 @@ class RunCard(ConfigFile): default_include_file = 'run_card.inc' default_autodef_file = 'run.inc' donewarning = [] + include_as_parameter = [] + + def plugin_input(self, finput): + if not finput and not MADEVENT: + return + curr_dir = None + if isinstance(finput, file): + # expected path to be like "XXXX/Cards/run_card.dat" + curr_dir = os.path.dirname(os.path.dirname(finput.name)) + elif isinstance(finput, str): + curr_dir = os.path.dirname(os.path.dirname(finput)) + + if curr_dir: + if os.path.exists(pjoin(curr_dir, 'bin', 'internal', 'plugin_run_card')): + # expected format {} passing everything as optional argument + for line in open(pjoin(curr_dir, 'bin', 'internal', 'plugin_run_card')): + if line.startswith('#'): + continue + opts = dict(eval(line)) + self.add_param(**opts) + @classmethod def fill_post_set_from_blocks(cls): """set the post_set function for any parameter defined in a run_block""" @@ -2647,18 +2672,48 @@ def __new__(cls, finput=None, **opt): elif isinstance(finput, cls): target_class = finput.__class__ elif isinstance(finput, str): + path = finput if '\n' not in finput: finput = open(finput).read() if 'req_acc_FO' in finput: target_class = RunCardNLO else: target_class = RunCardLO + if MADEVENT and os.path.exists(pjoin(MEDIR, 'bin','internal', 'launch_plugin.py')): + with misc.TMP_variable(sys, 'path', sys.path + [pjoin(MEDIR, 'bin', 'internal')]): + from importlib import reload + try: + reload('launch_plugin') + except Exception as error: + import launch_plugin + target_class = launch_plugin.RunCard + elif not MADEVENT: + if 'run_card.dat' in path: + launch_plugin_path = path.replace('run_card.dat', '../bin/internal/launch_plugin.py') + elif 'run_card_default.dat' in path: + launch_plugin_path = path.replace('run_card_default.dat', '../bin/internal/launch_plugin.py') + else: + launch_plugin_path = None + if launch_plugin_path and os.path.exists(launch_plugin_path): + misc.sprint('try to use plugin class', path.replace('run_card.dat', '../bin/internal/launch_plugin.py')) + pydir = os.path.dirname(launch_plugin_path) + with misc.TMP_variable(sys, 'path', sys.path + [pydir]): + from importlib import reload + try: + reload('launch_plugin') + except Exception as error: + import launch_plugin + target_class = launch_plugin.RunCard + elif issubclass(finput, RunCard): + target_class = finput else: return None target_class.fill_post_set_from_blocks() - - return super(RunCard, cls).__new__(target_class, finput, **opt) + out = super(RunCard, cls).__new__(target_class, finput, **opt) + if not isinstance(out, RunCard): #should not happen but in presence of missmatch of library loaded. 
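                # (background: Python only calls __init__ automatically when __new__
                # returns an instance of the class being constructed; if launch_plugin
                # was loaded from a second copy of the module, the isinstance test
                # fails and __init__ has to be invoked by hand, as done below)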
+                out.__init__(finput, **opt)
+            return out
         else:
             return super(RunCard, cls).__new__(cls, finput, **opt)

@@ -2686,7 +2741,7 @@ def __init__(self, *args, **opts):
         self.system_default = {}

         self.display_block = [] # set some block to be displayed
-
+        self.fct_mod = {} # {param: (fct_pointer, *argument, **opts)}
         self.cut_class = {}
         self.warned=False

@@ -2723,7 +2778,7 @@ def get_lepton_densities(cls):

     def add_param(self, name, value, fortran_name=None, include=True,
                   hidden=False, legacy=False, cut=False, system=False, sys_default=None,
-                  autodef=False,
+                  autodef=False, fct_mod=None,
                   **opts):
         """ add a parameter to the card. value is the default value and
         defines the type (int/float/bool/str) of the input.
@@ -2737,6 +2792,7 @@ def add_param(self, name, value, fortran_name=None, include=True,
                  If a path (Source/PDF/pdf.inc) the definition will be added within that file
                  Default is False (does not add the definition)
                  entry added in the run_card will automatically have this on True.
+        fct_mod: defines a function to run if the parameter is modified in the include file
         options of **opts:
         - allowed: list of valid options. '*' means anything else should be allowed.
                  empty list means anything possible as well.
@@ -2761,15 +2817,22 @@ def add_param(self, name, value, fortran_name=None, include=True,
         if autodef:
             self.definition_path[autodef].append(name)
             self.user_set.add(name)
+        # function to trigger if a value is modified in the include file
+        # main target is an action forcing correct recompilation (e.g. for compilation flags, ...)
+        if fct_mod:
+            self.fct_mod[name] = fct_mod

-    def read(self, finput, consistency=True):
+    def read(self, finput, consistency=True, unknown_warning=True, **opt):
         """Read the input file, this can be a path to a file,
            a file object, a str with the content of the file."""

         if isinstance(finput, str):
             if "\n" in finput:
                 finput = finput.split('\n')
+                if 'path' in opt:
+                    self.path = opt['path']
             elif os.path.isfile(finput):
+                self.path = finput
                 finput = open(finput)
             else:
                 raise Exception("No such file %s" % finput)
@@ -2784,7 +2847,7 @@ def read(self, finput, consistency=True):
                 name = name.lower().strip()
                 if name not in self:
                     #looks like an entry added by a user -> add it nicely
-                    self.add_unknown_entry(name, value)
+                    self.add_unknown_entry(name, value, unknown_warning)
                 else:
                     self.set( name, value, user=True)
         # parameter not set in the run_card can be set to a compatibility value
@@ -2796,7 +2859,7 @@ def read(self, finput, consistency=True):
                     logger.warning(str(error))
                 else:
                     raise

-    def add_unknown_entry(self, name, value):
+    def add_unknown_entry(self, name, value, unknow_warning):
         """function to add an entry to the run_card when the associated parameter
            does not exist. This is based on guess_entry_fromname for the various
            input syntaxes. This then calls add_param accordingly.
@@ -2835,7 +2898,7 @@ def add_unknown_entry(self, name, value):
                 raise Exception("dictionary need to have at least one entry")
             default['dict']['__type__'] = default[self.guess_type_from_value(default_value[0])]

-        if name not in RunCard.donewarning:
+        if name not in RunCard.donewarning and unknow_warning:
             logger.warning("Found unexpected entry in run_card: \"%s\" with value \"%s\".\n"+\
                            " The type was assigned to %s.
\n"+\ " The definition of that variable will %sbe automatically added to fortran file %s\n"+\ @@ -2873,7 +2936,17 @@ def valid_line(self, line, tmp): return False else: return True - + + + def reset_simd(self, old_value, new_value, name, *args, **opts): + #return + raise Exception('pass in reset simd') + + def make_clean(self,old_value, new_value, name, dir): + raise Exception('pass make clean for ', dir) + + def make_Ptouch(self,old_value, new_value, name, reset): + raise Exception('pass Ptouch for ', reset) def write(self, output_file, template=None, python_template=False, write_hidden=False, template_options=None, **opt): @@ -2898,11 +2971,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self @@ -3048,6 +3122,77 @@ def write(self, output_file, template=None, python_template=False, else: output_file.write(text) + def get_last_value_include(self, output_dir): + """For paraeter in self.fct_mod + parse the associate inc file to get the value of the previous run. + We return a dictionary {name: old_value} + if inc file does not exist we will return the current value (i.e. set has no change) + """ + + #remember that + # default_include_file is a class variable + # self.includepath is on the form include_path : [list of param ] + out = {} + + # setup inc_to_parse to be like self.includepath (include_path : [list of param ]) + # BUT only containing the parameter that need to be tracked for the fct_mod option + inc_to_parse = {} + for inc_file, params in self.includepath.items(): + if not inc_file: + continue + if any(p in params for p in self.fct_mod): + inc_to_parse[inc_file] = [name for name in self.includepath[inc_file] if name in self.fct_mod] + + # now loop over the files and ask the associate function + for inc_file, params in inc_to_parse.items(): + if inc_file is True: + inc_file = self.default_include_file + out.update(self.get_value_from_include(inc_file, params, output_dir)) + + return out + + def get_value_from_include(self, path, list_of_params, output_dir): + """for a given include file return the current value of the requested parameter + return a dictionary {name: value} + if path does not exists return the current value in self for all parameter""" + + #WARNING DOES NOT HANDLE LIST/DICT so far + + # handle case where file is missing + if not os.path.exists(pjoin(output_dir,path)): + misc.sprint("include file not existing", pjoin(output_dir,path)) + out = {name: self[name] for name in list_of_params} + + with open(pjoin(output_dir,path), 'r') as fsock: + text = fsock.read() + + for name in list_of_params: + misc.sprint(name, name in self.fortran_name) + misc.sprint(self.fortran_name[name] if name in self.fortran_name[name] else name) + to_track = [self.fortran_name[name] if name in self.fortran_name else name for name in list_of_params] + pattern = re.compile(r"\(?(%(names)s)\s?=\s?([^)]*)\)?" 
% {'names':'|'.join(to_track)}, re.I) + out = dict(pattern.findall(text)) + misc.sprint(out) + for name in list_of_params: + if name in self.fortran_name: + value = out[self.fortran_name[name]] + del out[self.fortran_name[name]] + out[name] = value + + for name, value in out.items(): + try: + out[name] = self.format_variable(value, type(self[name])) + except Exception: + continue + + if len(out) != len(list_of_params): + misc.sprint(list_of_params) + misc.sprint(to_track) + misc.sprint(self.fortran_name) + misc.sprint(text) + raise Exception + return out + def get_default(self, name, default=None, log_level=None): """return self[name] if exist otherwise default. log control if we @@ -3338,71 +3483,93 @@ def write_include_file(self, output_dir, output_file=None): #ensusre that system only parameter are correctly set self.update_system_parameter_for_include() + value_in_old_include = self.get_last_value_include(output_dir) + + if output_dir: self.write_autodef(output_dir, output_file=None) # check/fix status of customised functions self.edit_dummy_fct_from_file(self["custom_fcts"], os.path.dirname(output_dir)) for incname in self.includepath: - if incname is True: - pathinc = self.default_include_file - elif incname is False: - continue - else: - pathinc = incname + self.write_one_include_file(output_dir, incname, output_file) + + for name,value in value_in_old_include.items(): + if value != self[name]: + self.fct_mod[name][0](value, self[name], name, *self.fct_mod[name][1],**self.fct_mod[name][2]) - if output_file: - fsock = output_file + def write_one_include_file(self, output_dir, incname, output_file=None): + """write one include file at the time""" + + misc.sprint(incname) + if incname is True: + pathinc = self.default_include_file + elif incname is False: + return + else: + pathinc = incname + + if output_file: + fsock = output_file + else: + fsock = file_writers.FortranWriter(pjoin(output_dir,pathinc+'.tmp')) + + + for key in self.includepath[incname]: + #define the fortran name + if key in self.fortran_name: + fortran_name = self.fortran_name[key] else: - fsock = file_writers.FortranWriter(pjoin(output_dir,pathinc+'.tmp')) - for key in self.includepath[incname]: - #define the fortran name - if key in self.fortran_name: - fortran_name = self.fortran_name[key] + fortran_name = key + + if incname in self.include_as_parameter: + fsock.writelines('INTEGER %s\n' % fortran_name) + #get the value with warning if the user didn't set it + value = self.get_default(key) + if hasattr(self, 'mod_inc_%s' % key): + value = getattr(self, 'mod_inc_%s' % key)(value) + # Special treatment for strings containing a list of + # strings. Convert it to a list of strings + if isinstance(value, list): + # in case of a list, add the length of the list as 0th + # element in fortran. 
Only in case of integer or float + # list (not for bool nor string) + targettype = self.list_parameter[key] + if targettype is bool: + pass + elif targettype is int: + line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(len(value))) + fsock.writelines(line) + elif targettype is float: + line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(float(len(value)))) + fsock.writelines(line) + # output the rest of the list in fortran + for i,v in enumerate(value): + line = '%s(%s) = %s \n' % (fortran_name, i+1, self.f77_formatting(v)) + fsock.writelines(line) + elif isinstance(value, dict): + for fortran_name, onevalue in value.items(): + line = '%s = %s \n' % (fortran_name, self.f77_formatting(onevalue)) + fsock.writelines(line) + elif isinstance(incname,str) and 'compile' in incname: + if incname in self.include_as_parameter: + line = 'PARAMETER (%s=%s)' %( fortran_name, value) else: - fortran_name = key - - #get the value with warning if the user didn't set it - value = self.get_default(key) - if hasattr(self, 'mod_inc_%s' % key): - value = getattr(self, 'mod_inc_%s' % key)(value) - # Special treatment for strings containing a list of - # strings. Convert it to a list of strings - if isinstance(value, list): - # in case of a list, add the length of the list as 0th - # element in fortran. Only in case of integer or float - # list (not for bool nor string) - targettype = self.list_parameter[key] - if targettype is bool: - pass - elif targettype is int: - line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(len(value))) - fsock.writelines(line) - elif targettype is float: - line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(float(len(value)))) - fsock.writelines(line) - # output the rest of the list in fortran - for i,v in enumerate(value): - line = '%s(%s) = %s \n' % (fortran_name, i+1, self.f77_formatting(v)) - fsock.writelines(line) - elif isinstance(value, dict): - for fortran_name, onevalue in value.items(): - line = '%s = %s \n' % (fortran_name, self.f77_formatting(onevalue)) - fsock.writelines(line) - elif isinstance(incname,str) and 'compile' in incname: line = '%s = %s \n' % (fortran_name, value) - fsock.write(line) + fsock.write(line) + else: + if incname in self.include_as_parameter: + line = 'PARAMETER (%s=%s)' %( fortran_name, self.f77_formatting(value)) else: line = '%s = %s \n' % (fortran_name, self.f77_formatting(value)) - fsock.writelines(line) - if not output_file: - fsock.close() - path = pjoin(output_dir,pathinc) - if not os.path.exists(path) or not filecmp.cmp(path, path+'.tmp'): - files.mv(path+'.tmp', path) - else: - os.remove(path+'.tmp') - + fsock.writelines(line) + if not output_file: + fsock.close() + path = pjoin(output_dir,pathinc) + if not os.path.exists(path) or not filecmp.cmp(path, path+'.tmp'): + files.mv(path+'.tmp', path) + else: + os.remove(path+'.tmp') def write_autodef(self, output_dir, output_file=None): """ Add the definition of variable to run.inc if the variable is set with autodef. @@ -3741,13 +3908,14 @@ def remove_all_cut(self): %(tmin_for_channel)s = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact %(survey_splitting)s = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. %(survey_nchannel_per_job)s = survey_nchannel_per_job ! 
control how many Channel are integrated inside a single job on cluster/multicore - %(refine_evt_by_job)s = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) + %(refine_evt_by_job)s = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) #********************************************************************* -# Compilation flag. No automatic re-compilation (need manual "make clean" in Source) +# Compilation flag. #********************************************************************* %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' @@ -3903,9 +4071,12 @@ class RunCardLO(RunCard): "get_dummy_x1_x2": pjoin("SubProcesses","dummy_fct.f"), "dummy_boostframe": pjoin("SubProcesses","dummy_fct.f"), "user_dynamical_scale": pjoin("SubProcesses","dummy_fct.f"), + "bias_wgt_custom": pjoin("SubProcesses","dummy_fct.f"), "user_": pjoin("SubProcesses","dummy_fct.f") # all function starting by user will be added to that file } + include_as_parameter = ['vector.inc'] + if MG5DIR: default_run_card = pjoin(MG5DIR, "internal", "default_run_card_lo.dat") @@ -4139,10 +4310,15 @@ def default_setup(self): self.add_param('hel_splitamp', True, hidden=True, include=False, comment='decide if amplitude aloha call can be splitted in two or not when doing helicity per helicity optimization.') self.add_param('hel_zeroamp', True, hidden=True, include=False, comment='decide if zero amplitude can be removed from the computation when doing helicity per helicity optimization.') self.add_param('SDE_strategy', 1, allowed=[1,2], fortran_name="sde_strat", comment="decide how Multi-channel should behaves \"1\" means full single diagram enhanced (hep-ph/0208156), \"2\" use the product of the denominator") - self.add_param('global_flag', '-O', include=False, hidden=True, comment='global fortran compilation flag, suggestion -fbound-check') - self.add_param('aloha_flag', '', include=False, hidden=True, comment='global fortran compilation flag, suggestion: -ffast-math') - self.add_param('matrix_flag', '', include=False, hidden=True, comment='fortran compilation flag for the matrix-element files, suggestion -O3') - + self.add_param('global_flag', '-O', include=False, hidden=True, comment='global fortran compilation flag, suggestion -fbound-check', + fct_mod=(self.make_clean, ('Source'),{})) + self.add_param('aloha_flag', '', include=False, hidden=True, comment='global fortran compilation flag, suggestion: -ffast-math', + fct_mod=(self.make_clean, ('Source/DHELAS'),{})) + self.add_param('matrix_flag', '', include=False, hidden=True, comment='fortran compilation flag for the matrix-element files, suggestion -O3', + fct_mod=(self.make_Ptouch, ('matrix'),{})) + self.add_param('vector_size', 1, include='vector.inc', hidden=True, comment='lockstep size for parralelism run', + fortran_name='VECSIZE_MEMMAX', fct_mod=(self.reset_simd,(),{})) + # parameter allowing to define simple cut via the pdg # Special syntax are related to those. 
(can not be edit directly) self.add_param('pt_min_pdg',{'__type__':0.}, include=False, cut=True) @@ -4164,8 +4340,7 @@ def default_setup(self): self.add_param('mxxmin4pdg',[-1.], system=True) self.add_param('mxxpart_antipart', [False], system=True) - # CUDACPP parameters - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + def check_validity(self): """ """ @@ -4704,6 +4879,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5769,7 +5947,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py index fe874a06a4..71089d7480 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py @@ -85,7 +85,7 @@ def load_str(self, text): self.value= ' '.join(data[len(self.lhacode):]) # check that lhacode are the first entry otherwise return invalid param. if ' '.join([str(i) for i in self.lhacode]) != ' '.join(data[:len(self.lhacode)]): - raise InvalidParam + raise InvalidParam("line was %s" % str(data)) else: self.value = data[-1] diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py index 5d0187e3fa..87cb4b88df 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py @@ -749,13 +749,15 @@ def writeRunWeb(me_dir): class RunWebHandling(object): - def __init__(self, me_dir, crashifpresent=True, warnifpresent=True): + def __init__(self, me_dir, crashifpresent=True, warnifpresent=True, force_run=False): """raise error if RunWeb already exists me_dir is the directory where the write RunWeb""" self.remove_run_web = True self.me_dir = me_dir - + if force_run: + self.remove_run_web = False + return if crashifpresent or warnifpresent: if os.path.exists(pjoin(me_dir, 'RunWeb')): pid = open(pjoin(me_dir, 'RunWeb')).read() @@ -4904,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ -6574,7 +6577,7 @@ def reask(self, *args, **opt): fail_due_to_format = 0 #parameter to avoid infinite loop def postcmd(self, stop, line): - if line not in [None, '0', 'done', '']: + if line not in [None, '0', 'done', '',0]: ending_question = cmd.OneLinePathCompletion.postcmd(self,stop,line) else: ending_question = True @@ -6583,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ 
-6636,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ -6715,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6779,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6907,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' @@ -7533,7 +7546,8 @@ def open_file(self, answer): else: raise if time.time() - start < .5: - self.mother_interface.ask("Are you really that fast? If you are using an editor that returns directly. Please confirm that you have finised to edit the file", 'y') + self.mother_interface.ask("Are you really that fast? If you are using an editor that returns directly. Please confirm that you have finised to edit the file", 'y', + timeout=False) self.reload_card(path) def reload_card(self, path): diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py index a6a8609dce..2f37070580 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py @@ -1108,9 +1108,12 @@ def ask(self, question, default, choices=[], path_msg=None, if alias: choices += list(alias.keys()) + + question_instance = obj(question, allow_arg=choices, default=default, mother_interface=self, **opt) - + if fct_timeout is None: + fct_timeout = lambda x: question_instance.postcmd(x, default) if x and default else False if first_cmd: if isinstance(first_cmd, str): question_instance.onecmd(first_cmd) @@ -2271,6 +2274,9 @@ def postcmd(self, stop, line): if n: self.default(line) return self.postcmd(stop, line) + elif self.value is None and line: + self.default(line) + return self.postcmd(stop, line) if not self.casesensitive: for ans in self.allow_arg: if ans.lower() == self.value.lower(): diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py index 3b8ec31215..5fd170d18d 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py @@ -154,9 +154,18 @@ def get_helicity(self, to_submit=True, clean=True): p = misc.Popen(['./gensym'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, cwd=Pdir) #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts + (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - nb_channel = max([math.floor(float(d)) for d in stdout.split()]) + if stdout: + nb_channel = max([math.floor(float(d)) for d in stdout.split()]) + else: + for matrix_file in misc.glob('matrix*orig.f', 
Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): @@ -178,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. + # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py index cff8789e38..a6b8582e1a 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py @@ -342,7 +342,12 @@ def next_event(self): text.append(line) if '' in line: - if self.parsing: + if self.parsing == "wgt_only": + out = Event(text, parse_momenta=False) + #if len(out) == 0 and not self.allow_empty_event: + # raise Exception + return out + elif self.parsing: out = Event(text) if len(out) == 0 and not self.allow_empty_event: raise Exception @@ -448,6 +453,8 @@ def unweight(self, outputpath, get_wgt=None, max_wgt=0, trunc_error=0, event_target reweight for that many event with maximal trunc_error. (stop to write event when target is reached) """ + self.parsing = 'wgt_only' + if not get_wgt: def weight(event): return event.wgt @@ -914,6 +921,8 @@ class MultiEventFile(EventFile): The number of events in each file need to be provide in advance (if not provide the file is first read to find that number""" + parsing = True # check if/when we need to parse the event. + def __new__(cls, start_list=[],parse=True): return object.__new__(MultiEventFile) @@ -986,6 +995,7 @@ def next(self): nb_event = random.randint(1, remaining_event) sum_nb=0 for i, obj in enumerate(self.files): + obj.parsing = "wgt_only" sum_nb += self.initial_nb_events[i] - self.curr_nb_events[i] if nb_event <= sum_nb: self.curr_nb_events[i] += 1 @@ -1065,6 +1075,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): # check special case without PDF for one (or both) beam if init_information["idbmup1"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup1"]= event[0].pdg if init_information["idbmup2"] == 0: init_information["idbmup2"]= event[1].pdg @@ -1115,6 +1127,7 @@ def initialize_unweighting(self, getwgt, trunc_error): total_event = 0 sum_cross = collections.defaultdict(int) for i,f in enumerate(self.files): + f.parsing = 'wgt_only' nb_event = 0 # We need to loop over the event file to get some information about the # new cross-section/ wgt of event. 
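An aside on the parsing = 'wgt_only' mode used above: during unweighting every event in the file is visited only to compare its weight against max_wgt, so building full Particle objects is wasted work; the new mode converts just the weight and keeps the rest of the event text verbatim. A minimal sketch of the idea (helper name hypothetical, assuming the standard LHE layout in which the first non-tag line of an <event> block is "nexternal ievent wgt scale aqed aqcd"):

    def weight_only(event_text):
        """Return the event weight without parsing momenta (illustrative only)."""
        for line in event_text.split('\n'):
            line = line.strip()
            if not line or line.startswith('<') or line.startswith('#'):
                continue  # skip <event>/<mgrwt>-style tags and comment lines
            fields = line.split()
            if len(fields) == 6:  # global event line: nexternal ievent wgt scale aqed aqcd
                return float(fields[2])
        raise ValueError('no global event line found in event block')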
@@ -1302,7 +1315,7 @@ class Event(list): warning_order = True # raise a warning if the order of the particle are not in accordance of child/mother - def __init__(self, text=None): + def __init__(self, text=None, parse_momenta=True): """The initialization of an empty Event (or one associate to a text file)""" list.__init__(self) @@ -1322,15 +1335,15 @@ def __init__(self, text=None): self.matched_scale_data = None self.syscalc_data = {} if text: - self.parse(text) + self.parse(text, parse_momenta=parse_momenta) - - def parse(self, text): + event_flag_pattern = re.compile(r"""(\w*)=(?:(?:['"])([^'"]*)(?=['"])|(\S*))""") + def parse(self, text, parse_momenta=True): """Take the input file and create the structured information""" #text = re.sub(r'', '', text) # remove pointless tag status = 'first' - + tags = [] if not isinstance(text, list): text = text.split('\n') @@ -1354,24 +1367,28 @@ def parse(self, text): if '' in line: status = 'tag' else: - self.assign_scale_line(line) + self.assign_scale_line(line, convert=parse_momenta) status = 'part' continue if '<' in line: status = 'tag' if 'part' == status: - part = Particle(line, event=self) - if part.E != 0 or part.status==-1: - self.append(part) - elif self.nexternal: - self.nexternal-=1 + if parse_momenta: + part = Particle(line, event=self) + if part.E != 0 or part.status==-1: + self.append(part) + elif self.nexternal: + self.nexternal-=1 + else: + tags.append(line) else: - if '' in line: + if line.endswith(''): line = line.replace('','',1) - self.tag += '%s\n' % line - - self.assign_mother() + tags.append(line) + self.tag += "\n".join(tags) + if parse_momenta: + self.assign_mother() def assign_mother(self): @@ -1905,19 +1922,27 @@ def check(self): #3. check mass - def assign_scale_line(self, line): + def assign_scale_line(self, line, convert=True): """read the line corresponding to global event line format of the line is: Nexternal IEVENT WEIGHT SCALE AEW AS """ inputs = line.split() assert len(inputs) == 6 - self.nexternal=int(inputs[0]) - self.ievent=int(inputs[1]) - self.wgt=float(inputs[2]) - self.scale=float(inputs[3]) - self.aqed=float(inputs[4]) - self.aqcd=float(inputs[5]) + if convert: + self.nexternal=int(inputs[0]) + self.ievent=int(inputs[1]) + self.wgt=float(inputs[2]) + self.scale=float(inputs[3]) + self.aqed=float(inputs[4]) + self.aqcd=float(inputs[5]) + else: + self.nexternal=inputs[0] + self.ievent=inputs[1] + self.wgt=float(inputs[2]) + self.scale=inputs[3] + self.aqed=inputs[4] + self.aqcd=inputs[5] def get_tag_and_order(self): """Return the unique tag identifying the SubProcesses for the generation. 
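The convert=False branch above deliberately leaves every header field except wgt as the original string (only the weight is rescaled during unweighting), which is why Event.__str__ in the next hunk needs a formatting fallback. A short illustration of that fallback (function name hypothetical, a sketch rather than the MG5aMC code):

    def format_event_header(nexternal, ievent, wgt, scale, aqed, aqcd):
        """Mirror of the try/except added to Event.__str__ below (sketch only)."""
        try:
            # strict numeric formatting: valid when the event was fully parsed
            return "%2d %6d %+13.7e %14.8e %14.8e %14.8e" % \
                   (nexternal, ievent, wgt, scale, aqed, aqcd)
        except TypeError:
            # convert=False path: all fields except wgt are still strings,
            # so re-emit them verbatim
            return "%s %s %+13.7e %s %s %s" % (nexternal, ievent, wgt, scale, aqed, aqcd)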
@@ -2269,7 +2294,11 @@ def __str__(self, event_id=''): else: event_flag = '' - scale_str = "%2d %6d %+13.7e %14.8e %14.8e %14.8e" % \ + try: + scale_str = "%2d %6d %+13.7e %14.8e %14.8e %14.8e" % \ + (self.nexternal,self.ievent,self.wgt,self.scale,self.aqed,self.aqcd) + except: + scale_str = "%s %s %+13.7e %s %s %s" % \ (self.nexternal,self.ievent,self.wgt,self.scale,self.aqed,self.aqcd) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py index b70b548e53..cb6bf4ca57 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,8 +3703,9 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) + start = time.time() # Check argument's validity self.check_combine_events(args) self.update_status('Combining Events', level='parton') @@ -3795,8 +3796,9 @@ def do_combine_events(self, line): if self.run_card['bias_module'].lower() not in ['dummy', 'none'] and nb_event: self.correct_bias() - - + elif self.run_card['custom_fcts']: + self.correct_bias() + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7364,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7407,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ -7415,7 +7423,8 @@ def error(self, msg=''): level = int(options.logging) else: level = eval('logging.' 
+ options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7429,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/misc.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/shower_card.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model.pkl b/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model.pkl deleted file mode 100644 index f71ba45bbc6d4acc8d32bb06662fe900a694009f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 42822 zcmb__2bdJa_B|}1h$1K|m=FuqY~s1X)3xl^}}iusdw;xPiViOVY+T0_L1^ z&N=6tbB@o5@%hX-{m-ec4%1z`!0-F}-gl|Gb?cmas;fiwboU-sRNIuQtC!RQYglcI zq=(H*Wz#h+^D^n$T(;ZPmb!FfH@id0;daO3*_}Id=+K;MO4~)Vi%`rq*_~#uhr)uC zj6KvgxMX z>#GE_3mTg34XQfL?t+-wmc|ybyH|Cby$)+uO6t@0hLccM|EUg5schEnp^9@RB=@p= 
zs-BA)>T;I7k=0XiEvT!nsl`~>8&_?nA{!f;({?XaT$Eb4#NK3fHy|#MmIbN$R1Tb* zR&|)YE?f=S8V--W*^1fSQO%|s=hrZE@`UNZWm8GbMZT+3Vw<+LqSlTw3t| zCVLCV-g5Q^aPFC2kZWn4*4i{LE%sKx=VC^+sr^y^ZA#%2KY;|CsjjhR>KuDpYa_opYmVLD+PJE>)yvw%+SJUdzutJXEo~ngudjv>2A^nBvWp*iOiedxl z(`*15&dO{~QVsQ1&Mx!E&sUQGQ^#QZ%6+CLyTW5?T4s-IpJ_u3l>vNIA$X+^p0h`L z;JIb?j_rf@G=uL{2tLN!4z2diz~+W;U1sm1p*vwi?7C`(`p@d&k5G{rde_3?AL~Ka z+T%2I?J|2eBeXwlr;fMB>&@NS%(8nS%LI=lXHV2D7@J9}VcEx?9G_6#F*urYSCXX=g7#mq9RkY%>VV%htGg-12Zo&%DOvE3~onQZSD-WHSX{jn`J z$48ogU2low&$;gXsTVWiMpGKtU{s&k0_TKYG*W2E;ZOb$V|jDlsXOU zGtO?rc=8Cw`mB9Yt#%Vew1=6t*{4mbQ&R)aR;k)tLrZhkZfWnNzDE%0UI#3|naghX zIv~`gn=J^>7OQR6P-o{b%AGA_zCk*s+%s>nn!z{Q36M+WDTGZWDE>n|-jFo2B+4dD}_a%UtEznuexSJ+4G# ziGAowHG

YZkOD!b$otoTPcyZ)|B{E-arIt4^r{(R_llB?bp(`y&w_KqX#GR=)mA11|7Sx<=`9emH zw$I60$o^dG*s6|Miz!;iTV4+5F^5h%hx3cALHwV};R2Pzh3#(X`Q7Y`)K+V-FP>{( zQj9_DSXGoh+P*YtU*_1C&r3P>6=4Zi=IyJJ_SJC-*C+`yXU;6OuT>JR3nY||u&-AV zZg4li_}%Rr4N`8Rl+Iep&02~Ub4#&Ob8E46k%dfeE1vQ%{$sjG_V%QGhhyJ4&uzP_ zxT>hClWIrX-AVf%$G&%7+Oh8o3%NgUKajK^ED)k79?}#K`xG3HN0RoVj{TUXc)S?H z7{hrg*`p=md%vllDuF{j#Qbg%m}eidU2NYmWUo zDc%TEyqUM(O4@H1P^3Ad-f?$VY!7UZchye8a>9PkFPGczr#aXDL#B>L+aGwlPHCC_ zp|#xl5W`q>wEdCwVJoNK$JUM5X`<8b6K=CEdYgS(Y;{$a+N*G!wQ$XAnJ;!NT;W=g z%i5JzZ~sE@v*0W=KuzV()t33fvA+z9|0-{PowWa3AU;L$-&pPx{2!(u7iBmcl-u8` z8Tg%XC;FZ$*3l|{D7IEv19-4h+&?PrpB($=F!wKc``4uXTb%oMJ-$3n{h@Y*KZ6}1 zQEvaG()b%`#0T>qrm?P0<6oTylaLNL>oX7Oh=LC44afLu(7mc9eDK~ zT3ssZqQ=~$D}((*&8RY24-RH1-PmF{;K=%DV4hvwJY@q_?}{ytQB%owXICb>AqtN4 zPyzl?PnhE zt2#GtUzO?>?8sENM8T1*RCAE()~M&DFKkk_fsLoyk6ty~WLp;fv8+f)e>hZn+p)zo zmIKhh^t$QvwpaC4YW}%nxdXd0!GS0^Qmg`k1WQoQ%OKdK42F#-I0W8|KY~LUTG=wA2z#P}tIhLxr+S+;$%h;7UCQxvsTm=L5qs$HYjffkWkZAX`kMm_h?{ z?4fgqdv#dF2N zzdXBkNFOZ?C?_QY>&QW{xU`Z+wg(r8(u8_mnqiaD0vjK>1@Nll_{51bOT|V_?J2@w z|HxP>S=7}C=Gb7`%B^VN2=>$?xKPzOf{RqC$iiybTJvRccmx-d)E~jU^aw7Ymm|0o zg^V{u2a`Kj)091(UI_G{E<0|G0QW0Kr(7P z3NN80j}Q*UMtUfYR&@@=a#bp_u)4R*%X-VOmJbib3ex*SQLTq!CA}PqV^Hvi;#jij zp*RlZq#O_H$O*6{U2xWf(>Up^kHYpdu#x$N=xNZIMf`!j4g%*gtGSXDs<{dUU(MAF^wnI0 zT2iisb>uo&s=1yKAvHIko|hY8lX4SmT+Pk!25N3$%<_!ZkWmsM)ACk?F#An(_P4Q~ z;gqzhb@sQj!tC!r!O#9q2Kw3Gg<4YXhIQl~SZ04OBSP8Vhk9P_hfT@@u<`64gg40k zA;z?y{lf@h_M7VLA7S0gUR_zsqeNl$kD=ga|2PBnJbD7dMxjzrDK4w;z%L}NfW9Ewt_9s+jLSBMHO{kaIVmfHPf(DA(OpAJzb#M1@ zuP9$*Uy6Di1z*$~4Ai3DL^&yM!8-CbEJeM;_K>J|QP0bJut|9zHZJNv?BSgW%H#u7 zy;?#(M2%}Z`G`@0$chp2F+5y0A1$A-3x5T|@~EOxK1G!aJz0fAAM2yFzf+S+Qz-Iy`U)xAAT-O>>wq!&DD&)tMgragC4G;nOT z&||Zis&j05t5T7LpHnRJ?ynL8AQ_9*bJtZV>1K=e{6=5O^?kml#?FADaqzgRvROn9$galu_`gv8iN}X>3NLfn&3k9-AFm&v1XjC4{>~ z--#898H0i^W@iR!F}t9glwDyR84F7>vnL9^n5hiZVy2;-l)YdbnGQ=ad$TUE#cZp^9K*Uh%{(#3vMV{qr{xW zCX<-6(Lgc%wU~2Mok#j}RjJ6rue_G|=UvP2n{N0>e;!HwpLJ{WO80zvIW`xd;K+rF zCb(5}5$bul7&a-Fz{W*i3U9FRxQsFWeU5}&j=GY31shC~uS5eSZ>J?+rRtP?wJH@^ z_`%sS|Iln1eq9brzJ{c}&*tq1I;SD6;!kFb5 ztu&*AMy_RVMF=$x&>C-3b!xm_m5MC<)@_-8rN+BZ zaO7@96KK2#^}O5*o0R)t;~MXWH_-S1W7^mFAVR2dd#&*y)-#;8R*m~hjXcZ>lX?UN zM;>KFXa{=?^}IX|o0KPD<4HXUZ^mCAKE;?|eK=a4hC@xZXV_wzY|o;BN$#MNe2(=D zCz@5KlYE{PCiwyij=ac-P?9g9o|l(llky5|Jjqw#4U&9~F@BPx6Y@G7D#)z&emuIWkmq~txf+L@+9>E;^0`aCp1<-hQ$mGd`j46i2rrSfR`AKI`KGp;MYRsC{RB|7o%SofxYoA~!|sljceW#-azwKj9H%1N;67m{}7|K`$!(tT-%j?EcwucVJWvF8n42xATY&`c0 zc!S(WGA5Xlr7{W*H76_CVw#hq(LgQ3wU!-OcW?Z8T6SVzY8iuquVrTjYAw5t9%`e_^TRWVVO2 zoPs*mzpz;U!p60n4sW3448{anM#-6QC@p8P#iZqIG*C;0)^ZN(?i};9oXft{avlo4 zmh%~?wOoL5QZ9sb_fm+KAC@1AcSVwMxrIwr79@26P>Up^pHYvBk z#Qp>|^4{3P>^}IX^o0P|3<60hvH&~ZG!5IH`Lzz5@x{~q~8%$E3Mgyf( zYAMgK?%w+vtED{4zLfGD3ci%*8K|YafO1k^gmvU4SW0=B?I9_zpq`gkVUzM2Y+TCg z@CH)eV9fH2R*+EwB9~=vB80h**15mMx_hf@g3kSIqA>S&Q1Ekqmw`I>_fW=fmavZe z2bQ^i!1hq?AEKU@k6@GXF>E~dPv8x5|CBN9=e`Od%za0l`)90sQ%F4zEuRyGxqpFz zpZk{#^nWb>3bmws4eQ8%VVV6mj0k1_Kh*Q`Eo@T0gNJ|*=jCtMr2GRL&;DO{ zgX}xt0ov7M-w`3qevHn(h;?t?xY>6i3bXHwf}edC2Kw2rgIZG7g~fBJu*`lvMuf8O zhB_8tuvmb>##uk&7;b@?iUA2}Gtb2R3dyAr!eW|4k1z$^ofm%yB%1Nn!bz~$g zwTxnWNJ}N^c^M6xlpSH?T6SU&b%yh96jjI=_|%P}o!J;@t0ZQX z87n4OM`plM%uKe2#LPlHFWeYO*%vl0W)6Eq#Oz1k8jIOqMdzw^(_-c-M0LY?xW4c` z04^n_1_i91RFB}ULJIY~%!5rzEo?l?I(UOCr!-^QzpF4GA>1j(>!j*gckeEi>!d8A zaJn?0;7CUG2$DJo^}IB~CZ!2Bo>ViuK~gP@X+Nn22w_sY>!fVf-5bi4Iw>Ivlggst zNKW+#l4?agFAHIlvIsVw)M9vpq?RzI{iK#6gh@@%Ngd33rVGz_!}VM|geXkPK><&k zsvbd7hoYXB!(fwgIBYzrBj62^I+8K|F>Yj;90i9u#^u>!I>sH11|~UCC%K&UOfM!` zu9IBBzD#l@3XUA3dIU)xi+Wy;gT>RLu<;~MfHz3;M8+)7=+rVQv&de25<-~NB%RdB 
ztY@}iQk6QXQ;5Q(PDR0y(^QWjsnb!<%NejqITJRX)LHNbNuAA@Hj_%b-wsF#KCOV) zD6?`7!W}smwzy=joX3A74`jzb(l9e64XJsJ>27j9;*t3U3gkj|#=A;A)qy>|c!9`8 zaLtC;4kLz*kc&~#Zv;v@)hL&+QN2Q&UXX6ClS@%AQNIiUv`SxgIs%chrO%2#;YRcqFv}P13KVsmw4;oY^SE0%3lObtFF=Zn7SEI#| zYha1bPb{$i_^PR;i%aENc2mH8&`{cMXg?p^BOhKiOs+#1B=S|)+LopTjp@a5J$o7n zZ$OJ9H^RmU%j70@Q-u8Lg-<3o!w(s4qu!!WZ`G*YFz9R=NVn}I;+`c;K zI~4St8rrl45^`4ozMuD?K?%9r#cz8H+@tXC)%XQlAR+hpz~l6QjGI+G$S2TSAu*_U zXfd7XTuL{4(o6acRvS&MmRZ#$a=)8R+XMN4%I3j%Hf5L(5Bb?J&7po8^6=_lA5pN6 znqbT1F(0mxhB;?#$?+)sp%2>wE&Xsl$yvkLSX@@pPd-{5_{R$T6AkVU$Yg&&KK1ea z0h!DJS+zRk&lK|K8reI5)wVQe>TBF1eL}wQ!TYM@-NQIzTJg1=w3;~Tq>zA?FI~pA z4KOm07{6_kF#*Q!){GIMVEle{j6W#GAKPT)@$#oN zV?-zze_kEqFN*QkHW_*R`EAV@5emlNSI78=V*ImB#)s9V z58lZp04^EvPlwcmSyM};CyMIwa?;oXPZ~X}@2_JNeWQh|=9b!nWFxdIPxaJ2rcIx* zPql1J4=)=L!;gPSFF5APCa_f(hI`O*h2 zH-%lkSXfkXZq2zFvSvt>alto7v8pJQi9l`vwt!ov(1zhsb8NG|o>qXLIR>X6Z*hH_dH{ z+cAdQU*T@&;;Nyea-7m-0Q;hFU_@_y`PAsDmS@>T*Yh1gNjS^8M9+e%=&A9ZJ;BG5y07>Tr$feOzZr zQ>}~$Fn`+n>wMq4lI|FDcI-Q&YTT@GQpyl_c*{^Uz$f6w>*uhX1p)j#sbF6)Il5qj%y&J5!Y}t!n#V~gxX7?E8c!jyU#`LDvlBQg$ zRVDIl1TK>;TyX?J< zJ7Lh)VfP8+$L%ZgP}DP~7DdC1se{{*G%QD_Si^Nl9sJH_>*w7`90Hk7kUp^?u2&$I z2H8Uk`>p@>_OkOcF^~p=#Ai%KfgGek_SYaw2OKc?*$;cfKpF`W-`GtGq*;SBxga|q z`RnOl9QfrmF_0F51VfxwXKD8WU;&!)V#CfAA)I#;D~k$l+0qp{tfG^&!slMg@nBmz z+yz<)Z}_fNhL|d^(4DbG@Hw&=mRn(472j-`gw4_2uk;-;>?UluGRIVukOsM}zh zLQgbfMBt)t8Da{?$fY}!U3s1gjnrZA;mIFZ4!?J~lupYL3|2e?z*81d6byh`!Fpp+ zB1fW&f%YgOMQHQzIdU{CY5f)LEPqA2oShWCcV?J6i!0g{@EbT*GCaa@41A6p3mcwU z*+pe?9HSNZwwzh|uF6!OeR$#AkmC_=AU%OF5z-Uk!&6AG;qk0pBquXiarQ2phaNo= zE0I$WWnem$og+-A!RN^7uw>$g?t+Ozk!P@vA~a1Dvh|ZQ5o4e`i(MmhXT#^nIj~`2 z*+p_LgUOR2qXF$a21L-#htH7v5#f6gK0LMtOFr+5sY|$Mc$wjfahqVqpux=f3Zf01 zuM#A}`5JtVybeoFZ)b06oh)xKT2XS>h+I8j-%->ByT+U7VW54BKoQ!v;lsB(U`gwp z2KOjkoG6ud8Lx zylwjt!VGL5vv-8;6ZjnY6gJA1kX4LtpN+R{KSP*-?Q`~yuzdlaBVWQ6u;JcnnS90g zLN<4=4@Q!=YrjUAf$hKS9bx+hK74fqHp-TeZy8_6=I#>#8*kNqhcE-%_v{^E`vE?@ z2@{@ZsZ^uu-;zbYgrVTU=X0IwQ=$)`h(z z+SY;3k#%7U*l=gMRJt-=v3bA9;P(~GLUlM!c)@=?FO1n*eyb{DSUVY5;j7ELt<~n zM`^G#ximTHgLVVW=Ij=s*#bV?@P&=il*v|%kJIQu%gNSgH_-HDw+PKP@Hx^CHcFF_ zZ5bb@(Sw$g{%AMQY{za9ngQ@RvOR18P5I)|gzUihC=Jdq?x4++foM0-6ti1|rUX7m z2Ej&X$`Ue|@lhI_GCZ0gXgAOdWw!{;F!&r94jZ9KBxD5Rqck{Icr>MGH_()^TZAS7 zAMWdp;e}mSi+))gVa8$zQ$Y|IwM|P3Vu_HSb zaAX&eV<-3x9Ag+B;n*2IM|Ob?bJPYLyRuUON3G%*3%`M59K$t7+WReJH#qTP9qe4$ z9oE0OGXWJxCenpb^d7x&Z;aqKlu2lCxqIR&CVVd=RgKl@WHc9~Kn9Q8yMMF?e2z?k zB?X@_m?Tw*9xa~lc1B4W6j?9Dg5E`1K z7F9C>zw$421qf+{Fkd6^k-BWUR_YB9+(Ao+4XV0 zV7Ru%Pm$STU;(GG0H;aeG;17YQ=6@o76XjVCfm@=Y!(!t*a}57ipqvCSp$s9h9GV> zIfbJh8E<8Geb`xu@?2S@Jo>4@#i%&4gf6Q{eYw=#=6v)?KPl!eL;c6Zh9`!}Qp95l z2P+98dz07O7Y|m@hiGWN4CF#@0cd?E1P4j3Rtm74D7L#wS!Q53R52W;8Ti!_mtnIu zlb6HI=tmgPk5uSKx#-rnE_5$IGv!F;k~iZXZNOcwa96mvna!}zxzJns(B9FbU~pum znc^4&#j%RwIF~~G#D$O7jg;dVst!iHi{c&$gOgYx+X;v**u?H;IFS&MW8g{f;T=m@ z?!D?bH@;?+oWfAWMrG<5z<|uZRaCpEL{3Gdf$KB^M7U0e&yh1=LtOYA5knPM3Aoh# zp@6GE81BmJn^a^z3;hhtXA>^Md=7kg5*RkbJX+3UsAA^4cBh87{R*hgN1%b~0``wk zT?iju_=F`D)-3#NCa%veW-PffT=5&gE-`>zs=zLbfR)STJ{TAD2Cyp(U{@-zt0G_( zarAj=?9uWCV`JD@ihhX(1NK*pk6?cdA70dj z4Phrn$Ty6QVPpOIKQtJyzh!&``#bm?`5rcmT`E5?R$=pJ!lCXN+q+s)r#1d0^CO}R zL_e`}gy?7Z9Qg${OjIVnGBzM8FF0lTM86@*K=eC1M~MD_4=)76g2-AC!X9bH{>zO0 zw*mVfh5fI_{>#NKUE}k9u%s@L4!D1BP}C9bsG=hH@c1Vz6lI3_mu^i<{(|oDXeEmp_@u!eK!T~awIWIHejrp zG`uR-7kK_PPZpO~-4S6J-VNC?GGTha=SWZ3@XV}~jToy)`m2F9Tzm(YbmZI^9Soek zh!f%51U^SLg$;9#mdzNeIJfmiIdZ*UNZA|l2FgB!iBN71pCenqau)UQ7e#~GCUQ5N z+PLn$5y+=yY~a*+t)Pg1I^^c2J#X`K1h?-8_B(! 
z_-jT!*i1geKt5EF57Xp}+8|$>VIOWLA7LObRpe!we0dw>YcuQ#GkLjzyh4$W)Z}Lx z$-SQe)@;~EnaL{+$%tjKrKGGv8zb-yVu@ zip%Gn11rj;im{4$2PIWq2eifQ9#TqVPjoVzFQyVJa=w@bAMP*1@_f-Jsy)zJL&^40)5nZ$3bA*pigigH`6|yg56&cTn z-W^EqhD!7Vq3`67dk#7pxc4JognNJZ9GMFn<{l{rFjjH5b1R2jH3&3t9mxI>t`vN@ zvklAbx0Ap9hH?9O7xdn)(S}UDDQnHFbq3b7Vx6y9`KL)-UBT`cIrh0Hcs!6+JkouF zi#NyW&4iYLut5=KG$FqR>k|6AW1D=Tn{fx3`5Fy;O^UDC<@4t1$Pv=QShWdv^rw%% zHx^v9c@OaF#IRZYD_3e?03gFAwuv9v!~&lqS=i8Y8d)ki#;RQQiRaSxOvZ!9dNEND z6KtW-h_!;kkmW)$N3vW5A0C2+Uj0#3*ZBkoePHm|nwS!e^ zhq$Tv8#5seW7Poe8%wg?T_xDI#xQ>h>Jcdy`l=P8NJPbZOAPpPd*5z^pV^yk` z`P;g!m6$h*Yr1gr?pKEvkRC)E6k4DsKG=oPR$v>Q2 z46Pi)Hq!923|LiSZA1rI{(ly_+$?m3LFh^)^cYvDKNCmFv5Zxlcl)+BFP`fQ6z~Xg z9Qqi>_jp1@#`gsH@F(|R!?S9XoWxi~JHXokB7+xWJsDjLtfvqu!g?xv_?`-En6*+) zXRKo7lat;#%6-q&y)L)+hi#J97sKbsC9t6}9fiLi$XG=m zoR=b-vf#F!yIl>rTm~RRE|(KOlFJqFIdUZ|a>=ac@3o`bm`jEP{Oc?KP$E~E6S&%t zz%?p?YuyC=ZIO`c7^?j7`ud%GhJbB2j|AD9fu-p>Rg$vyy|BM-uclN}`wF;*ozXHDkC|2@}- zL1f7F5%NcJeH1=N9)k_%S}Bh+b}a{1U)ZhXz&-&cL$XgYfk?7X!G}Lu2^&s!v^>Mu zwHw&~TdvQ7$dK!ET0*|kQyQ`v)R z@Aw>?9^z0z3MKLq)EEZjWr~ap$Sd$U@+vF_Bvb7luJ6V$v~!W=@vq&*me(t|WC;|7 zF7pd^{eoc+!sAfk0eH)W8nHgaeb({w(h)I zuG$FKM`o^%4P2inu1{SqfA}h76=T)#ZLDX~lG^a#$!7>KjNRw#7a6-R;KQpEu;HC{ zq{-;gqZeP;msUV;4(0jrcB zJ+RrKiR8Xd=x^Mg5M!YEnO!3}{{o*Qzru1N@eekwW+J5{8}~Oe*Y5_dKNQ!WE|<4) zM~{%d7#kV*Cixp)gNT0^91-y^e2#R$t5czYA6+UP;c*9^uZkJKiWm?9>ja-8og-jn z(!~P{4}4R25|+q1h%wNt%dQccuJGY=&9EGJ{t>Fx417~$;JcZ*);Dl%pt!nguA!^n zgL4tC4b5CV3|u`G*G4XvzmY3sW5yQl!MX6@NiPH##%>e#i;Uf-@Zp^a*zooqDZLr1 zIQT@W`Yl(zM%M>^1N7z$k3eq$AKsII4MUHTtr)BJVZM`R0NdIC)>naT69KD~ejeCt z^+bfabHzl%<542pBFaG2pPeIFZwH?v17JCs_{YNBVF=&O53hYm7TLtxo7r|Sunkmf z#hR^`&o%~w-sWAu;B7E|H_H90K=e1+5_1lN3^@!|ISkP`Oj`RK@UE@OHJZava}L7{ zISf}hjL&TbJ)d@!>%fau{wuQYnubVQfHo%a|*aOduN_XvsPIt-m8d%;_9tY!b`QFgil*xLojTab!P+>PO#a%&0Dv{ozn2@a%$y=B&&`b*>!1s44&AKV_;qrqwGr*_^c2=34p6 z3<5Iz9x7gp(f9|#g+GzSo<9D}Qklm{7qymcA=FOk3~|(#A*L56hTy{ESxeAIfn zv}C3hm+?c&jR*+RXhO}AW_HpN0azw2aA?2&{D!4Q2=|S zf;~#Zei*>!W3Z#zgTEL!rk0~HMR*@d3 zNyqSl-`&xUUu)9xc9fn_KzgDgJ;^0iJJrl;>{KViq1jJiTUDoEx70g$DR*$p|Kd{a z;HM%gn5w6thHnC~C#LGU!A3hng8@$Oq5MSg)O1c7Y5|1j3(qV7I!gha?E+cX>Xgp$ zQM|c%Fi?1?!Bh%?sq}atia9xd182fLQk6kS%*muYCj)%RVW*s1ko9>g>+^NiN9n9D zP+6Vd8}|yucBRXf*(BJwR~ZPWwnI3$UmJvKU-jF9@lP@YhT5Fmp%T1PWwV*pwW_z( zx2lgdV9FP`(y&%lZE6jw+Q1q<Km#=p8zRb-8s z@{fwd;kK%?HFnBhto3GX9c#jrKUm{m*{SMiRjJDk_4gU8I$3*7`I~KvFhEsZtm-L$ zvbLPH&8^v0>svKbex`?i)TXMtRX62F)~;r)ht)9Ud)D|jpG{fElWQ3v_o%je4Q-`zpK80GZG4Tnz9m=Fl*+V7syUUF2jHHj4x}xD zZwl8~4Iq_TYl%Flx;+G2Y~2W{$5s`&iD50CP0PY`T}`cpm#5S6Fk>QBlB7JU5I3A=Z8go>dRnXEu5D>u(AdyiU!#)9 z%43Lev%+8PNHu30TAJM^c^obHoRh-1v8onX)wZfn124p%Dq&a-??pd}=Ghr&9bT8p zrG}^Q$3jxsY{T$L(`S^8C`$~VI;kQxb^P#pE46TFQ+-9MskXVXWq1=ls@#|zo^9H0 zyMqU6xPd$N8<-eYF|4$vu?1fktIZ7@(r+L@(`+Et&=HmR|3F8cA}4O{bf{0YVoaV! zb*?X@GM$as%( a, 0 ) * b; } + inline __host__ __device__ constexpr cxsmpl + operator*( const cxsmpl& a, const double& b ) + { + return a * cxsmpl( b, 0 ); + } + template inline __host__ __device__ constexpr cxsmpl operator/( const cxsmpl& a, const cxsmpl& b ) diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index b06fbec52e..3f8e2f83ed 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. 
* * WARNING: DO NOT USE FOR PRODUCTION * @@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg.mg +import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005280971527099609  +DEBUG: model prefixing takes 0.005356311798095703  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,64 +155,35 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.156 s +1 processes with 123 diagrams generated in 0.159 s Total: 1 processes with 123 diagrams -output standalone_cudacpp CODEGEN_cudacpp_gg_ttgg -Load PLUGIN.CUDACPP_SA_OUTPUT -Output will be done with PLUGIN: CUDACPP_SA_OUTPUT +output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg +Load PLUGIN.CUDACPP_OUTPUT +Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 186]  -DEBUG: type(subproc_group)= [output.py at line 187]  -DEBUG: type(fortran_model)= [output.py at line 188]  -DEBUG: type(me)= me=0 [output.py at line 189]  -DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  -DEBUG: proc_id =  0 [model_handling.py at line 1045]  -INFO: Creating files in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg -DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  -DEBUG: self.include_multi_channel is not yet defined: this is standalone_cudacpp mode [model_handling.py at line 1301]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h -DEBUG: Entering 
PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc -DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]  -DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]  -DEBUG: self.include_multi_channel =  False [model_handling.py at line 1144]  -DEBUG: self.support_multichannel =  True [model_handling.py at line 1145]  -DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1162]  -DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1163]  -DEBUG: multi_channel_map =  None [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1711]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1824]  -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_processidfile [model_handling.py at line 1389]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxgg.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.436 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  +INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h +FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. +Generated helas calls for 1 subprocesses (123 diagrams) in 0.420 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.313 s +ALOHA: aloha creates 5 routines in 0.315 s VVV1 VVV1 FFV1 @@ -225,23 +196,17 @@ ALOHA: aloha creates 5 routines in 0.313 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. -DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  +INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
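Note on the hand-written hunks that follow: the Bridge.h change stops hard-coding the param_card location and instead probes the expected relative path, falling back one directory up; the MadgraphTest.h change turns event dumping from a compile-time constant into a runtime switch driven by the CUDACPP_RUNTEST_DUMPEVENTS environment variable; and the bulk of the CPPProcess.cc hunks mechanically insert an explicit 1.0 rescaling factor after each COUPs argument of the generated helas calls. A minimal standalone sketch of the two runtime-configuration patterns (only the relative path and the environment variable name are taken from the patch; the main() harness is hypothetical):

  // Sketch only: mirrors the Bridge.h param-card probe and the MadgraphTest.h
  // environment toggle from the hunks below; compile with -std=c++17.
  #include <cstdlib>
  #include <filesystem>
  #include <iostream>
  #include <string>
  int main()
  {
    // Bridge.h pattern: try the expected relative path, else look one level up
    std::string paramCard = "../../Cards/param_card.dat";
    if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard;
    std::cout << "param card: " << paramCard << std::endl;
    // MadgraphTest.h pattern: any non-empty value enables event dumping
    const char* dumpEventsC = std::getenv( "CUDACPP_RUNTEST_DUMPEVENTS" );
    const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" );
    std::cout << "dump events: " << ( dumpEvents ? "yes" : "no" ) << std::endl;
    return 0;
  }

The fallback presumably lets the same binary run one directory deeper than usual (e.g. from a build subdirectory below SubProcesses/P1_*) and still find the card.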
quit
-real 0m1.504s
-user 0m1.444s
-sys 0m0.051s
+real 0m1.429s
+user 0m1.363s
+sys 0m0.054s
+Code generation completed in 2 seconds
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h
index f37c972b24..89437b4c42 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h
@@ -18,6 +18,7 @@
#include
#include
#include
+#include <filesystem>
#include
#include
#include
@@ -244,14 +245,21 @@ namespace mg5amcCpu
}
std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl;
- CPPProcess process( /*verbose=*/false );
m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) );
#else
std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
- CPPProcess process( /*verbose=*/false );
m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
#endif // MGONGPUCPP_GPUIMPL
- process.initProc( "../../Cards/param_card.dat" );
+ // Create a process object, read param card and set parameters
+ // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
+ // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads?
+ CPPProcess process( /*verbose=*/false );
+ std::string paramCard = "../../Cards/param_card.dat";
+ if( !std::filesystem::exists( paramCard ) )
+ {
+ paramCard = "../" + paramCard;
+ }
+ process.initProc( paramCard );
}
#ifdef MGONGPUCPP_GPUIMPL
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h
index 9c467b1e04..6a7d9c05c0 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h
@@ -39,6 +39,8 @@
#elif defined __HIPCC__
+#include "hip/hip_runtime.h"
+
#define gpuError_t hipError_t
#define gpuPeekAtLastError hipPeekAtLastError
#define gpuGetErrorString hipGetErrorString
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h
index 176338151a..a64c05c26a 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h
@@ -14,6 +14,7 @@
#include
#include
+#include <filesystem>
#include
#include
#include
@@ -215,19 +216,16 @@ TEST_P( MadgraphTest, CompareMomentaAndME )
#endif
constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
// Dump events to a new reference file?
- constexpr bool dumpEvents = false;
- std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.'
+ testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; - while( dumpFileName.find( '/' ) != std::string::npos ) - { - dumpFileName.replace( dumpFileName.find( '/' ), 1, "_" ); - } + const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); + const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); + const std::string refFileName = testDriver->getRefFileName(); + const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); std::ofstream dumpFile; if( dumpEvents ) { dumpFile.open( dumpFileName, std::ios::trunc ); } // Read reference data - const std::string refFileName = testDriver->getRefFileName(); std::map referenceData; if( !dumpEvents ) { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc @@ -112,10 +112,17 @@ namespace mg5amcCpu // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu bool ok = true; // this is just an assumption! const std::string tag = "arm neon (128bit as in SSE4.2)"; -#else +#elif defined( __x86_64__ ) || defined( __i386__ ) bool known = true; bool ok = __builtin_cpu_supports( "sse4.2" ); const std::string tag = "nehalem (SSE4.2)"; +#else // AV FIXME! Added by OM for Mac, should identify the correct __xxx__ flag that should be targeted + bool known = false; // __builtin_cpu_supports is not supported + // See https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions-Available-on-all-Configurations.html + // See https://stackoverflow.com/q/62783908 + // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu + bool ok = true; // this is just an assumption! + const std::string tag = "arm neon (128bit as in SSE4.2)"; #endif #else bool known = true; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index 8f585d1aef..d59cc349e3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08
// By the MadGraph5_aMC@NLO Development Team
// Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
//==========================================================================
@@ -251,11 +251,11 @@ namespace mg5amcCpu
vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 );
- VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 0., 0., w_fp[6] );
- FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 0., 0., w_fp[7] );
+ VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] );
+ FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] );
// Amplitude(s) for diagram number 1
- VVVV1_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], &amp_fp[0] );
+ VVVV1_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -267,7 +267,7 @@ namespace mg5amcCpu
jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], &amp_fp[0] );
+ VVVV3_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -279,7 +279,7 @@ namespace mg5amcCpu
jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], &amp_fp[0] );
+ VVVV4_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -295,10 +295,10 @@ namespace mg5amcCpu
// *** DIAGRAM 2 OF 123 ***
// Wavefunction(s) for diagram number 2
- VVV1P0_1( w_fp[6], w_fp[4], COUPs[0], 0., 0., w_fp[8] );
+ VVV1P0_1( w_fp[6], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[8] );
// Amplitude(s) for diagram number 2
- VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -314,10 +314,10 @@ namespace mg5amcCpu
// *** DIAGRAM 3 OF 123 ***
// Wavefunction(s) for diagram number 3
- VVV1P0_1( w_fp[6], w_fp[5], COUPs[0], 0., 0., w_fp[9] );
+ VVV1P0_1( w_fp[6], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[9] );
// Amplitude(s) for diagram number 3
- VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -333,10 +333,10 @@ namespace mg5amcCpu
// *** DIAGRAM 4 OF 123 ***
// Wavefunction(s) for diagram number 4
- VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 0., 0., w_fp[10] );
+ VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] );
// Amplitude(s) for diagram number 4
- VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -352,11 +352,11 @@ namespace mg5amcCpu
// *** DIAGRAM 5 OF 123 ***
// Wavefunction(s) for diagram number 5
- FFV1_1( w_fp[2], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[11] );
- FFV1_2( w_fp[3], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[12] );
+ FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] );
+ FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
// Amplitude(s) for diagram number 5
- FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -369,7 +369,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 6
- FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -381,10 +381,10 @@ namespace mg5amcCpu
// *** DIAGRAM 7 OF 123 ***
// Wavefunction(s) for diagram number 7
- FFV1_2( w_fp[3], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[13] );
+ FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] );
// Amplitude(s) for diagram number 7
- FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -394,10 +394,10 @@ namespace mg5amcCpu
// *** DIAGRAM 8 OF 123 ***
// Wavefunction(s) for diagram number 8
- FFV1_1( w_fp[2], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[14] );
+ FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] );
// Amplitude(s) for diagram number 8
- FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -410,7 +410,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 9
- FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -422,10 +422,10 @@ namespace mg5amcCpu
// *** DIAGRAM 10 OF 123 ***
// Wavefunction(s) for diagram number 10
- FFV1_2( w_fp[3], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[15] );
+ FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] );
// Amplitude(s) for diagram number 10
- FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -435,10 +435,10 @@ namespace mg5amcCpu
// *** DIAGRAM 11 OF 123 ***
// Wavefunction(s) for diagram number 11
- FFV1_1( w_fp[2], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[16] );
+ FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
// Amplitude(s) for diagram number 11
- FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -451,7 +451,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 12
- FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -466,7 +466,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 13
- FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -479,7 +479,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 14
- FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -494,7 +494,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 15
- FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -509,7 +509,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 16
- FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -521,12 +521,12 @@ namespace mg5amcCpu
// *** DIAGRAM 17 OF 123 ***
// Wavefunction(s) for diagram number 17
- FFV1_1( w_fp[2], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[12] );
- FFV1_2( w_fp[3], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[16] );
- FFV1_1( w_fp[12], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[8] );
+ FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+ FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
+ FFV1_1( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
// Amplitude(s) for diagram number 17
- FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -535,10 +535,10 @@ namespace mg5amcCpu
// *** DIAGRAM 18 OF 123 ***
// Wavefunction(s) for diagram number 18
- FFV1_1( w_fp[12], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[9] );
+ FFV1_1( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
// Amplitude(s) for diagram number 18
- FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -550,7 +550,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 19
- FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -560,11 +560,11 @@ namespace mg5amcCpu
// *** DIAGRAM 20 OF 123 ***
// Wavefunction(s) for diagram number 20
- VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 0., 0., w_fp[6] );
- FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 0., 0., w_fp[17] );
+ VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] );
+ FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] );
// Amplitude(s) for diagram number 20
- VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -579,7 +579,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 21
- FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -592,7 +592,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 22
- FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -602,10 +602,10 @@ namespace mg5amcCpu
// *** DIAGRAM 23 OF 123 ***
// Wavefunction(s) for diagram number 23
- VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 0., 0., w_fp[18] );
+ VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[18] );
// Amplitude(s) for diagram number 23
- VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -620,7 +620,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 24
- FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -633,7 +633,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 25
- FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -643,10 +643,10 @@ namespace mg5amcCpu
// *** DIAGRAM 26 OF 123 ***
// Wavefunction(s) for diagram number 26
- FFV1_1( w_fp[12], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[19] );
+ FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[19] );
// Amplitude(s) for diagram number 26
- FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -658,7 +658,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 27
- FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -670,7 +670,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 28
- FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -682,7 +682,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 29
- FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -694,7 +694,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 30
- FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -707,7 +707,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 31
- VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -719,12 +719,12 @@ namespace mg5amcCpu
// *** DIAGRAM 32 OF 123 ***
// Wavefunction(s) for diagram number 32
- VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[17] );
- VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[19] );
- VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[8] );
+ VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] );
+ VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[19] );
+ VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[8] );
// Amplitude(s) for diagram number 32
- FFV1_0( w_fp[3], w_fp[12], w_fp[17], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[12], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -732,7 +732,7 @@ namespace mg5amcCpu
jamp_sv[1] -= amp_sv[0];
jamp_sv[3] -= amp_sv[0];
jamp_sv[5] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[12], w_fp[19], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[12], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -740,7 +740,7 @@ namespace mg5amcCpu
jamp_sv[2] += amp_sv[0];
jamp_sv[3] -= amp_sv[0];
jamp_sv[4] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[12], w_fp[8], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[12], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -752,12 +752,12 @@ namespace mg5amcCpu
// *** DIAGRAM 33 OF 123 ***
// Wavefunction(s) for diagram number 33
- FFV1_2( w_fp[3], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[12] );
- FFV1_1( w_fp[2], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[9] );
- FFV1_2( w_fp[12], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[20] );
+ FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+ FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
+ FFV1_2( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
// Amplitude(s) for diagram number 33
- FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -766,10 +766,10 @@ namespace mg5amcCpu
// *** DIAGRAM 34 OF 123 ***
// Wavefunction(s) for diagram number 34
- FFV1_2( w_fp[12], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[21] );
+ FFV1_2( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
// Amplitude(s) for diagram number 34
- FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -781,7 +781,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 35
- FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -791,10 +791,10 @@ namespace mg5amcCpu
// *** DIAGRAM 36 OF 123 ***
// Wavefunction(s) for diagram number 36
- FFV1P0_3( w_fp[12], w_fp[2], COUPs[1], 0., 0., w_fp[22] );
+ FFV1P0_3( w_fp[12], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[22] );
// Amplitude(s) for diagram number 36
- VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -809,7 +809,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 37
- FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -822,7 +822,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 38
- FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -835,7 +835,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 39
- VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -850,7 +850,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 40
- FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -863,7 +863,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 41
- FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -873,10 +873,10 @@ namespace mg5amcCpu
// *** DIAGRAM 42 OF 123 ***
// Wavefunction(s) for diagram number 42
- FFV1_2( w_fp[12], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[23] );
+ FFV1_2( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
// Amplitude(s) for diagram number 42
- FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -888,7 +888,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 43
- FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -900,7 +900,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 44
- FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -912,7 +912,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 45
- FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -924,7 +924,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 46
- FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -937,7 +937,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 47
- VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -952,7 +952,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 48
- FFV1_0( w_fp[12], w_fp[2], w_fp[17], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[12], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -960,7 +960,7 @@ namespace mg5amcCpu
jamp_sv[11] -= amp_sv[0];
jamp_sv[17] -= amp_sv[0];
jamp_sv[23] += amp_sv[0];
- FFV1_0( w_fp[12], w_fp[2], w_fp[19], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[12], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -968,7 +968,7 @@ namespace mg5amcCpu
jamp_sv[15] += amp_sv[0];
jamp_sv[17] -= amp_sv[0];
jamp_sv[21] += amp_sv[0];
- FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -980,11 +980,11 @@ namespace mg5amcCpu
// *** DIAGRAM 49 OF 123 ***
// Wavefunction(s) for diagram number 49
- VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 0., 0., w_fp[12] );
- FFV1_2( w_fp[3], w_fp[12], COUPs[1], cIPD[0], cIPD[1], w_fp[22] );
+ VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[12] );
+ FFV1_2( w_fp[3], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
// Amplitude(s) for diagram number 49
- FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -994,10 +994,10 @@ namespace mg5amcCpu
// *** DIAGRAM 50 OF 123 ***
// Wavefunction(s) for diagram number 50
- VVV1P0_1( w_fp[12], w_fp[5], COUPs[0], 0., 0., w_fp[23] );
+ VVV1P0_1( w_fp[12], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[23] );
// Amplitude(s) for diagram number 50
- FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1012,7 +1012,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 51
- FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1022,10 +1022,10 @@ namespace mg5amcCpu
// *** DIAGRAM 52 OF 123 ***
// Wavefunction(s) for diagram number 52
- FFV1_1( w_fp[2], w_fp[12], COUPs[1], cIPD[0], cIPD[1], w_fp[20] );
+ FFV1_1( w_fp[2], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
// Amplitude(s) for diagram number 52
- FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1038,7 +1038,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 53
- FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1053,7 +1053,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 54
- FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1066,7 +1066,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 55
- FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1081,7 +1081,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 56
- FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1096,7 +1096,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 57
- VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1115,7 +1115,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 58
- VVVV1_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], &amp_fp[0] );
+ VVVV1_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1127,7 +1127,7 @@ namespace mg5amcCpu
jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], &amp_fp[0] );
+ VVVV3_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1139,7 +1139,7 @@ namespace mg5amcCpu
jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], &amp_fp[0] );
+ VVVV4_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1155,10 +1155,10 @@ namespace mg5amcCpu
// *** DIAGRAM 59 OF 123 ***
// Wavefunction(s) for diagram number 59
- VVV1P0_1( w_fp[12], w_fp[1], COUPs[0], 0., 0., w_fp[21] );
+ VVV1P0_1( w_fp[12], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] );
// Amplitude(s) for diagram number 59
- VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1177,7 +1177,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 60
- VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1196,7 +1196,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 61
- FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1211,7 +1211,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 62
- FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1224,7 +1224,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 63
- FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1239,7 +1239,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 64
- FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1249,11 +1249,11 @@ namespace mg5amcCpu
// *** DIAGRAM 65 OF 123 ***
// Wavefunction(s) for diagram number 65
- VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 0., 0., w_fp[20] );
- FFV1_2( w_fp[3], w_fp[20], COUPs[1], cIPD[0], cIPD[1], w_fp[21] );
+ VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] );
+ FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
// Amplitude(s) for diagram number 65
- FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1263,10 +1263,10 @@ namespace mg5amcCpu
// *** DIAGRAM 66 OF 123 ***
// Wavefunction(s) for diagram number 66
- VVV1P0_1( w_fp[20], w_fp[4], COUPs[0], 0., 0., w_fp[22] );
+ VVV1P0_1( w_fp[20], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[22] );
// Amplitude(s) for diagram number 66
- FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1281,7 +1281,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 67
- FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1291,10 +1291,10 @@ namespace mg5amcCpu
// *** DIAGRAM 68 OF 123 ***
// Wavefunction(s) for diagram number 68
- FFV1_1( w_fp[2], w_fp[20], COUPs[1], cIPD[0], cIPD[1], w_fp[23] );
+ FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
// Amplitude(s) for diagram number 68
- FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1307,7 +1307,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 69
- FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1322,7 +1322,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 70
- FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1335,7 +1335,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 71
- FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1350,7 +1350,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 72
- FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1365,7 +1365,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 73
- VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1384,7 +1384,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 74
- VVVV1_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], &amp_fp[0] );
+ VVVV1_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1396,7 +1396,7 @@ namespace mg5amcCpu
jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], &amp_fp[0] );
+ VVVV3_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1408,7 +1408,7 @@ namespace mg5amcCpu
jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], &amp_fp[0] );
+ VVVV4_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1424,10 +1424,10 @@ namespace mg5amcCpu
// *** DIAGRAM 75 OF 123 ***
// Wavefunction(s) for diagram number 75
- VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 0., 0., w_fp[12] );
+ VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[12] );
// Amplitude(s) for diagram number 75
- VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1446,7 +1446,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 76
- VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1465,7 +1465,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 77
- FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1480,7 +1480,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 78
- FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1493,7 +1493,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 79
- FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1508,7 +1508,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 80
- FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1518,10 +1518,10 @@ namespace mg5amcCpu
// *** DIAGRAM 81 OF 123 ***
// Wavefunction(s) for diagram number 81
- FFV1_1( w_fp[9], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[23] );
+ FFV1_1( w_fp[9], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
// Amplitude(s) for diagram number 81
- FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1530,10 +1530,10 @@ namespace mg5amcCpu
// *** DIAGRAM 82 OF 123 ***
// Wavefunction(s) for diagram number 82
- FFV1_2( w_fp[15], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[12] );
+ FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
// Amplitude(s) for diagram number 82
- FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1545,7 +1545,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 83
- FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1554,10 +1554,10 @@ namespace mg5amcCpu
// *** DIAGRAM 84 OF 123 ***
// Wavefunction(s) for diagram number 84
- FFV1_2( w_fp[13], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[21] );
+ FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
// Amplitude(s) for diagram number 84
- FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1569,7 +1569,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 85
- FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1579,10 +1579,10 @@ namespace mg5amcCpu
// *** DIAGRAM 86 OF 123 ***
// Wavefunction(s) for diagram number 86
- VVV1P0_1( w_fp[0], w_fp[10], COUPs[0], 0., 0., w_fp[23] );
+ VVV1P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[23] );
// Amplitude(s) for diagram number 86
- FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1594,10 +1594,10 @@ namespace mg5amcCpu
// *** DIAGRAM 87 OF 123 ***
// Wavefunction(s) for diagram number 87
- FFV1_2( w_fp[16], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[22] );
+ FFV1_2( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
// Amplitude(s) for diagram number 87
- FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1606,10 +1606,10 @@ namespace mg5amcCpu
// *** DIAGRAM 88 OF 123 ***
// Wavefunction(s) for diagram number 88
- FFV1_1( w_fp[11], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[20] );
+ FFV1_1( w_fp[11], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
// Amplitude(s) for diagram number 88
- FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1621,7 +1621,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 89
- FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1630,10 +1630,10 @@ namespace mg5amcCpu
// *** DIAGRAM 90 OF 123 ***
// Wavefunction(s) for diagram number 90
- FFV1_1( w_fp[14], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[24] );
+ FFV1_1( w_fp[14], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] );
// Amplitude(s) for diagram number 90
- FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1645,7 +1645,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 91
- FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1658,7 +1658,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 92
- FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1673,7 +1673,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 93
- VVVV1_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], &amp_fp[0] );
+ VVVV1_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1685,7 +1685,7 @@ namespace mg5amcCpu
jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], &amp_fp[0] );
+ VVVV3_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1697,7 +1697,7 @@ namespace mg5amcCpu
jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], &amp_fp[0] );
+ VVVV4_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1713,10 +1713,10 @@ namespace mg5amcCpu
// *** DIAGRAM 94 OF 123 ***
// Wavefunction(s) for diagram number 94
- VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 0., 0., w_fp[22] );
+ VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[22] );
// Amplitude(s) for diagram number 94
- VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1732,10 +1732,10 @@ namespace mg5amcCpu
// *** DIAGRAM 95 OF 123 ***
// Wavefunction(s) for diagram number 95
- VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 0., 0., w_fp[25] );
+ VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[25] );
// Amplitude(s) for diagram number 95
- VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1754,7 +1754,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 96
- FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1769,7 +1769,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 97
- FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -1782,7 +1782,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 98
- FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
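An aside on the MatrixElementKernels.cc hunk earlier in this patch: __builtin_cpu_supports is a GCC/Clang builtin that exists only for x86 targets, so the patch guards it behind __x86_64__/__i386__ and, on other architectures, reports the SIMD capability as unknown and assumes it is available. A minimal sketch of that guard (standalone; the function name and main() harness are illustrative, not from the patch):

  #include <iostream>
  // Guarded CPU-feature probe in the style of MatrixElementKernels.cc:
  // on x86 the builtin really queries the CPU at run time, elsewhere the
  // answer is an explicit assumption rather than a real check.
  bool simdOk( bool& known )
  {
  #if defined( __x86_64__ ) || defined( __i386__ )
    known = true; // x86: __builtin_cpu_supports is available
    return __builtin_cpu_supports( "sse4.2" );
  #else
    known = false; // e.g. ARM or PowerPC: no __builtin_cpu_supports
    return true;   // this is just an assumption!
  #endif
  }
  int main()
  {
    bool known = false;
    const bool ok = simdOk( known );
    std::cout << ( known ? "probed" : "assumed" ) << ": simd " << ( ok ? "ok" : "missing" ) << std::endl;
    return 0;
  }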
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1797,7 +1797,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 99 - FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1810,7 +1810,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 100 - VVVV1_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1822,7 +1822,7 @@ namespace mg5amcCpu jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1834,7 +1834,7 @@ namespace mg5amcCpu jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1850,10 +1850,10 @@ namespace mg5amcCpu // *** DIAGRAM 101 OF 123 *** // Wavefunction(s) for diagram number 101 - VVV1P0_1( w_fp[0], w_fp[18], COUPs[0], 0., 0., w_fp[6] ); + VVV1P0_1( w_fp[0], w_fp[18], COUPs[0], 1.0, 0., 0., w_fp[6] ); // Amplitude(s) for diagram number 101 - VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1872,7 +1872,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 102 - VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1891,7 +1891,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 103 - FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1906,7 +1906,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 104 - FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1919,7 +1919,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for 
diagram number 105 - FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1934,7 +1934,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 106 - FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1947,7 +1947,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 107 - VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1959,7 +1959,7 @@ namespace mg5amcCpu jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1971,7 +1971,7 @@ namespace mg5amcCpu jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1990,7 +1990,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 108 - VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2009,7 +2009,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 109 - VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2028,7 +2028,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 110 - FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2040,7 +2040,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 111 - FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2052,7 +2052,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 112 - FFV1_0( w_fp[15], 
w_fp[24], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2064,7 +2064,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 113 - FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2073,12 +2073,12 @@ namespace mg5amcCpu // *** DIAGRAM 114 OF 123 *** // Wavefunction(s) for diagram number 114 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[12] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[24] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[21] ); + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[12] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[24] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] ); // Amplitude(s) for diagram number 114 - VVV1_0( w_fp[12], w_fp[7], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[12], w_fp[7], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2090,7 +2090,7 @@ namespace mg5amcCpu jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[24], w_fp[7], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[24], w_fp[7], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2102,7 +2102,7 @@ namespace mg5amcCpu jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[21], w_fp[7], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[21], w_fp[7], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2121,7 +2121,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 115 - FFV1_0( w_fp[3], w_fp[14], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[14], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2129,7 +2129,7 @@ namespace mg5amcCpu jamp_sv[19] -= amp_sv[0]; jamp_sv[21] -= amp_sv[0]; jamp_sv[23] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[14], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[14], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2137,7 +2137,7 @@ namespace mg5amcCpu jamp_sv[20] += amp_sv[0]; jamp_sv[21] -= amp_sv[0]; jamp_sv[22] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2152,7 +2152,7 
@@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 116 - FFV1_0( w_fp[13], w_fp[2], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[13], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2160,7 +2160,7 @@ namespace mg5amcCpu jamp_sv[2] -= amp_sv[0]; jamp_sv[8] -= amp_sv[0]; jamp_sv[14] += amp_sv[0]; - FFV1_0( w_fp[13], w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[13], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2168,7 +2168,7 @@ namespace mg5amcCpu jamp_sv[6] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; jamp_sv[12] += amp_sv[0]; - FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2180,12 +2180,12 @@ namespace mg5amcCpu // *** DIAGRAM 117 OF 123 *** // Wavefunction(s) for diagram number 117 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[21] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[13] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[24] ); + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] ); // Amplitude(s) for diagram number 117 - VVV1_0( w_fp[21], w_fp[7], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[21], w_fp[7], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2197,7 +2197,7 @@ namespace mg5amcCpu jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[13], w_fp[7], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[13], w_fp[7], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2209,7 +2209,7 @@ namespace mg5amcCpu jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[24], w_fp[7], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[24], w_fp[7], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2228,7 +2228,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 118 - FFV1_0( w_fp[3], w_fp[11], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[11], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2236,7 +2236,7 @@ namespace mg5amcCpu jamp_sv[13] -= amp_sv[0]; jamp_sv[15] -= amp_sv[0]; jamp_sv[17] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[11], w_fp[13], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[11], w_fp[13], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the 
code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2244,7 +2244,7 @@ namespace mg5amcCpu jamp_sv[14] += amp_sv[0]; jamp_sv[15] -= amp_sv[0]; jamp_sv[16] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[11], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[11], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2259,7 +2259,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 119 - FFV1_0( w_fp[15], w_fp[2], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2267,7 +2267,7 @@ namespace mg5amcCpu jamp_sv[4] -= amp_sv[0]; jamp_sv[10] -= amp_sv[0]; jamp_sv[20] += amp_sv[0]; - FFV1_0( w_fp[15], w_fp[2], w_fp[13], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[2], w_fp[13], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2275,7 +2275,7 @@ namespace mg5amcCpu jamp_sv[7] += amp_sv[0]; jamp_sv[10] -= amp_sv[0]; jamp_sv[18] += amp_sv[0]; - FFV1_0( w_fp[15], w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[15], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2287,12 +2287,12 @@ namespace mg5amcCpu // *** DIAGRAM 120 OF 123 *** // Wavefunction(s) for diagram number 120 - VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[24] ); - VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[15] ); - VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[13] ); + VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] ); + VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] ); + VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] ); // Amplitude(s) for diagram number 120 - FFV1_0( w_fp[3], w_fp[9], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[9], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2300,7 +2300,7 @@ namespace mg5amcCpu jamp_sv[7] -= amp_sv[0]; jamp_sv[9] -= amp_sv[0]; jamp_sv[11] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[9], w_fp[15], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[9], w_fp[15], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2308,7 +2308,7 @@ namespace mg5amcCpu jamp_sv[8] += amp_sv[0]; jamp_sv[9] -= amp_sv[0]; jamp_sv[10] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[9], w_fp[13], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[9], w_fp[13], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2323,7 +2323,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 121 - FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2331,7 +2331,7 @@ namespace mg5amcCpu jamp_sv[5] -= amp_sv[0]; jamp_sv[16] -= amp_sv[0]; jamp_sv[22] += amp_sv[0]; - FFV1_0( w_fp[16], w_fp[2], w_fp[15], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[2], w_fp[15], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2339,7 +2339,7 @@ namespace mg5amcCpu jamp_sv[13] += amp_sv[0]; jamp_sv[16] -= amp_sv[0]; jamp_sv[19] += amp_sv[0]; - FFV1_0( w_fp[16], w_fp[2], w_fp[13], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[2], w_fp[13], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2354,7 +2354,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 122 - VVV1_0( w_fp[24], w_fp[1], w_fp[7], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[24], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2366,7 +2366,7 @@ namespace mg5amcCpu jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[15], w_fp[1], w_fp[7], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[15], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2378,7 +2378,7 @@ namespace mg5amcCpu jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[13], w_fp[1], w_fp[7], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[13], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2397,7 +2397,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 123 - VVV1_0( w_fp[0], w_fp[17], w_fp[7], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[17], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2409,7 +2409,7 @@ namespace mg5amcCpu jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[0], w_fp[19], w_fp[7], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[19], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2421,7 +2421,7 @@ namespace mg5amcCpu jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2870,12 +2870,12 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel 
denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - fptype allMEsLast = 0; + { /* clang-format on */ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid - allMEs[ievt] = 0; for( int ihel = 0; ihel < ncomb; ihel++ ) { + // NEW IMPLEMENTATION OF GETGOODHEL (#630): RESET THE RUNNING SUM OVER HELICITIES TO 0 BEFORE ADDING A NEW HELICITY + allMEs[ievt] = 0; // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -2884,12 +2884,11 @@ namespace mg5amcCpu #else calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); #endif - if( allMEs[ievt] != allMEsLast ) + if( allMEs[ievt] != 0 ) // NEW IMPLEMENTATION OF GETGOODHEL (#630): COMPARE EACH HELICITY CONTRIBUTION TO 0 { //if ( !isGoodHel[ihel] ) std::cout << "sigmaKin_getGoodHel ihel=" << ihel << " TRUE" << std::endl; isGoodHel[ihel] = true; } - allMEsLast = allMEs[ievt]; // running sum up to helicity ihel for event ievt } } #else @@ -2908,19 +2907,11 @@ namespace mg5amcCpu //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] // Allocate arrays at build time to contain at least 16 events (or at least neppV events if neppV>16, e.g. in future VPUs) constexpr int maxtry0 = std::max( 16, neppV ); // 16, but at least neppV (otherwise the npagV loop does not even start) - fptype allMEsLast[maxtry0] = { 0 }; // allocated at build time: maxtry0 must be a constexpr // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) - // nprocesses>1 was last observed for "mirror processes" in uux_ttx in the 270 branch (see issue #343 and PRs #360 and #396) + // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; - static_assert( nprocesses == 1, "Assume nprocesses == 1" ); - // process_id corresponds to the index of DSIG1 Fortran functions (must be 1 because cudacpp is unable to handle DSIG2) + static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } @@ -3114,23 +3110,26 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 - fptype targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + const unsigned int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + fptype
targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } } #endif @@ -3225,57 +3224,60 @@ namespace mg5amcCpu #endif } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 // Event-by-event random choice of color #402 - fptype_sv targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } + const unsigned int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + fptype_sv targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = fptype_sv{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + fptype_sv targetamp2[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp2[icolC] = fptype_sv{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + } +#endif + for( int ieppV = 0; ieppV < neppV; ++ieppV ) + { + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { #if defined MGONGPU_CPPSIMD - const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); #endif - if( okcol ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( okcol ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d 
rndcol=%f\n", ievt2, allrndcol[ievt2] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + const int ievt2 = ievt00 + ieppV + neppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + { + allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #endif + } } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h index d85e33bfee..deb1358992 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc index 1bad694d1c..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc @@ -29,7 +29,9 @@ #include #include +#include // for feenableexcept #include +#include // for signal and SIGFPE #include #include #include @@ -74,6 +76,23 @@ usage( char* argv0, int ret = 1 ) return ret; } +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + inline void FPEhandler( int sig ) + { +#ifdef __CUDACC__ + std::cerr << "Floating Point Exception (GPU)" << std::endl; +#else + std::cerr << "Floating Point Exception (CPU)" << std::endl; +#endif + exit( 0 ); + } +} + int main( int argc, char** argv ) { @@ -84,6 +103,18 @@ main( int argc, char** argv ) using namespace mg5amcCpu; #endif + // Enable FPEs (test #701 and #733 - except on MacOS where feenableexcept is not defined #730) +#ifndef __APPLE__ + const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" ); + const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" ); + if( enableFPE ) + { + std::cout << "WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions" << std::endl; + feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 + signal( SIGFPE, FPEhandler ); + } +#endif + // DEFAULTS FOR COMMAND LINE ARGUMENTS bool verbose = false; bool debug = false; @@ -103,12 +134,14 @@ main( int argc, char** argv ) CurandHost = 1, CurandDevice = 2 }; -#ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU -#elif not defined MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#ifdef MGONGPU_HAS_NO_CURAND + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random +#elif defined __CUDACC__ + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -146,18 +179,20 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef __CUDACC__ - rndgen = RandomNumberMode::CurandDevice; +#ifndef __CUDACC__ + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); +#elif defined MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + rndgen = RandomNumberMode::CurandDevice; #endif } else if( arg == "--curhst" ) { -#ifndef MGONGPU_HAS_NO_CURAND - rndgen = RandomNumberMode::CurandHost; -#else +#ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); +#else + rndgen = RandomNumberMode::CurandHost; #endif } else if( arg == "--common" ) @@ -278,10 +313,10 @@ main( int argc, char** argv ) const std::string procKey = "0a ProcInit"; timermap.start( procKey ); - // Create a process object + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? (for instance, in bridge mode this will be called twice here?) CPPProcess process( verbose ); - - // Read param_card and set parameters process.initProc( "../../Cards/param_card.dat" ); const fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) //const fptype energy = 91.2; // Ecms = 91.2 GeV (Z peak) @@ -389,30 +424,26 @@ main( int argc, char** argv ) { prnk.reset( new CommonRandomNumberKernel( hstRndmom ) ); } -#ifndef MGONGPU_HAS_NO_CURAND else if( rndgen == RandomNumberMode::CurandHost ) { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! 
CurandHost is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#else const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif } -#ifdef __CUDACC__ else { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __CUDACC__ const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); - } #else - else - { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) - } + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif -#else - else - { - throw std::logic_error( "This application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) } -#endif // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -747,7 +778,7 @@ main( int argc, char** argv ) wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -757,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index 59a2c906eb..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -4,10 +4,13 @@ # Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) -#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories +#=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts +#=== NB: use 'override' to ensure that the value can not be modified from the outside +override CUDACPP_MAKEFILE := $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) +###$(info CUDACPP_MAKEFILE='$(CUDACPP_MAKEFILE)') -CUDACPP_MAKEFILE = $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) -CUDACPP_SRC_MAKEFILE = cudacpp_src.mk +#=== NB: different names (e.g. 
cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories +override CUDACPP_SRC_MAKEFILE = cudacpp_src.mk #------------------------------------------------------------------------------- @@ -29,7 +32,17 @@ UNAME_P := $(shell uname -p) #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Include the common MG5aMC Makefile options + +# OM: this is crucial for MG5aMC flag consistency/documentation +# AV: temporarely comment this out because it breaks cudacpp builds +ifneq ($(wildcard ../../Source/make_opts),) +include ../../Source/make_opts +endif + +#------------------------------------------------------------------------------- + +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here @@ -101,68 +114,85 @@ endif # Note: AR, CXX and FC are implicitly defined if not set externally # See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html -#------------------------------------------------------------------------------- - -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +CXXFLAGS += -mmacosx-version-min=11.3 +endif - # If CUDA_HOME is not set, try to set it from the location of nvcc - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif +#------------------------------------------------------------------------------- - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
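The MADGRAPH_CUDA_ARCHITECTURE knob documented in this comment block wants the compute capability of the target GPU. When that is not known offhand, it can be queried with a few lines of CUDA runtime API; the following standalone helper is an illustration added for this note, not part of the patch:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Print the compute capability of each visible device, e.g. "70" for a
    // V100 or "80" for an A100: this is the value to export (comma-separated
    // for several targets) as MADGRAPH_CUDA_ARCHITECTURE before building.
    int main()
    {
      int ndev = 0;
      if( cudaGetDeviceCount( &ndev ) != cudaSuccess || ndev == 0 )
      {
        printf( "No CUDA device found\n" );
        return 1;
      }
      for( int idev = 0; idev < ndev; idev++ )
      {
        cudaDeviceProp prop;
        cudaGetDeviceProperties( &prop, idev );
        printf( "Device %d: %s, compute capability %d%d\n", idev, prop.name, prop.major, prop.minor );
      }
      return 0;
    }

Exporting e.g. MADGRAPH_CUDA_ARCHITECTURE=70,80 then expands, via the $(foreach ...) in the CUARCHFLAGS definition just below, into one -gencode arch=compute_XX,code=... pair per requested architecture.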
- MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - CUDATESTFLAGS = -lcuda - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
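Stripped of the REQUIRE_CUDA/REQUIRE_HIP guards and the multi-word CXX check, the fallback logic that this FIXME describes reduces to the following skeleton (a simplified sketch for orientation, not the real cudacpp.mk):

    # Derive the toolkit homes from nvcc/hipcc in the PATH if they are unset
    CUDA_HOME ?= $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
    HIP_HOME ?= $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null))
    ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
      # Option 1: CUDA exists (it takes precedence over HIP)
      GPUCC = $(CUDA_HOME)/bin/nvcc
    else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
      # Option 2: no CUDA, HIP exists
      GPUCC = $(HIP_HOME)/bin/hipcc
    else
      # Option 3: neither exists, build C++ only
      GPUCC =
    endif
    all: ; @echo "GPUCC='$(GPUCC)'"

On a machine with both toolkits installed, running "make CUDA_HOME=/nonexistent" against this skeleton falls through to the HIP branch, which is precisely the workaround described above for disabling CUDA builds.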
+ +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + override HIP_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the path to nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + ifeq ($(RNDGEN),hasNoCurand) + CURANDLIBFLAGS= else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif + CUOPTFLAGS = -lineinfo + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -173,71 +203,55 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) GPUFLAGS += -allow-unsupported-compiler endif -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler +else ifneq ($(origin REQUIRE_CUDA),undefined) - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning HIP builds are not supported for multi-word CXX "$(CXX)") - override HIP_HOME=disabled - endif + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif +#--- Option 2: CUDA does not exist, HIP exists -> use HIP - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + +else + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) export GPUCC export GPUFLAGS #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -254,7 +268,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -270,7 +284,7 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) GPUFLAGS+= -Xcompiler -mno-float128 endif @@ -285,12 +299,14 @@ override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) -else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) -override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) # AV for Mac (Apple clang compiler) +else ifeq ($(UNAME_S),Darwin) # OM for Mac (any compiler) +override OMPFLAGS = # AV disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = -fopenmp # OM reenable OpenMP MT on Apple clang? 
(AV Oct 2023: this still fails in the CI) else -override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT (default before #575) +override OMPFLAGS = -fopenmp # enable OpenMP MT by default on all other platforms +###override OMPFLAGS = # disable OpenMP MT on all other platforms (default before #575) endif # Set the default AVX (vectorization) choice @@ -356,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -573,8 +589,9 @@ $(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) # Apply special build flags only to check_sa and CurandRandomNumberKernel (curand headers, #679) $(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gcheck_sa.o: CUFLAGS += $(CXXFLAGSCURAND) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gCurandRandomNumberKernel.o: CUFLAGS += $(CXXFLAGSCURAND) ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif @@ -772,12 +789,18 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif +# Use target gtestlibs to build only googletest +ifneq ($(GTESTLIBS),) +gtestlibs: $(GTESTLIBS) +endif + # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 $(GTESTLIBS): ifneq ($(shell which flock 2>/dev/null),) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi flock $(BUILDDIR)/.make_test.lock $(MAKE) -C $(TESTDIR) else - $(MAKE) -C $(TESTDIR) + if [ -d $(TESTDIR) ]; then $(MAKE) -C $(TESTDIR); fi endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc index 2b956730d4..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc @@ -49,11 +49,7 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL GpuRuntime::setUp(); #endif - // Create a process object, read parm card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
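The second FIXME above asks what happens if fbridgecreate is called concurrently from several Fortran threads while each call constructs and initialises its own CPPProcess. A conventional C++ answer, sketched here under that assumption (this is not code from the patch), is to funnel the once-only parameter reading through a function-local static, whose initialisation C++11 guarantees to be thread-safe:

    #include <iostream>
    #include <string>

    // Stand-in for CPPProcess: reads the physics parameters once.
    struct ToyProcess
    {
      void initProc( const std::string& card ) { std::cout << "reading " << card << std::endl; }
    };

    // Initialised exactly once, even if several threads call it concurrently;
    // every caller then shares the same instance.
    ToyProcess& sharedProcess()
    {
      static ToyProcess process = []() {
        ToyProcess p;
        p.initProc( "../../Cards/param_card.dat" ); // runs on the first call only
        return p;
      }();
      return process;
    }

    int main()
    {
      sharedProcess(); // first call triggers initProc
      sharedProcess(); // subsequent calls reuse the shared instance
    }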
- CPPProcess process( /*verbose=*/false ); - process.initProc( "../../Cards/param_card.dat" ); + // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran *ppbridge = new Bridge( *pnevtF, *pnparF, *pnp4F ); } diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc index 0ed26180ca..de327f2321 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc @@ -71,6 +71,8 @@ struct CPUTest : public CUDA_CPU_TestBase , hstSelCol( nevt ) , hstIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? process.initProc( "../../Cards/param_card.dat" ); } @@ -183,6 +185,8 @@ struct CUDATest : public CUDA_CPU_TestBase , devSelCol( nevt ) , devIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? process.initProc( "../../Cards/param_card.dat" ); } diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc index 016bc0f472..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc @@ -59,7 +59,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) using namespace mg5amcCpu; #endif #ifndef __APPLE__ // test #701 (except on MacOS where feenableexcept is not defined #730) - const bool enableFPE = !getenv( "CUDACPP_RUNTIME_DISABLEFPE" ); + const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" ); + const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" ); if( enableFPE ) { feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 diff --git a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h index 459f21394d..8b4ad719be 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -863,6 +863,7 @@ namespace mg5amcCpu const fptype allV2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) ALWAYS_INLINE; //-------------------------------------------------------------------------- @@ -873,6 +874,7 @@ namespace mg5amcCpu VVV1P0_1( const fptype allV2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) ALWAYS_INLINE; @@ -886,6 +888,7 @@ namespace mg5amcCpu const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) ALWAYS_INLINE; //-------------------------------------------------------------------------- @@ -896,6 +899,7 @@ namespace mg5amcCpu FFV1_1( const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allF1[] ) ALWAYS_INLINE; @@ -908,6 +912,7 @@ namespace mg5amcCpu FFV1_2( const fptype allF1[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M2, const fptype W2, fptype allF2[] ) ALWAYS_INLINE; @@ -920,6 +925,7 @@ namespace mg5amcCpu FFV1P0_3( const fptype allF1[], const fptype allF2[], const fptype allCOUP[], + const double Ccoeff, const fptype M3, const fptype W3, fptype allV3[] ) ALWAYS_INLINE; @@ -934,6 +940,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) ALWAYS_INLINE; //-------------------------------------------------------------------------- @@ -945,6 +952,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) ALWAYS_INLINE; @@ -959,6 +967,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) ALWAYS_INLINE; //-------------------------------------------------------------------------- @@ -970,6 +979,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) ALWAYS_INLINE; @@ -984,6 +994,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) ALWAYS_INLINE; //-------------------------------------------------------------------------- @@ -995,6 +1006,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) ALWAYS_INLINE; @@ -1008,6 +1020,7 @@ namespace mg5amcCpu const fptype allV2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) { mgDebug( 0, __FUNCTION__ ); @@ -1042,6 +1055,7 @@ namespace mg5amcCpu VVV1P0_1( const fptype allV2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) @@ -1080,6 +1094,7 @@ namespace mg5amcCpu const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) { mgDebug( 0, __FUNCTION__ ); @@ -1103,6 +1118,7 @@ namespace mg5amcCpu FFV1_1( const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double 
Ccoeff, const fptype M1, const fptype W1, fptype allF1[] ) @@ -1134,6 +1150,7 @@ namespace mg5amcCpu FFV1_2( const fptype allF1[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M2, const fptype W2, fptype allF2[] ) @@ -1165,6 +1182,7 @@ namespace mg5amcCpu FFV1P0_3( const fptype allF1[], const fptype allF2[], const fptype allCOUP[], + const double Ccoeff, const fptype M3, const fptype W3, fptype allV3[] ) @@ -1197,6 +1215,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) { mgDebug( 0, __FUNCTION__ ); @@ -1225,6 +1244,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) @@ -1260,6 +1280,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) { mgDebug( 0, __FUNCTION__ ); @@ -1288,6 +1309,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) @@ -1323,6 +1345,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) { mgDebug( 0, __FUNCTION__ ); @@ -1351,6 +1374,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc index 05eba20217..067445b198 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h index 41830f87ca..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk index f2804ffb85..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk @@ -36,6 +36,13 @@ endif # See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html ###RANLIB = ranlib +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +LDFLAGS = +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +CXXFLAGS += -mmacosx-version-min=11.3 +LDFLAGS += -mmacosx-version-min=11.3 +endif + #------------------------------------------------------------------------------- #=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) @@ -266,11 +273,11 @@ endif ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(CXX) -shared -o $@ $(cxx_objects) + $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h index 205accb85b..da4ba36ad8 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h @@ -15,7 +15,6 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip -#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif @@ -24,16 +23,19 @@ // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For CUDA, by default, it is supported -// For HIP, by default, it is not supported -// For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -#ifdef __CUDACC__ -#undef MGONGPU_HAS_NO_CURAND -#elif defined __HIPCC__ +// For HIP, by default, do not use curand (common random numbers will be used instead) +// For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND +// (there exist CUDA installations, e.g. 
using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ #define MGONGPU_HAS_NO_CURAND 1 #else +//#ifdef __CUDACC__ +//#undef MGONGPU_HAS_NO_CURAND // default +////#define MGONGPU_HAS_NO_CURAND 1 +//#else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 +//#endif #endif // Choose floating point precision (for everything but color algebra #537) diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h index 46d9f02733..5532e22fa1 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h @@ -159,6 +159,12 @@ namespace mg5amcCpu return cxsmpl( a, 0 ) * b; } + inline __host__ __device__ constexpr cxsmpl + operator*( const cxsmpl& a, const double& b ) + { + return a * cxsmpl( b, 0 ); + } + template inline __host__ __device__ constexpr cxsmpl operator/( const cxsmpl& a, const cxsmpl& b ) diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 71cf69851d..1163910eb2 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg.mg +import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0058345794677734375  +DEBUG: model prefixing takes 0.005507230758666992  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,89 +155,60 @@ INFO: Please specify coupling orders to bypass this step. 
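The mgOnGpuCxtypes.h hunk above adds a mixed complex-times-double product for the cxsmpl type (text extraction has eaten the template angle brackets here; in the header these are template functions over the floating-point type FP). A minimal standalone sketch of the idea, with simplified names and without the __host__ __device__ qualifiers: the double scalar is promoted to a complex with zero imaginary part and the existing complex product is reused, so a double coefficient can rescale a single-precision complex coupling without an implicit narrowing conversion.

```cpp
// Minimal sketch (hypothetical, simplified from mgOnGpuCxtypes.h).
#include <iostream>

template<typename FP>
struct cxsmpl
{
  FP r, i; // real and imaginary parts
  constexpr cxsmpl( FP r_, FP i_ ) : r( r_ ), i( i_ ) {}
};

template<typename FP>
inline constexpr cxsmpl<FP> operator*( const cxsmpl<FP>& a, const cxsmpl<FP>& b )
{
  return cxsmpl<FP>( a.r * b.r - a.i * b.i, a.r * b.i + a.i * b.r );
}

// The overload added in this patch (sketch): promote the double to a complex
// with zero imaginary part, then reuse the complex*complex product above.
template<typename FP>
inline constexpr cxsmpl<FP> operator*( const cxsmpl<FP>& a, const double& b )
{
  return a * cxsmpl<FP>( (FP)b, (FP)0 );
}

int main()
{
  const cxsmpl<float> coup( 1.5f, -0.5f ); // a single-precision complex coupling
  const double Ccoeff = 2.0;               // a double coefficient (cf. the new Ccoeff argument)
  const cxsmpl<float> scaled = coup * Ccoeff;
  std::cout << scaled.r << " " << scaled.i << std::endl; // prints: 3 -1
  return 0;
}
```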
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.894 s +1 processes with 1240 diagrams generated in 1.909 s Total: 1 processes with 1240 diagrams -output madevent CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp -Load PLUGIN.CUDACPP_SA_OUTPUT -Addition matrix-element will be done with PLUGIN: CUDACPP_SA_OUTPUT -Output will be done with PLUGIN: CUDACPP_SA_OUTPUT +output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp +Load PLUGIN.CUDACPP_OUTPUT +Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT +Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]  -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/Cards  -WARNING: File exists /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg +WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  +WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1592 term in 36s. Introduce 2768 contraction -DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  -DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6179]  +INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
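The HelAmps_sm.h hunks earlier in this patch thread a new const double Ccoeff parameter through every generated helicity-amplitude helper (VVV1_0, VVV1P0_1, FFV1_0, FFV1_1, FFV1_2, FFV1P0_3, VVVV1_0, ...), placed between the couplings array and the mass/output arguments. A toy call-site sketch of the change follows; the _sketch name and the function body are invented stand-ins, and only the position of the new parameter follows the real signatures.

```cpp
#include <cstdio>

typedef double fptype; // assume the double-precision build of the plugin

// Hypothetical stand-in for one of the generated helpers: after this patch
// the real VVV1_0 in HelAmps_sm.h takes an extra 'const double Ccoeff'
// argument between the couplings and the output amplitudes.
void VVV1_0_sketch( const fptype* allCOUP, const double Ccoeff, fptype* allvertexes )
{
  // toy body, NOT the real Feynman rule: it only shows where Ccoeff enters
  allvertexes[0] = Ccoeff * allCOUP[0];
}

int main()
{
  fptype COUP[1] = { 0.1 };
  fptype amp[1] = { 0. };
  VVV1_0_sketch( COUP, /*Ccoeff=*/1.0, amp ); // generated code passes the vertex coefficient here
  std::printf( "%f\n", amp[0] ); // prints: 0.100000
  return 0;
}
```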
-DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  -DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  -FileWriter for ././CPPProcess.h -DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]  -FileWriter for ././CPPProcess.cc -DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]  -DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]  -DEBUG: self.include_multi_channel =  [1, 2, 0, 3, 4, 0, 5, 6, 0, 0, 0, 0, 0, 7, 8, 9, 0, 10, 11, 12, 0, 13, 14, 15, 0, 16, 17, 18, 19, 20, 21, 0, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 0, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 0, 67, 68, 69, 70, 71, 72, 73, 74, 75, 0, 76, 77, 78, 79, 80, 81, 82, 83, 84, 0, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 0, 0, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 0, 121, 122, 0, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 0, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 0, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 0, 197, 198, 199, 200, 201, 202, 0, 203, 204, 205, 206, 207, 208, 0, 209, 210, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 0, 226, 227, 0, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 0, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 0, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 0, 302, 303, 304, 305, 306, 307, 0, 308, 309, 310, 311, 312, 313, 0, 314, 315, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 316, 317, 318, 319, 320, 321, 0, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 0, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 0, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 0, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 0, 378, 379, 0, 380, 381, 0, 0, 0, 0, 0, 382, 383, 384, 385, 386, 387, 388, 389, 390, 0, 391, 392, 393, 394, 395, 396, 397, 398, 399, 0, 400, 401, 402, 403, 404, 405, 406, 407, 408, 0, 409, 410, 411, 412, 413, 414, 0, 415, 416, 417, 418, 419, 420, 0, 0, 0, 421, 422, 423, 424, 425, 426, 0, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 0, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 0, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 0, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 0, 483, 484, 0, 485, 486, 0, 0, 0, 0, 0, 487, 488, 489, 490, 491, 492, 493, 494, 495, 0, 496, 497, 498, 499, 500, 501, 502, 503, 504, 0, 505, 506, 507, 508, 509, 510, 511, 512, 513, 0, 514, 515, 516, 517, 518, 519, 0, 520, 521, 522, 523, 524, 525, 0, 0, 0, 526, 527, 528, 529, 530, 531, 0, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 0, 547, 548, 549, 550, 551, 552, 
553, 554, 555, 556, 557, 558, 559, 560, 561, 0, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 0, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 0, 588, 589, 0, 590, 591, 0, 0, 0, 0, 0, 592, 593, 594, 595, 596, 597, 598, 599, 600, 0, 601, 602, 603, 604, 605, 606, 607, 608, 609, 0, 610, 611, 612, 613, 614, 615, 616, 617, 618, 0, 619, 620, 621, 622, 623, 624, 0, 625, 626, 627, 628, 629, 630, 0, 0, 0, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 0, 664, 665, 666, 667, 668, 669, 0, 670, 671, 672, 673, 674, 675, 0, 0, 0, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 0, 709, 710, 711, 712, 713, 714, 0, 715, 716, 717, 718, 719, 720, 0, 0, 0, 721, 722, 0, 723, 724, 0, 725, 726, 0, 0, 0, 0, 0, 727, 728, 729, 730, 731, 732, 733, 734, 735, 0, 736, 737, 738, 739, 740, 741, 742, 743, 744, 0, 745, 746, 747, 748, 749, 750, 751, 752, 753, 0, 754, 755, 756, 757, 758, 759, 0, 760, 761, 762, 763, 764, 765, 766, 767, 0, 768, 769, 0, 770, 771, 0, 0, 0, 0, 0, 772, 773, 774, 775, 776, 777, 778, 779, 780, 0, 781, 782, 783, 784, 785, 786, 787, 788, 789, 0, 790, 791, 792, 793, 794, 795, 796, 797, 798, 0, 799, 800, 801, 802, 803, 804, 0, 805, 806, 807, 808, 809, 810, 811, 812, 0, 813, 814, 0, 815, 816, 0, 0, 0, 0, 0, 817, 818, 819, 820, 821, 822, 823, 824, 825, 0, 826, 827, 828, 829, 830, 831, 832, 833, 834, 0, 835, 836, 837, 838, 839, 840, 841, 842, 843, 0, 844, 845, 846, 847, 848, 849, 0, 850, 851, 852, 853, 854, 855, 856, 857, 0, 858, 859, 0, 860, 861, 0, 0, 0, 0, 862, 863, 0, 864, 865, 0, 866, 867, 0, 0, 0, 0, 868, 869, 0, 870, 871, 0, 872, 873, 0, 0, 0, 0, 0, 0, 0, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 0, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 0, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 0, 928, 929, 930, 931, 932, 933, 0, 934, 935, 936, 937, 938, 939, 0, 940, 941, 942, 943, 944, 945, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [model_handling.py at line 1144]  -DEBUG: self.support_multichannel =  True [model_handling.py at line 1145]  -DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1162]  -DEBUG: self.support_multichannel, self.include_multi_channel =  True [1, 2, 0, 3, 4, 0, 5, 6, 0, 0, 0, 0, 0, 7, 8, 9, 0, 10, 11, 12, 0, 13, 14, 15, 0, 16, 17, 18, 19, 20, 21, 0, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 0, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 0, 67, 68, 69, 70, 71, 72, 73, 74, 75, 0, 76, 77, 78, 79, 80, 81, 82, 83, 84, 0, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 0, 0, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 0, 121, 122, 0, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 0, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 0, 153, 
154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 0, 197, 198, 199, 200, 201, 202, 0, 203, 204, 205, 206, 207, 208, 0, 209, 210, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 0, 226, 227, 0, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 0, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 0, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 0, 302, 303, 304, 305, 306, 307, 0, 308, 309, 310, 311, 312, 313, 0, 314, 315, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 316, 317, 318, 319, 320, 321, 0, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 0, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 0, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 0, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 0, 378, 379, 0, 380, 381, 0, 0, 0, 0, 0, 382, 383, 384, 385, 386, 387, 388, 389, 390, 0, 391, 392, 393, 394, 395, 396, 397, 398, 399, 0, 400, 401, 402, 403, 404, 405, 406, 407, 408, 0, 409, 410, 411, 412, 413, 414, 0, 415, 416, 417, 418, 419, 420, 0, 0, 0, 421, 422, 423, 424, 425, 426, 0, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 0, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 0, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 0, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 0, 483, 484, 0, 485, 486, 0, 0, 0, 0, 0, 487, 488, 489, 490, 491, 492, 493, 494, 495, 0, 496, 497, 498, 499, 500, 501, 502, 503, 504, 0, 505, 506, 507, 508, 509, 510, 511, 512, 513, 0, 514, 515, 516, 517, 518, 519, 0, 520, 521, 522, 523, 524, 525, 0, 0, 0, 526, 527, 528, 529, 530, 531, 0, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 0, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 0, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 0, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 0, 588, 589, 0, 590, 591, 0, 0, 0, 0, 0, 592, 593, 594, 595, 596, 597, 598, 599, 600, 0, 601, 602, 603, 604, 605, 606, 607, 608, 609, 0, 610, 611, 612, 613, 614, 615, 616, 617, 618, 0, 619, 620, 621, 622, 623, 624, 0, 625, 626, 627, 628, 629, 630, 0, 0, 0, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 0, 664, 665, 666, 667, 668, 669, 0, 670, 671, 672, 673, 674, 675, 0, 0, 0, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 0, 709, 710, 711, 712, 713, 714, 0, 715, 716, 717, 718, 719, 720, 0, 0, 0, 721, 722, 0, 723, 724, 0, 725, 726, 0, 0, 0, 0, 0, 727, 728, 729, 730, 731, 732, 733, 734, 735, 0, 736, 737, 738, 739, 740, 741, 742, 743, 744, 0, 745, 746, 747, 748, 749, 750, 751, 752, 753, 0, 754, 755, 756, 757, 758, 759, 0, 760, 761, 762, 763, 764, 765, 766, 767, 0, 768, 769, 0, 770, 771, 0, 0, 0, 0, 0, 772, 773, 774, 775, 776, 777, 778, 779, 780, 0, 781, 782, 783, 784, 785, 786, 787, 788, 789, 0, 790, 791, 792, 793, 794, 
795, 796, 797, 798, 0, 799, 800, 801, 802, 803, 804, 0, 805, 806, 807, 808, 809, 810, 811, 812, 0, 813, 814, 0, 815, 816, 0, 0, 0, 0, 0, 817, 818, 819, 820, 821, 822, 823, 824, 825, 0, 826, 827, 828, 829, 830, 831, 832, 833, 834, 0, 835, 836, 837, 838, 839, 840, 841, 842, 843, 0, 844, 845, 846, 847, 848, 849, 0, 850, 851, 852, 853, 854, 855, 856, 857, 0, 858, 859, 0, 860, 861, 0, 0, 0, 0, 862, 863, 0, 864, 865, 0, 866, 867, 0, 0, 0, 0, 868, 869, 0, 870, 871, 0, 872, 873, 0, 0, 0, 0, 0, 0, 0, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 0, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 0, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 0, 928, 929, 930, 931, 932, 933, 0, 934, 935, 936, 937, 938, 939, 0, 940, 941, 942, 943, 944, 945, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [model_handling.py at line 1163]  -DEBUG: multi_channel =  {1: [0], 2: [1], 3: [3], 4: [4], 5: [6], 6: [7], 7: [13], 8: [14], 9: [15], 10: [17], 11: [18], 12: [19], 13: [21], 14: [22], 15: [23], 16: [25], 17: [26], 18: [27], 19: [28], 20: [29], 21: [30], 22: [32], 23: [33], 24: [34], 25: [35], 26: [36], 27: [37], 28: [38], 29: [39], 30: [40], 31: [41], 32: [42], 33: [43], 34: [44], 35: [45], 36: [46], 37: [48], 38: [49], 39: [50], 40: [51], 41: [52], 42: [53], 43: [54], 44: [55], 45: [56], 46: [57], 47: [58], 48: [59], 49: [60], 50: [61], 51: [62], 52: [64], 53: [65], 54: [66], 55: [67], 56: [68], 57: [69], 58: [70], 59: [71], 60: [72], 61: [73], 62: [74], 63: [75], 64: [76], 65: [77], 66: [78], 67: [80], 68: [81], 69: [82], 70: [83], 71: [84], 72: [85], 73: [86], 74: [87], 75: [88], 76: [90], 77: [91], 78: [92], 79: [93], 80: [94], 81: [95], 82: [96], 83: [97], 84: [98], 85: [100], 86: [101], 87: [102], 88: [103], 89: [104], 90: [105], 91: [106], 92: [107], 93: [108], 94: [109], 95: [110], 96: [111], 97: [112], 98: [113], 99: [114], 100: [115], 101: [116], 102: [117], 103: [118], 104: [119], 105: [120], 106: [123], 107: [124], 108: [125], 109: [126], 110: [127], 111: [128], 112: [129], 113: [130], 114: [131], 115: [132], 116: [133], 117: [134], 118: [135], 119: [136], 120: [137], 121: [139], 122: [140], 123: [142], 124: [143], 125: [144], 126: [145], 127: [146], 128: [147], 129: [148], 130: [149], 131: [150], 132: [151], 133: [152], 134: [153], 135: [154], 136: [155], 137: [156], 138: [158], 139: [159], 140: [160], 141: [161], 142: [162], 143: [163], 144: [164], 145: [165], 146: [166], 147: [167], 148: [168], 149: [169], 150: [170], 151: [171], 152: [172], 153: [174], 154: [175], 155: [176], 156: [177], 157: [178], 158: [179], 159: [180], 160: [181], 161: [182], 162: [183], 163: [184], 164: [185], 165: [186], 166: [187], 167: [188], 168: [189], 169: [190], 170: [191], 171: [192], 172: [193], 173: [194], 174: [195], 175: [196], 176: [197], 177: [198], 178: [199], 179: [200], 180: [201], 181: [202], 182: [203], 183: [204], 184: [205], 185: [206], 186: [207], 187: [208], 188: [209], 189: [210], 190: [211], 191: [212], 192: [213], 193: [214], 194: [215], 195: [216], 196: [217], 197: [219], 198: [220], 199: [221], 200: [222], 201: [223], 202: [224], 203: [226], 204: [227], 205: [228], 206: 
[229], 207: [230], 208: [231], 209: [233], 210: [234], 211: [246], 212: [247], 213: [248], 214: [249], 215: [250], 216: [251], 217: [252], 218: [253], 219: [254], 220: [255], 221: [256], 222: [257], 223: [258], 224: [259], 225: [260], 226: [262], 227: [263], 228: [265], 229: [266], 230: [267], 231: [268], 232: [269], 233: [270], 234: [271], 235: [272], 236: [273], 237: [274], 238: [275], 239: [276], 240: [277], 241: [278], 242: [279], 243: [281], 244: [282], 245: [283], 246: [284], 247: [285], 248: [286], 249: [287], 250: [288], 251: [289], 252: [290], 253: [291], 254: [292], 255: [293], 256: [294], 257: [295], 258: [297], 259: [298], 260: [299], 261: [300], 262: [301], 263: [302], 264: [303], 265: [304], 266: [305], 267: [306], 268: [307], 269: [308], 270: [309], 271: [310], 272: [311], 273: [312], 274: [313], 275: [314], 276: [315], 277: [316], 278: [317], 279: [318], 280: [319], 281: [320], 282: [321], 283: [322], 284: [323], 285: [324], 286: [325], 287: [326], 288: [327], 289: [328], 290: [329], 291: [330], 292: [331], 293: [332], 294: [333], 295: [334], 296: [335], 297: [336], 298: [337], 299: [338], 300: [339], 301: [340], 302: [342], 303: [343], 304: [344], 305: [345], 306: [346], 307: [347], 308: [349], 309: [350], 310: [351], 311: [352], 312: [353], 313: [354], 314: [356], 315: [357], 316: [369], 317: [370], 318: [371], 319: [372], 320: [373], 321: [374], 322: [376], 323: [377], 324: [378], 325: [379], 326: [380], 327: [381], 328: [382], 329: [383], 330: [384], 331: [385], 332: [386], 333: [387], 334: [388], 335: [389], 336: [390], 337: [392], 338: [393], 339: [394], 340: [395], 341: [396], 342: [397], 343: [398], 344: [399], 345: [400], 346: [401], 347: [402], 348: [403], 349: [404], 350: [405], 351: [406], 352: [408], 353: [409], 354: [410], 355: [411], 356: [412], 357: [413], 358: [414], 359: [415], 360: [416], 361: [417], 362: [418], 363: [419], 364: [420], 365: [421], 366: [422], 367: [424], 368: [425], 369: [426], 370: [427], 371: [428], 372: [429], 373: [430], 374: [431], 375: [432], 376: [433], 377: [434], 378: [436], 379: [437], 380: [439], 381: [440], 382: [446], 383: [447], 384: [448], 385: [449], 386: [450], 387: [451], 388: [452], 389: [453], 390: [454], 391: [456], 392: [457], 393: [458], 394: [459], 395: [460], 396: [461], 397: [462], 398: [463], 399: [464], 400: [466], 401: [467], 402: [468], 403: [469], 404: [470], 405: [471], 406: [472], 407: [473], 408: [474], 409: [476], 410: [477], 411: [478], 412: [479], 413: [480], 414: [481], 415: [483], 416: [484], 417: [485], 418: [486], 419: [487], 420: [488], 421: [492], 422: [493], 423: [494], 424: [495], 425: [496], 426: [497], 427: [499], 428: [500], 429: [501], 430: [502], 431: [503], 432: [504], 433: [505], 434: [506], 435: [507], 436: [508], 437: [509], 438: [510], 439: [511], 440: [512], 441: [513], 442: [515], 443: [516], 444: [517], 445: [518], 446: [519], 447: [520], 448: [521], 449: [522], 450: [523], 451: [524], 452: [525], 453: [526], 454: [527], 455: [528], 456: [529], 457: [531], 458: [532], 459: [533], 460: [534], 461: [535], 462: [536], 463: [537], 464: [538], 465: [539], 466: [540], 467: [541], 468: [542], 469: [543], 470: [544], 471: [545], 472: [547], 473: [548], 474: [549], 475: [550], 476: [551], 477: [552], 478: [553], 479: [554], 480: [555], 481: [556], 482: [557], 483: [559], 484: [560], 485: [562], 486: [563], 487: [569], 488: [570], 489: [571], 490: [572], 491: [573], 492: [574], 493: [575], 494: [576], 495: [577], 496: [579], 497: [580], 498: [581], 499: [582], 500: [583], 501: [584], 502: 
[585], 503: [586], 504: [587], 505: [589], 506: [590], 507: [591], 508: [592], 509: [593], 510: [594], 511: [595], 512: [596], 513: [597], 514: [599], 515: [600], 516: [601], 517: [602], 518: [603], 519: [604], 520: [606], 521: [607], 522: [608], 523: [609], 524: [610], 525: [611], 526: [615], 527: [616], 528: [617], 529: [618], 530: [619], 531: [620], 532: [622], 533: [623], 534: [624], 535: [625], 536: [626], 537: [627], 538: [628], 539: [629], 540: [630], 541: [631], 542: [632], 543: [633], 544: [634], 545: [635], 546: [636], 547: [638], 548: [639], 549: [640], 550: [641], 551: [642], 552: [643], 553: [644], 554: [645], 555: [646], 556: [647], 557: [648], 558: [649], 559: [650], 560: [651], 561: [652], 562: [654], 563: [655], 564: [656], 565: [657], 566: [658], 567: [659], 568: [660], 569: [661], 570: [662], 571: [663], 572: [664], 573: [665], 574: [666], 575: [667], 576: [668], 577: [670], 578: [671], 579: [672], 580: [673], 581: [674], 582: [675], 583: [676], 584: [677], 585: [678], 586: [679], 587: [680], 588: [682], 589: [683], 590: [685], 591: [686], 592: [692], 593: [693], 594: [694], 595: [695], 596: [696], 597: [697], 598: [698], 599: [699], 600: [700], 601: [702], 602: [703], 603: [704], 604: [705], 605: [706], 606: [707], 607: [708], 608: [709], 609: [710], 610: [712], 611: [713], 612: [714], 613: [715], 614: [716], 615: [717], 616: [718], 617: [719], 618: [720], 619: [722], 620: [723], 621: [724], 622: [725], 623: [726], 624: [727], 625: [729], 626: [730], 627: [731], 628: [732], 629: [733], 630: [734], 631: [738], 632: [739], 633: [740], 634: [741], 635: [742], 636: [743], 637: [744], 638: [745], 639: [746], 640: [747], 641: [748], 642: [749], 643: [750], 644: [751], 645: [752], 646: [753], 647: [754], 648: [755], 649: [756], 650: [757], 651: [758], 652: [759], 653: [760], 654: [761], 655: [762], 656: [763], 657: [764], 658: [765], 659: [766], 660: [767], 661: [768], 662: [769], 663: [770], 664: [772], 665: [773], 666: [774], 667: [775], 668: [776], 669: [777], 670: [779], 671: [780], 672: [781], 673: [782], 674: [783], 675: [784], 676: [788], 677: [789], 678: [790], 679: [791], 680: [792], 681: [793], 682: [794], 683: [795], 684: [796], 685: [797], 686: [798], 687: [799], 688: [800], 689: [801], 690: [802], 691: [803], 692: [804], 693: [805], 694: [806], 695: [807], 696: [808], 697: [809], 698: [810], 699: [811], 700: [812], 701: [813], 702: [814], 703: [815], 704: [816], 705: [817], 706: [818], 707: [819], 708: [820], 709: [822], 710: [823], 711: [824], 712: [825], 713: [826], 714: [827], 715: [829], 716: [830], 717: [831], 718: [832], 719: [833], 720: [834], 721: [838], 722: [839], 723: [841], 724: [842], 725: [844], 726: [845], 727: [851], 728: [852], 729: [853], 730: [854], 731: [855], 732: [856], 733: [857], 734: [858], 735: [859], 736: [861], 737: [862], 738: [863], 739: [864], 740: [865], 741: [866], 742: [867], 743: [868], 744: [869], 745: [871], 746: [872], 747: [873], 748: [874], 749: [875], 750: [876], 751: [877], 752: [878], 753: [879], 754: [881], 755: [882], 756: [883], 757: [884], 758: [885], 759: [886], 760: [888], 761: [889], 762: [890], 763: [891], 764: [892], 765: [893], 766: [894], 767: [895], 768: [897], 769: [898], 770: [900], 771: [901], 772: [907], 773: [908], 774: [909], 775: [910], 776: [911], 777: [912], 778: [913], 779: [914], 780: [915], 781: [917], 782: [918], 783: [919], 784: [920], 785: [921], 786: [922], 787: [923], 788: [924], 789: [925], 790: [927], 791: [928], 792: [929], 793: [930], 794: [931], 795: [932], 796: [933], 797: [934], 798: 
[935], 799: [937], 800: [938], 801: [939], 802: [940], 803: [941], 804: [942], 805: [944], 806: [945], 807: [946], 808: [947], 809: [948], 810: [949], 811: [950], 812: [951], 813: [953], 814: [954], 815: [956], 816: [957], 817: [963], 818: [964], 819: [965], 820: [966], 821: [967], 822: [968], 823: [969], 824: [970], 825: [971], 826: [973], 827: [974], 828: [975], 829: [976], 830: [977], 831: [978], 832: [979], 833: [980], 834: [981], 835: [983], 836: [984], 837: [985], 838: [986], 839: [987], 840: [988], 841: [989], 842: [990], 843: [991], 844: [993], 845: [994], 846: [995], 847: [996], 848: [997], 849: [998], 850: [1000], 851: [1001], 852: [1002], 853: [1003], 854: [1004], 855: [1005], 856: [1006], 857: [1007], 858: [1009], 859: [1010], 860: [1012], 861: [1013], 862: [1018], 863: [1019], 864: [1021], 865: [1022], 866: [1024], 867: [1025], 868: [1030], 869: [1031], 870: [1033], 871: [1034], 872: [1036], 873: [1037], 874: [1045], 875: [1046], 876: [1047], 877: [1048], 878: [1049], 879: [1050], 880: [1051], 881: [1052], 882: [1053], 883: [1054], 884: [1055], 885: [1056], 886: [1057], 887: [1058], 888: [1059], 889: [1060], 890: [1061], 891: [1062], 892: [1064], 893: [1065], 894: [1066], 895: [1067], 896: [1068], 897: [1069], 898: [1070], 899: [1071], 900: [1072], 901: [1073], 902: [1074], 903: [1075], 904: [1076], 905: [1077], 906: [1078], 907: [1079], 908: [1080], 909: [1081], 910: [1083], 911: [1084], 912: [1085], 913: [1086], 914: [1087], 915: [1088], 916: [1089], 917: [1090], 918: [1091], 919: [1092], 920: [1093], 921: [1094], 922: [1095], 923: [1096], 924: [1097], 925: [1098], 926: [1099], 927: [1100], 928: [1102], 929: [1103], 930: [1104], 931: [1105], 932: [1106], 933: [1107], 934: [1109], 935: [1110], 936: [1111], 937: [1112], 938: [1113], 939: [1114], 940: [1116], 941: [1117], 942: [1118], 943: [1119], 944: [1120], 945: [1121]} [model_handling.py at line 1169]  -DEBUG: multi_channel_map =  {1: [0], 2: [1], 3: [3], 4: [4], 5: [6], 6: [7], 7: [13], 8: [14], 9: [15], 10: [17], 11: [18], 12: [19], 13: [21], 14: [22], 15: [23], 16: [25], 17: [26], 18: [27], 19: [28], 20: [29], 21: [30], 22: [32], 23: [33], 24: [34], 25: [35], 26: [36], 27: [37], 28: [38], 29: [39], 30: [40], 31: [41], 32: [42], 33: [43], 34: [44], 35: [45], 36: [46], 37: [48], 38: [49], 39: [50], 40: [51], 41: [52], 42: [53], 43: [54], 44: [55], 45: [56], 46: [57], 47: [58], 48: [59], 49: [60], 50: [61], 51: [62], 52: [64], 53: [65], 54: [66], 55: [67], 56: [68], 57: [69], 58: [70], 59: [71], 60: [72], 61: [73], 62: [74], 63: [75], 64: [76], 65: [77], 66: [78], 67: [80], 68: [81], 69: [82], 70: [83], 71: [84], 72: [85], 73: [86], 74: [87], 75: [88], 76: [90], 77: [91], 78: [92], 79: [93], 80: [94], 81: [95], 82: [96], 83: [97], 84: [98], 85: [100], 86: [101], 87: [102], 88: [103], 89: [104], 90: [105], 91: [106], 92: [107], 93: [108], 94: [109], 95: [110], 96: [111], 97: [112], 98: [113], 99: [114], 100: [115], 101: [116], 102: [117], 103: [118], 104: [119], 105: [120], 106: [123], 107: [124], 108: [125], 109: [126], 110: [127], 111: [128], 112: [129], 113: [130], 114: [131], 115: [132], 116: [133], 117: [134], 118: [135], 119: [136], 120: [137], 121: [139], 122: [140], 123: [142], 124: [143], 125: [144], 126: [145], 127: [146], 128: [147], 129: [148], 130: [149], 131: [150], 132: [151], 133: [152], 134: [153], 135: [154], 136: [155], 137: [156], 138: [158], 139: [159], 140: [160], 141: [161], 142: [162], 143: [163], 144: [164], 145: [165], 146: [166], 147: [167], 148: [168], 149: [169], 150: [170], 151: [171], 152: 
[172], 153: [174], 154: [175], 155: [176], 156: [177], 157: [178], 158: [179], 159: [180], 160: [181], 161: [182], 162: [183], 163: [184], 164: [185], 165: [186], 166: [187], 167: [188], 168: [189], 169: [190], 170: [191], 171: [192], 172: [193], 173: [194], 174: [195], 175: [196], 176: [197], 177: [198], 178: [199], 179: [200], 180: [201], 181: [202], 182: [203], 183: [204], 184: [205], 185: [206], 186: [207], 187: [208], 188: [209], 189: [210], 190: [211], 191: [212], 192: [213], 193: [214], 194: [215], 195: [216], 196: [217], 197: [219], 198: [220], 199: [221], 200: [222], 201: [223], 202: [224], 203: [226], 204: [227], 205: [228], 206: [229], 207: [230], 208: [231], 209: [233], 210: [234], 211: [246], 212: [247], 213: [248], 214: [249], 215: [250], 216: [251], 217: [252], 218: [253], 219: [254], 220: [255], 221: [256], 222: [257], 223: [258], 224: [259], 225: [260], 226: [262], 227: [263], 228: [265], 229: [266], 230: [267], 231: [268], 232: [269], 233: [270], 234: [271], 235: [272], 236: [273], 237: [274], 238: [275], 239: [276], 240: [277], 241: [278], 242: [279], 243: [281], 244: [282], 245: [283], 246: [284], 247: [285], 248: [286], 249: [287], 250: [288], 251: [289], 252: [290], 253: [291], 254: [292], 255: [293], 256: [294], 257: [295], 258: [297], 259: [298], 260: [299], 261: [300], 262: [301], 263: [302], 264: [303], 265: [304], 266: [305], 267: [306], 268: [307], 269: [308], 270: [309], 271: [310], 272: [311], 273: [312], 274: [313], 275: [314], 276: [315], 277: [316], 278: [317], 279: [318], 280: [319], 281: [320], 282: [321], 283: [322], 284: [323], 285: [324], 286: [325], 287: [326], 288: [327], 289: [328], 290: [329], 291: [330], 292: [331], 293: [332], 294: [333], 295: [334], 296: [335], 297: [336], 298: [337], 299: [338], 300: [339], 301: [340], 302: [342], 303: [343], 304: [344], 305: [345], 306: [346], 307: [347], 308: [349], 309: [350], 310: [351], 311: [352], 312: [353], 313: [354], 314: [356], 315: [357], 316: [369], 317: [370], 318: [371], 319: [372], 320: [373], 321: [374], 322: [376], 323: [377], 324: [378], 325: [379], 326: [380], 327: [381], 328: [382], 329: [383], 330: [384], 331: [385], 332: [386], 333: [387], 334: [388], 335: [389], 336: [390], 337: [392], 338: [393], 339: [394], 340: [395], 341: [396], 342: [397], 343: [398], 344: [399], 345: [400], 346: [401], 347: [402], 348: [403], 349: [404], 350: [405], 351: [406], 352: [408], 353: [409], 354: [410], 355: [411], 356: [412], 357: [413], 358: [414], 359: [415], 360: [416], 361: [417], 362: [418], 363: [419], 364: [420], 365: [421], 366: [422], 367: [424], 368: [425], 369: [426], 370: [427], 371: [428], 372: [429], 373: [430], 374: [431], 375: [432], 376: [433], 377: [434], 378: [436], 379: [437], 380: [439], 381: [440], 382: [446], 383: [447], 384: [448], 385: [449], 386: [450], 387: [451], 388: [452], 389: [453], 390: [454], 391: [456], 392: [457], 393: [458], 394: [459], 395: [460], 396: [461], 397: [462], 398: [463], 399: [464], 400: [466], 401: [467], 402: [468], 403: [469], 404: [470], 405: [471], 406: [472], 407: [473], 408: [474], 409: [476], 410: [477], 411: [478], 412: [479], 413: [480], 414: [481], 415: [483], 416: [484], 417: [485], 418: [486], 419: [487], 420: [488], 421: [492], 422: [493], 423: [494], 424: [495], 425: [496], 426: [497], 427: [499], 428: [500], 429: [501], 430: [502], 431: [503], 432: [504], 433: [505], 434: [506], 435: [507], 436: [508], 437: [509], 438: [510], 439: [511], 440: [512], 441: [513], 442: [515], 443: [516], 444: [517], 445: [518], 446: [519], 447: [520], 448: 
[521], 449: [522], 450: [523], 451: [524], 452: [525], 453: [526], 454: [527], 455: [528], 456: [529], 457: [531], 458: [532], 459: [533], 460: [534], 461: [535], 462: [536], 463: [537], 464: [538], 465: [539], 466: [540], 467: [541], 468: [542], 469: [543], 470: [544], 471: [545], 472: [547], 473: [548], 474: [549], 475: [550], 476: [551], 477: [552], 478: [553], 479: [554], 480: [555], 481: [556], 482: [557], 483: [559], 484: [560], 485: [562], 486: [563], 487: [569], 488: [570], 489: [571], 490: [572], 491: [573], 492: [574], 493: [575], 494: [576], 495: [577], 496: [579], 497: [580], 498: [581], 499: [582], 500: [583], 501: [584], 502: [585], 503: [586], 504: [587], 505: [589], 506: [590], 507: [591], 508: [592], 509: [593], 510: [594], 511: [595], 512: [596], 513: [597], 514: [599], 515: [600], 516: [601], 517: [602], 518: [603], 519: [604], 520: [606], 521: [607], 522: [608], 523: [609], 524: [610], 525: [611], 526: [615], 527: [616], 528: [617], 529: [618], 530: [619], 531: [620], 532: [622], 533: [623], 534: [624], 535: [625], 536: [626], 537: [627], 538: [628], 539: [629], 540: [630], 541: [631], 542: [632], 543: [633], 544: [634], 545: [635], 546: [636], 547: [638], 548: [639], 549: [640], 550: [641], 551: [642], 552: [643], 553: [644], 554: [645], 555: [646], 556: [647], 557: [648], 558: [649], 559: [650], 560: [651], 561: [652], 562: [654], 563: [655], 564: [656], 565: [657], 566: [658], 567: [659], 568: [660], 569: [661], 570: [662], 571: [663], 572: [664], 573: [665], 574: [666], 575: [667], 576: [668], 577: [670], 578: [671], 579: [672], 580: [673], 581: [674], 582: [675], 583: [676], 584: [677], 585: [678], 586: [679], 587: [680], 588: [682], 589: [683], 590: [685], 591: [686], 592: [692], 593: [693], 594: [694], 595: [695], 596: [696], 597: [697], 598: [698], 599: [699], 600: [700], 601: [702], 602: [703], 603: [704], 604: [705], 605: [706], 606: [707], 607: [708], 608: [709], 609: [710], 610: [712], 611: [713], 612: [714], 613: [715], 614: [716], 615: [717], 616: [718], 617: [719], 618: [720], 619: [722], 620: [723], 621: [724], 622: [725], 623: [726], 624: [727], 625: [729], 626: [730], 627: [731], 628: [732], 629: [733], 630: [734], 631: [738], 632: [739], 633: [740], 634: [741], 635: [742], 636: [743], 637: [744], 638: [745], 639: [746], 640: [747], 641: [748], 642: [749], 643: [750], 644: [751], 645: [752], 646: [753], 647: [754], 648: [755], 649: [756], 650: [757], 651: [758], 652: [759], 653: [760], 654: [761], 655: [762], 656: [763], 657: [764], 658: [765], 659: [766], 660: [767], 661: [768], 662: [769], 663: [770], 664: [772], 665: [773], 666: [774], 667: [775], 668: [776], 669: [777], 670: [779], 671: [780], 672: [781], 673: [782], 674: [783], 675: [784], 676: [788], 677: [789], 678: [790], 679: [791], 680: [792], 681: [793], 682: [794], 683: [795], 684: [796], 685: [797], 686: [798], 687: [799], 688: [800], 689: [801], 690: [802], 691: [803], 692: [804], 693: [805], 694: [806], 695: [807], 696: [808], 697: [809], 698: [810], 699: [811], 700: [812], 701: [813], 702: [814], 703: [815], 704: [816], 705: [817], 706: [818], 707: [819], 708: [820], 709: [822], 710: [823], 711: [824], 712: [825], 713: [826], 714: [827], 715: [829], 716: [830], 717: [831], 718: [832], 719: [833], 720: [834], 721: [838], 722: [839], 723: [841], 724: [842], 725: [844], 726: [845], 727: [851], 728: [852], 729: [853], 730: [854], 731: [855], 732: [856], 733: [857], 734: [858], 735: [859], 736: [861], 737: [862], 738: [863], 739: [864], 740: [865], 741: [866], 742: [867], 743: [868], 744: 
[869], 745: [871], 746: [872], 747: [873], 748: [874], 749: [875], 750: [876], 751: [877], 752: [878], 753: [879], 754: [881], 755: [882], 756: [883], 757: [884], 758: [885], 759: [886], 760: [888], 761: [889], 762: [890], 763: [891], 764: [892], 765: [893], 766: [894], 767: [895], 768: [897], 769: [898], 770: [900], 771: [901], 772: [907], 773: [908], 774: [909], 775: [910], 776: [911], 777: [912], 778: [913], 779: [914], 780: [915], 781: [917], 782: [918], 783: [919], 784: [920], 785: [921], 786: [922], 787: [923], 788: [924], 789: [925], 790: [927], 791: [928], 792: [929], 793: [930], 794: [931], 795: [932], 796: [933], 797: [934], 798: [935], 799: [937], 800: [938], 801: [939], 802: [940], 803: [941], 804: [942], 805: [944], 806: [945], 807: [946], 808: [947], 809: [948], 810: [949], 811: [950], 812: [951], 813: [953], 814: [954], 815: [956], 816: [957], 817: [963], 818: [964], 819: [965], 820: [966], 821: [967], 822: [968], 823: [969], 824: [970], 825: [971], 826: [973], 827: [974], 828: [975], 829: [976], 830: [977], 831: [978], 832: [979], 833: [980], 834: [981], 835: [983], 836: [984], 837: [985], 838: [986], 839: [987], 840: [988], 841: [989], 842: [990], 843: [991], 844: [993], 845: [994], 846: [995], 847: [996], 848: [997], 849: [998], 850: [1000], 851: [1001], 852: [1002], 853: [1003], 854: [1004], 855: [1005], 856: [1006], 857: [1007], 858: [1009], 859: [1010], 860: [1012], 861: [1013], 862: [1018], 863: [1019], 864: [1021], 865: [1022], 866: [1024], 867: [1025], 868: [1030], 869: [1031], 870: [1033], 871: [1034], 872: [1036], 873: [1037], 874: [1045], 875: [1046], 876: [1047], 877: [1048], 878: [1049], 879: [1050], 880: [1051], 881: [1052], 882: [1053], 883: [1054], 884: [1055], 885: [1056], 886: [1057], 887: [1058], 888: [1059], 889: [1060], 890: [1061], 891: [1062], 892: [1064], 893: [1065], 894: [1066], 895: [1067], 896: [1068], 897: [1069], 898: [1070], 899: [1071], 900: [1072], 901: [1073], 902: [1074], 903: [1075], 904: [1076], 905: [1077], 906: [1078], 907: [1079], 908: [1080], 909: [1081], 910: [1083], 911: [1084], 912: [1085], 913: [1086], 914: [1087], 915: [1088], 916: [1089], 917: [1090], 918: [1091], 919: [1092], 920: [1093], 921: [1094], 922: [1095], 923: [1096], 924: [1097], 925: [1098], 926: [1099], 927: [1100], 928: [1102], 929: [1103], 930: [1104], 931: [1105], 932: [1106], 933: [1107], 934: [1109], 935: [1110], 936: [1111], 937: [1112], 938: [1113], 939: [1114], 940: [1116], 941: [1117], 942: [1118], 943: [1119], 944: [1120], 945: [1121]} [model_handling.py at line 1654]  -DEBUG: diag_to_config =  {1: 1, 2: 2, 6: 3, 7: 4, 11: 5, 12: 6, 28: 7, 29: 8, 30: 9, 34: 10, 35: 11, 36: 12, 40: 13, 41: 14, 42: 15, 46: 16, 47: 17, 48: 18, 49: 19, 50: 20, 51: 21, 55: 22, 56: 23, 57: 24, 58: 25, 59: 26, 60: 27, 61: 28, 62: 29, 63: 30, 64: 31, 65: 32, 66: 33, 67: 34, 68: 35, 69: 36, 73: 37, 74: 38, 75: 39, 76: 40, 77: 41, 78: 42, 79: 43, 80: 44, 81: 45, 82: 46, 83: 47, 84: 48, 85: 49, 86: 50, 87: 51, 91: 52, 92: 53, 93: 54, 94: 55, 95: 56, 96: 57, 97: 58, 98: 59, 99: 60, 100: 61, 101: 62, 102: 63, 103: 64, 104: 65, 105: 66, 109: 67, 110: 68, 111: 69, 112: 70, 113: 71, 114: 72, 115: 73, 116: 74, 117: 75, 121: 76, 122: 77, 123: 78, 124: 79, 125: 80, 126: 81, 127: 82, 128: 83, 129: 84, 133: 85, 134: 86, 135: 87, 136: 88, 137: 89, 138: 90, 139: 91, 140: 92, 141: 93, 142: 94, 143: 95, 144: 96, 145: 97, 146: 98, 147: 99, 148: 100, 149: 101, 150: 102, 151: 103, 152: 104, 153: 105, 160: 106, 161: 107, 162: 108, 163: 109, 164: 110, 165: 111, 166: 112, 167: 113, 168: 114, 169: 115, 
..., 1481: 944, 1482: 945} [model_handling.py at line 1711]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1824]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]  -DEBUG: ('ZERO', 6, 1, 6, 6) [model_handling.py at line 1824]  +FileWriter for ././CPPProcess.h +FileWriter for ././CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
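(For context: the vxxxxx calls traced in the DEBUG lines above are the external-particle wavefunction initializations that the plugin writes into the generated CPPProcess.cc, e.g. "vxxxxx( momenta, 0., cHel[ihel][6], +1, w_fp[6], 6 )" in the gg_ttxggg diff further below. A minimal self-contained C++ sketch of that call pattern follows; the stub types and stub signature here are assumptions for illustration, not the real HelAmps_sm API.)

#include <complex>
using fptype = double;                // matches the double-precision build
using cxtype = std::complex<fptype>;  // stand-in for the plugin's complex type

// Stub with the shape of the generated call: fill the 6-component external
// vector wavefunction w for particle ipar from the event momenta, given its
// mass (0. for gluons), helicity and incoming(-1)/outgoing(+1) direction.
void vxxxxx( const fptype* momenta, fptype mass, int hel, int dir, cxtype* w, int ipar ) {}

int main()
{
  fptype momenta[7 * 4] = {};  // 7 external legs x (E,px,py,pz)
  cxtype w_sv[7][6];           // one wavefunction per external leg
  const int cHel[1][7] = { { -1, -1, -1, +1, +1, +1, +1 } }; // one helicity combination
  const int ihel = 0;
  // Same pattern as the five DEBUG 'call = vxxxxx(...)' lines above:
  // gluons 0 and 1 are incoming (-1), gluons 4, 5 and 6 are outgoing (+1);
  // legs 2 and 3 (the t tbar pair) are handled by the fermion routines instead.
  vxxxxx( momenta, 0., cHel[ihel][0], -1, w_sv[0], 0 );
  vxxxxx( momenta, 0., cHel[ihel][1], -1, w_sv[1], 1 );
  vxxxxx( momenta, 0., cHel[ihel][4], +1, w_sv[4], 4 );
  vxxxxx( momenta, 0., cHel[ihel][5], +1, w_sv[5], 5 );
  vxxxxx( momenta, 0., cHel[ihel][6], +1, w_sv[6], 6 );
  return 0;
}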
-DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_processidfile [model_handling.py at line 1389]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_coloramps [model_handling.py at line 1401]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  -DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxggg.txt [model_handling.py at line 1335]  DEBUG: proc_id =  1 [export_cpp.py at line 710]  DEBUG: config_map =  [1, 2, 0, 3, 4, 0, 5, 6, 0, ..., 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at
line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.556 s -Wrote files for 2281 helas calls in 46.775 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.589 s +Wrote files for 2281 helas calls in 18.549 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.315 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  +ALOHA: aloha creates 5 routines in 0.321 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.312 s +ALOHA: aloha creates 10 routines in 0.314 s VVV1 VVV1 FFV1 @@ -250,32 +221,103 @@ ALOHA: aloha creates 10 routines in 0.312 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/src/. +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/src/. 
+INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -Output to directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg done. +DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +patching file Source/genps.inc +patching file Source/makefile +patching file SubProcesses/makefile +patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). +patching file bin/internal/madevent_interface.py +DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses/P1_gg_ttxggg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +patching file auto_dsig1.f +patching file driver.f +patching file matrix1.f +Hunk #2 succeeded at 255 (offset 112 lines). +Hunk #3 succeeded at 333 (offset 112 lines). +Hunk #4 succeeded at 361 (offset 112 lines). +Hunk #5 succeeded at 406 (offset 112 lines). +DEBUG: p.returncode =  0 [output.py at line 237]  +Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg/README +/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m57.415s -user 0m56.319s -sys 0m0.892s +real 0m29.295s +user 0m28.742s +sys 0m0.442s +Code generation completed in 29 seconds +************************************************************ +* * +* W E L C O M E to * +* M A D G R A P H 5 _ a M C @ N L O * +* M A D E V E N T * +* * +* * * * +* * * * * * +* * * * * 5 * * * * * +* * * * * * +* * * * +* * +* VERSION 3.5.2_lo_vect * +* * +* The MadGraph5_aMC@NLO Development Team - Find us at * +* https://server06.fynu.ucl.ac.be/projects/madgraph * +* * +* Type 'help' for in-line help. * +* * +************************************************************ +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt +treatcards run +quit +INFO: +launch in debug mode +************************************************************ +* * +* W E L C O M E to * +* M A D G R A P H 5 _ a M C @ N L O * +* M A D E V E N T * +* * +* * * * +* * * * * * +* * * * * 5 * * * * * +* * * * * * +* * * * +* * +* VERSION 3.5.2_lo_vect * +* * +* The MadGraph5_aMC@NLO Development Team - Find us at * +* https://server06.fynu.ucl.ac.be/projects/madgraph * +* * +* Type 'help' for in-line help. * +* * +************************************************************ +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +treatcards param +quit +INFO: +launch in debug mode diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt index 00d7c6f8d6..cdeedc7863 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt @@ -234,7 +234,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo +#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo +#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat index 5e6deebdd7..3923568dd8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * @@ -45,5 +45,5 @@ define l+ = e+ mu+ define l- = e- mu- define vl = ve vm vt define vl~ = ve~ vm~ vt~ -output madevent CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_si\ -ze=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False -\ +-vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat index 0a6bf20eb9..8170176a11 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat @@ -80,7 +80,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! 
limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -174,12 +190,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat index 3714e71997..8da9ac1563 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat @@ -80,7 +80,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! 
limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -173,3 +189,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc index ec923afd6d..fa0f3d86f5 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV1_1.o VVVV4_0.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3_0.o VVVV1_0.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o +ALOHARoutine = FFV1P0_3.o FFV1_0.o FFV1_1.o FFV1_2.o VVV1P0_1.o VVV1_0.o VVVV1P0_1.o VVVV1_0.o VVVV3P0_1.o VVVV3_0.o VVVV4P0_1.o VVVV4_0.o diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/PDF/pdfwrap_lhapdf.f b/epochX/cudacpp/gg_ttggg.mad/Source/PDF/pdfwrap_lhapdf.f index 0be926e6cd..3f36905346 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/PDF/pdfwrap_lhapdf.f +++ b/epochX/cudacpp/gg_ttggg.mad/Source/PDF/pdfwrap_lhapdf.f @@ -5,6 +5,7 @@ SUBROUTINE PDFWRAP C INCLUDE 'pdf.inc' INCLUDE '../alfas.inc' + INCLUDE '../vector.inc' INCLUDE '../coupl.inc' REAL*8 ZMASS DATA ZMASS/91.188D0/ diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts index bd3c24228d..e4b87ee6ad 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts @@ -1,17 +1,12 @@ -pdlabel1= -pdlabel2= -lhapdf= -PYTHIA8_PATH=NotInstalled -MG5AMC_VERSION=3.5.0_lo_vect -GLOBAL_FLAG=-O3 -ffast-math -fbounds-check -ALOHA_FLAG= -MATRIX_FLAG= DEFAULT_CPP_COMPILER=g++ +DEFAULT_F2PY_COMPILER=f2py3 +DEFAULT_F_COMPILER=gfortran +GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= -STDLIB=-lstdc++ +MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime +PYTHIA8_PATH=NotInstalled STDLIB_FLAG= -DEFAULT_F_COMPILER=gfortran -DEFAULT_F2PY_COMPILER=f2py3 +STDLIB=-lstdc++ #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/makefile b/epochX/cudacpp/gg_ttggg.mad/Source/makefile index dbe08b846e..00c73099a0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/makefile +++ b/epochX/cudacpp/gg_ttggg.mad/Source/makefile @@ -136,5 +136,7 @@ cleanSource: clean: cleanSource for i in `ls -d ../SubProcesses/P*`; do cd $$i; make clean; cd -; done; -cleanall: cleanSource +cleanavx: + for i in `ls -d ../SubProcesses/P*`; do cd $$i; make cleanavxs; cd -; done; +cleanall: cleanSource # THIS IS THE ONE for i in `ls -d ../SubProcesses/P*`; do cd $$i; make cleanavxs; cd -; done; diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/param_card.inc b/epochX/cudacpp/gg_ttggg.mad/Source/param_card.inc index 1fcfce55bb..081365c16b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/param_card.inc +++ b/epochX/cudacpp/gg_ttggg.mad/Source/param_card.inc @@ -1,15 +1,15 @@ - MDL_WZ = 2.441404D+00 - MDL_WW = 2.047600D+00 - MDL_WH = 6.382339D-03 - MDL_WT = 1.491500D+00 + MDL_MB = 4.700000D+00 + MDL_MT = 1.730000D+02 MDL_MTA = 1.777000D+00 MDL_MZ = 9.118800D+01 MDL_MH = 1.250000D+02 - MDL_MB = 4.700000D+00 - MDL_MT = 1.730000D+02 AEWM1 = 1.325070D+02 MDL_GF = 1.166390D-05 AS = 1.180000D-01 - MDL_YMTAU = 1.777000D+00 MDL_YMB = 4.700000D+00 MDL_YMT = 1.730000D+02 + MDL_YMTAU = 1.777000D+00 + MDL_WT = 1.491500D+00 + MDL_WZ = 2.441404D+00 + MDL_WW = 2.047600D+00 
+ MDL_WH = 6.382339D-03 diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/vector.inc b/epochX/cudacpp/gg_ttggg.mad/Source/vector.inc index 92254c0f2a..863eebbc70 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/vector.inc +++ b/epochX/cudacpp/gg_ttggg.mad/Source/vector.inc @@ -28,5 +28,4 @@ C BECAUSE IT DOES NOT GO THROUGH THE CPP PREPROCESSOR C (see https://github.com/madgraph5/madgraph4gpu/issues/458). C INTEGER VECSIZE_MEMMAX - PARAMETER (VECSIZE_MEMMAX=16384) ! NB: 16k events per GPU grid is the minimum required to fill a V100 GPU -c PARAMETER (VECSIZE_MEMMAX=32) ! NB: workaround for out-of-memory on Juwels: 32 is enough for no-CUDA builds (issue #498) + PARAMETER (VECSIZE_MEMMAX=16384) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h index f37c972b24..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -244,14 +245,21 @@ namespace mg5amcCpu } std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - CPPProcess process( /*verbose=*/false ); m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL - process.initProc( "../../Cards/param_card.dat" ); + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
+ CPPProcess process( /*verbose=*/false ); + std::string paramCard = "../../Cards/param_card.dat"; + if( !std::filesystem::exists( paramCard ) ) + { + paramCard = "../" + paramCard; + } + process.initProc( paramCard ); } #ifdef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h index 9c467b1e04..6a7d9c05c0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h @@ -39,6 +39,8 @@ #elif defined __HIPCC__ +#include "hip/hip_runtime.h" + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h index 176338151a..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -215,19 +216,16 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) #endif constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) // Dump events to a new reference file? - constexpr bool dumpEvents = false; - std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; - while( dumpFileName.find( '/' ) != std::string::npos ) - { - dumpFileName.replace( dumpFileName.find( '/' ), 1, "_" ); - } + const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); + const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); + const std::string refFileName = testDriver->getRefFileName(); + const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); std::ofstream dumpFile; if( dumpEvents ) { dumpFile.open( dumpFileName, std::ios::trunc ); } // Read reference data - const std::string refFileName = testDriver->getRefFileName(); std::map referenceData; if( !dumpEvents ) { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index d6d6c4f179..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -112,10 +112,17 @@ namespace mg5amcCpu // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu bool ok = true; // this is just an assumption! const std::string tag = "arm neon (128bit as in SSE4.2)"; -#else +#elif defined( __x86_64__ ) || defined( __i386__ ) bool known = true; bool ok = __builtin_cpu_supports( "sse4.2" ); const std::string tag = "nehalem (SSE4.2)"; +#else // AV FIXME! 
Added by OM for Mac, should identify the correct __xxx__ flag that should be targeted + bool known = false; // __builtin_cpu_supports is not supported + // See https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions-Available-on-all-Configurations.html + // See https://stackoverflow.com/q/62783908 + // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu + bool ok = true; // this is just an assumption! + const std::string tag = "arm neon (128bit as in SSE4.2)"; #endif #else bool known = true; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index 58958364eb..a478ecb28e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -253,13 +253,13 @@ namespace mg5amcCpu vxxxxx( momenta, 0., cHel[ihel][6], +1, w_fp[6], 6 ); - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 0., 0., w_fp[7] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 0., 0., w_fp[8] ); - VVV1P0_1( w_fp[7], w_fp[4], COUPs[0], 0., 0., w_fp[9] ); - VVV1P0_1( w_fp[8], w_fp[5], COUPs[0], 0., 0., w_fp[10] ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[7] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); + VVV1P0_1( w_fp[7], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[9] ); + VVV1P0_1( w_fp[8], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); // Amplitude(s) for diagram number 1 - VVV1_0( w_fp[9], w_fp[10], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[9], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -284,10 +284,10 @@ namespace mg5amcCpu // *** DIAGRAM 2 OF 1240 *** // Wavefunction(s) for diagram number 2 - VVV1P0_1( w_fp[8], w_fp[6], COUPs[0], 0., 0., w_fp[11] ); + VVV1P0_1( w_fp[8], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[11] ); // Amplitude(s) for diagram number 2 - VVV1_0( w_fp[9], w_fp[11], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[9], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -315,7 +315,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 3 - VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); jamp_sv[1] -= amp_sv[0]; jamp_sv[25] += amp_sv[0]; jamp_sv[49] += amp_sv[0]; @@ -332,7 +332,7 @@ namespace mg5amcCpu jamp_sv[102] -= amp_sv[0]; jamp_sv[108] -= amp_sv[0]; jamp_sv[110] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] -= 
amp_sv[0]; jamp_sv[24] += amp_sv[0]; jamp_sv[48] += amp_sv[0]; @@ -349,7 +349,7 @@ namespace mg5amcCpu jamp_sv[116] += amp_sv[0]; jamp_sv[118] += amp_sv[0]; jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] -= amp_sv[0]; jamp_sv[1] += amp_sv[0]; jamp_sv[24] += amp_sv[0]; @@ -370,11 +370,11 @@ namespace mg5amcCpu // *** DIAGRAM 4 OF 1240 *** // Wavefunction(s) for diagram number 4 - VVV1P0_1( w_fp[7], w_fp[5], COUPs[0], 0., 0., w_fp[12] ); - VVV1P0_1( w_fp[8], w_fp[4], COUPs[0], 0., 0., w_fp[13] ); + VVV1P0_1( w_fp[7], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[12] ); + VVV1P0_1( w_fp[8], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[13] ); // Amplitude(s) for diagram number 4 - VVV1_0( w_fp[12], w_fp[13], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[12], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -402,7 +402,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 5 - VVV1_0( w_fp[12], w_fp[11], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[12], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -430,7 +430,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 6 - VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); jamp_sv[3] -= amp_sv[0]; jamp_sv[27] += amp_sv[0]; jamp_sv[48] += amp_sv[0]; @@ -447,7 +447,7 @@ namespace mg5amcCpu jamp_sv[103] -= amp_sv[0]; jamp_sv[114] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); jamp_sv[2] -= amp_sv[0]; jamp_sv[26] += amp_sv[0]; jamp_sv[48] += amp_sv[0]; @@ -464,7 +464,7 @@ namespace mg5amcCpu jamp_sv[113] -= amp_sv[0]; jamp_sv[114] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); jamp_sv[2] -= amp_sv[0]; jamp_sv[3] += amp_sv[0]; jamp_sv[26] += amp_sv[0]; @@ -485,10 +485,10 @@ namespace mg5amcCpu // *** DIAGRAM 7 OF 1240 *** // Wavefunction(s) for diagram number 7 - VVV1P0_1( w_fp[7], w_fp[6], COUPs[0], 0., 0., w_fp[14] ); + VVV1P0_1( w_fp[7], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[14] ); // Amplitude(s) for diagram number 7 - VVV1_0( w_fp[14], w_fp[13], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[14], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -516,7 +516,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 8 - VVV1_0( w_fp[14], w_fp[10], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[14], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -544,7 +544,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 9 - VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[8], w_fp[4], 
w_fp[5], w_fp[14], COUPs[2], 1.0, &_fp[0] ); jamp_sv[5] -= amp_sv[0]; jamp_sv[29] += amp_sv[0]; jamp_sv[49] += amp_sv[0]; @@ -561,7 +561,7 @@ namespace mg5amcCpu jamp_sv[92] += amp_sv[0]; jamp_sv[97] += amp_sv[0]; jamp_sv[103] -= amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &_fp[0] ); jamp_sv[4] -= amp_sv[0]; jamp_sv[28] += amp_sv[0]; jamp_sv[49] += amp_sv[0]; @@ -578,7 +578,7 @@ namespace mg5amcCpu jamp_sv[92] += amp_sv[0]; jamp_sv[96] += amp_sv[0]; jamp_sv[102] -= amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &_fp[0] ); jamp_sv[4] -= amp_sv[0]; jamp_sv[5] += amp_sv[0]; jamp_sv[28] += amp_sv[0]; @@ -599,12 +599,12 @@ namespace mg5amcCpu // *** DIAGRAM 10 OF 1240 *** // Wavefunction(s) for diagram number 10 - VVVV1P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[15] ); - VVVV3P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[16] ); - VVVV4P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[17] ); + VVVV1P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] ); + VVVV3P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[16] ); + VVVV4P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] ); // Amplitude(s) for diagram number 10 - VVV1_0( w_fp[8], w_fp[6], w_fp[15], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[15], COUPs[0], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; jamp_sv[24] -= amp_sv[0]; @@ -621,7 +621,7 @@ namespace mg5amcCpu jamp_sv[113] -= amp_sv[0]; jamp_sv[118] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &_fp[0] ); jamp_sv[2] -= amp_sv[0]; jamp_sv[26] += amp_sv[0]; jamp_sv[48] += amp_sv[0]; @@ -638,7 +638,7 @@ namespace mg5amcCpu jamp_sv[113] -= amp_sv[0]; jamp_sv[114] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[17], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[17], COUPs[0], 1.0, &_fp[0] ); jamp_sv[0] -= amp_sv[0]; jamp_sv[24] += amp_sv[0]; jamp_sv[48] += amp_sv[0]; @@ -659,12 +659,12 @@ namespace mg5amcCpu // *** DIAGRAM 11 OF 1240 *** // Wavefunction(s) for diagram number 11 - VVVV1P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[18] ); - VVVV3P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[19] ); - VVVV4P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[20] ); + VVVV1P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[18] ); + VVVV3P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[19] ); + VVVV4P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[20] ); // Amplitude(s) for diagram number 11 - VVV1_0( w_fp[8], w_fp[5], w_fp[18], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[18], COUPs[0], 1.0, &_fp[0] ); jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; jamp_sv[25] -= amp_sv[0]; @@ -681,7 +681,7 @@ namespace mg5amcCpu jamp_sv[95] += amp_sv[0]; jamp_sv[108] += amp_sv[0]; jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[19], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[19], COUPs[0], 1.0, &_fp[0] ); jamp_sv[4] -= amp_sv[0]; jamp_sv[28] += amp_sv[0]; jamp_sv[49] += amp_sv[0]; @@ -698,7 +698,7 @@ namespace mg5amcCpu jamp_sv[92] += amp_sv[0]; jamp_sv[96] += amp_sv[0]; jamp_sv[102] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[20], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[20], 
COUPs[0], 1.0, &_fp[0] ); jamp_sv[1] -= amp_sv[0]; jamp_sv[25] += amp_sv[0]; jamp_sv[49] += amp_sv[0]; @@ -719,12 +719,12 @@ namespace mg5amcCpu // *** DIAGRAM 12 OF 1240 *** // Wavefunction(s) for diagram number 12 - VVVV1P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[21] ); - VVVV3P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[22] ); - VVVV4P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[23] ); + VVVV1P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] ); + VVVV4P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] ); // Amplitude(s) for diagram number 12 - VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &_fp[0] ); jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; jamp_sv[27] -= amp_sv[0]; @@ -741,7 +741,7 @@ namespace mg5amcCpu jamp_sv[92] += amp_sv[0]; jamp_sv[114] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &_fp[0] ); jamp_sv[5] -= amp_sv[0]; jamp_sv[29] += amp_sv[0]; jamp_sv[49] += amp_sv[0]; @@ -758,7 +758,7 @@ namespace mg5amcCpu jamp_sv[92] += amp_sv[0]; jamp_sv[97] += amp_sv[0]; jamp_sv[103] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &_fp[0] ); jamp_sv[3] -= amp_sv[0]; jamp_sv[27] += amp_sv[0]; jamp_sv[48] += amp_sv[0]; @@ -779,10 +779,10 @@ namespace mg5amcCpu // *** DIAGRAM 13 OF 1240 *** // Wavefunction(s) for diagram number 13 - VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 0., 0., w_fp[24] ); + VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[24] ); // Amplitude(s) for diagram number 13 - VVVV1_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; jamp_sv[4] -= amp_sv[0]; @@ -799,7 +799,7 @@ namespace mg5amcCpu jamp_sv[113] -= amp_sv[0]; jamp_sv[118] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; jamp_sv[24] -= amp_sv[0]; @@ -816,7 +816,7 @@ namespace mg5amcCpu jamp_sv[113] -= amp_sv[0]; jamp_sv[118] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[4] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; jamp_sv[28] -= amp_sv[0]; @@ -837,10 +837,10 @@ namespace mg5amcCpu // *** DIAGRAM 14 OF 1240 *** // Wavefunction(s) for diagram number 14 - VVV1P0_1( w_fp[7], w_fp[8], COUPs[0], 0., 0., w_fp[25] ); + VVV1P0_1( w_fp[7], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[25] ); // Amplitude(s) for diagram number 14 - VVV1_0( w_fp[24], w_fp[6], w_fp[25], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[24], w_fp[6], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -865,10 +865,10 @@ namespace mg5amcCpu // *** DIAGRAM 15 OF 1240 *** // Wavefunction(s) for diagram number 15 - VVV1P0_1( w_fp[7], w_fp[24], COUPs[0], 0., 0., w_fp[26] ); + VVV1P0_1( w_fp[7], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[26] ); // Amplitude(s) for diagram number 15 - VVV1_0( 
w_fp[8], w_fp[6], w_fp[26], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[26], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -896,7 +896,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 16 - VVV1_0( w_fp[8], w_fp[24], w_fp[14], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[24], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -921,10 +921,10 @@ namespace mg5amcCpu // *** DIAGRAM 17 OF 1240 *** // Wavefunction(s) for diagram number 17 - VVV1P0_1( w_fp[4], w_fp[6], COUPs[0], 0., 0., w_fp[27] ); + VVV1P0_1( w_fp[4], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[27] ); // Amplitude(s) for diagram number 17 - VVVV1_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; jamp_sv[3] += amp_sv[0]; @@ -941,7 +941,7 @@ namespace mg5amcCpu jamp_sv[95] += amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; - VVVV3_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; jamp_sv[25] -= amp_sv[0]; @@ -958,7 +958,7 @@ namespace mg5amcCpu jamp_sv[95] += amp_sv[0]; jamp_sv[108] += amp_sv[0]; jamp_sv[110] -= amp_sv[0]; - VVVV4_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[2] += amp_sv[0]; jamp_sv[3] -= amp_sv[0]; jamp_sv[26] -= amp_sv[0]; @@ -982,7 +982,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 18 - VVV1_0( w_fp[27], w_fp[5], w_fp[25], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[27], w_fp[5], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1007,10 +1007,10 @@ namespace mg5amcCpu // *** DIAGRAM 19 OF 1240 *** // Wavefunction(s) for diagram number 19 - VVV1P0_1( w_fp[7], w_fp[27], COUPs[0], 0., 0., w_fp[28] ); + VVV1P0_1( w_fp[7], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[28] ); // Amplitude(s) for diagram number 19 - VVV1_0( w_fp[8], w_fp[5], w_fp[28], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[28], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1038,7 +1038,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 20 - VVV1_0( w_fp[8], w_fp[27], w_fp[12], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[27], w_fp[12], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1063,10 +1063,10 @@ namespace mg5amcCpu // *** DIAGRAM 21 OF 1240 *** // Wavefunction(s) for diagram number 21 - VVV1P0_1( w_fp[5], w_fp[6], COUPs[0], 0., 0., w_fp[29] ); + VVV1P0_1( w_fp[5], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] ); // Amplitude(s) for diagram number 21 - VVVV1_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] += 
amp_sv[0]; jamp_sv[1] -= amp_sv[0]; jamp_sv[3] -= amp_sv[0]; @@ -1083,7 +1083,7 @@ namespace mg5amcCpu jamp_sv[95] -= amp_sv[0]; jamp_sv[118] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; jamp_sv[24] -= amp_sv[0]; @@ -1100,7 +1100,7 @@ namespace mg5amcCpu jamp_sv[116] -= amp_sv[0]; jamp_sv[118] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &_fp[0] ); jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; jamp_sv[27] -= amp_sv[0]; @@ -1124,7 +1124,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 22 - VVV1_0( w_fp[4], w_fp[29], w_fp[25], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[4], w_fp[29], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1152,7 +1152,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 23 - VVV1_0( w_fp[8], w_fp[29], w_fp[9], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[29], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1177,10 +1177,10 @@ namespace mg5amcCpu // *** DIAGRAM 24 OF 1240 *** // Wavefunction(s) for diagram number 24 - VVV1P0_1( w_fp[7], w_fp[29], COUPs[0], 0., 0., w_fp[25] ); + VVV1P0_1( w_fp[7], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[25] ); // Amplitude(s) for diagram number 24 - VVV1_0( w_fp[8], w_fp[4], w_fp[25], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1205,12 +1205,12 @@ namespace mg5amcCpu // *** DIAGRAM 25 OF 1240 *** // Wavefunction(s) for diagram number 25 - VVVV1P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[30] ); - VVVV3P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[31] ); - VVVV4P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[32] ); + VVVV1P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[30] ); + VVVV3P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[31] ); + VVVV4P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[32] ); // Amplitude(s) for diagram number 25 - VVV1_0( w_fp[7], w_fp[8], w_fp[30], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[7], w_fp[8], w_fp[30], COUPs[0], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; jamp_sv[3] -= amp_sv[0]; @@ -1227,7 +1227,7 @@ namespace mg5amcCpu jamp_sv[95] -= amp_sv[0]; jamp_sv[118] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[7], w_fp[8], w_fp[31], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[7], w_fp[8], w_fp[31], COUPs[0], 1.0, &_fp[0] ); jamp_sv[1] -= amp_sv[0]; jamp_sv[2] += amp_sv[0]; jamp_sv[3] -= amp_sv[0]; @@ -1244,7 +1244,7 @@ namespace mg5amcCpu jamp_sv[95] -= amp_sv[0]; jamp_sv[112] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; - VVV1_0( w_fp[7], w_fp[8], w_fp[32], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[7], w_fp[8], w_fp[32], COUPs[0], 1.0, &_fp[0] ); jamp_sv[0] -= amp_sv[0]; jamp_sv[2] += amp_sv[0]; jamp_sv[4] += amp_sv[0]; @@ -1265,12 +1265,12 @@ namespace mg5amcCpu // *** DIAGRAM 26 OF 1240 *** // 
Wavefunction(s) for diagram number 26 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[33] ); - FFV1_2( w_fp[3], w_fp[7], COUPs[1], cIPD[0], cIPD[1], w_fp[34] ); - FFV1_1( w_fp[33], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[35] ); + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[33] ); + FFV1_2( w_fp[3], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] ); + FFV1_1( w_fp[33], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[35] ); // Amplitude(s) for diagram number 26 - FFV1_0( w_fp[34], w_fp[35], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1281,10 +1281,10 @@ namespace mg5amcCpu // *** DIAGRAM 27 OF 1240 *** // Wavefunction(s) for diagram number 27 - FFV1_1( w_fp[33], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[36] ); + FFV1_1( w_fp[33], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[36] ); // Amplitude(s) for diagram number 27 - FFV1_0( w_fp[34], w_fp[36], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[36], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1295,10 +1295,10 @@ namespace mg5amcCpu // *** DIAGRAM 28 OF 1240 *** // Wavefunction(s) for diagram number 28 - FFV1P0_3( w_fp[3], w_fp[33], COUPs[1], 0., 0., w_fp[37] ); + FFV1P0_3( w_fp[3], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[37] ); // Amplitude(s) for diagram number 28 - VVV1_0( w_fp[12], w_fp[37], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[12], w_fp[37], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1318,7 +1318,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 29 - FFV1_0( w_fp[3], w_fp[36], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[36], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1334,7 +1334,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 30 - VVV1_0( w_fp[14], w_fp[37], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[14], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1354,7 +1354,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 31 - FFV1_0( w_fp[3], w_fp[35], w_fp[14], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[35], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1370,7 +1370,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 32 - FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; @@ -1379,7 +1379,7 @@ namespace mg5amcCpu jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[71] += cxtype( 0, 1 ) * 
amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &_fp[0] ); jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1388,7 +1388,7 @@ namespace mg5amcCpu jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1401,11 +1401,11 @@ namespace mg5amcCpu // *** DIAGRAM 33 OF 1240 *** // Wavefunction(s) for diagram number 33 - FFV1_2( w_fp[3], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[38] ); - FFV1_1( w_fp[33], w_fp[7], COUPs[1], cIPD[0], cIPD[1], w_fp[39] ); + FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[38] ); + FFV1_1( w_fp[33], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] ); // Amplitude(s) for diagram number 33 - FFV1_0( w_fp[38], w_fp[39], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1416,10 +1416,10 @@ namespace mg5amcCpu // *** DIAGRAM 34 OF 1240 *** // Wavefunction(s) for diagram number 34 - FFV1_2( w_fp[38], w_fp[7], COUPs[1], cIPD[0], cIPD[1], w_fp[40] ); + FFV1_2( w_fp[38], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] ); // Amplitude(s) for diagram number 34 - FFV1_0( w_fp[40], w_fp[33], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[40], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1433,7 +1433,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 35 - FFV1_0( w_fp[38], w_fp[33], w_fp[14], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[33], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1446,10 +1446,10 @@ namespace mg5amcCpu // *** DIAGRAM 36 OF 1240 *** // Wavefunction(s) for diagram number 36 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[41] ); + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[41] ); // Amplitude(s) for diagram number 36 - FFV1_0( w_fp[41], w_fp[39], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[39], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1460,10 +1460,10 @@ namespace mg5amcCpu // *** DIAGRAM 37 OF 1240 *** // Wavefunction(s) for diagram number 37 - FFV1_2( w_fp[41], w_fp[7], COUPs[1], cIPD[0], cIPD[1], w_fp[42] ); + FFV1_2( w_fp[41], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[42] ); // Amplitude(s) for diagram number 37 - FFV1_0( w_fp[42], w_fp[33], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[42], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1477,7 +1477,7 @@ namespace 
mg5amcCpu // (none) // Amplitude(s) for diagram number 38 - FFV1_0( w_fp[41], w_fp[33], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[33], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1493,7 +1493,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 39 - FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1509,7 +1509,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 40 - FFV1_0( w_fp[34], w_fp[33], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1525,7 +1525,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 41 - FFV1_0( w_fp[3], w_fp[33], w_fp[25], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[25], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1542,11 +1542,11 @@ namespace mg5amcCpu // *** DIAGRAM 42 OF 1240 *** // Wavefunction(s) for diagram number 42 - FFV1_1( w_fp[2], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[39] ); - FFV1_1( w_fp[39], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[43] ); + FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] ); + FFV1_1( w_fp[39], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[43] ); // Amplitude(s) for diagram number 42 - FFV1_0( w_fp[34], w_fp[43], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[43], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1557,10 +1557,10 @@ namespace mg5amcCpu // *** DIAGRAM 43 OF 1240 *** // Wavefunction(s) for diagram number 43 - FFV1_1( w_fp[39], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[44] ); + FFV1_1( w_fp[39], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[44] ); // Amplitude(s) for diagram number 43 - FFV1_0( w_fp[34], w_fp[44], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[44], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1571,10 +1571,10 @@ namespace mg5amcCpu // *** DIAGRAM 44 OF 1240 *** // Wavefunction(s) for diagram number 44 - FFV1P0_3( w_fp[3], w_fp[39], COUPs[1], 0., 0., w_fp[45] ); + FFV1P0_3( w_fp[3], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[45] ); // Amplitude(s) for diagram number 44 - VVV1_0( w_fp[9], w_fp[45], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[9], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1594,7 +1594,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 45 - FFV1_0( w_fp[3], w_fp[44], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[44], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1610,7 +1610,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 46 - VVV1_0( w_fp[14], w_fp[45], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[14], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1630,7 +1630,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 47 - FFV1_0( w_fp[3], w_fp[43], w_fp[14], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[43], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1646,7 +1646,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 48 - FFV1_0( w_fp[3], w_fp[39], w_fp[18], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[18], COUPs[1], 1.0, &_fp[0] ); jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; @@ -1655,7 +1655,7 @@ namespace mg5amcCpu jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[19], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[19], COUPs[1], 1.0, &_fp[0] ); jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1664,7 +1664,7 @@ namespace mg5amcCpu jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[20], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[20], COUPs[1], 1.0, &_fp[0] ); jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1677,11 +1677,11 @@ namespace mg5amcCpu // *** DIAGRAM 49 OF 1240 *** // Wavefunction(s) for diagram number 49 - FFV1_2( w_fp[3], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[46] ); - FFV1_1( w_fp[39], w_fp[7], COUPs[1], cIPD[0], cIPD[1], w_fp[47] ); + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[46] ); + FFV1_1( w_fp[39], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] ); // Amplitude(s) for diagram number 49 - FFV1_0( w_fp[46], w_fp[47], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[47], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1692,10 +1692,10 @@ namespace mg5amcCpu // *** DIAGRAM 50 OF 1240 *** // Wavefunction(s) for diagram number 50 - FFV1_2( w_fp[46], w_fp[7], COUPs[1], cIPD[0], cIPD[1], w_fp[48] ); + FFV1_2( w_fp[46], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] ); // Amplitude(s) for diagram number 50 - FFV1_0( w_fp[48], w_fp[39], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[48], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1709,7 +1709,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 51 - FFV1_0( w_fp[46], w_fp[39], w_fp[14], COUPs[1], &_fp[0] ); + FFV1_0( 
w_fp[46], w_fp[39], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1725,7 +1725,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 52 - FFV1_0( w_fp[41], w_fp[47], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1739,7 +1739,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 53 - FFV1_0( w_fp[42], w_fp[39], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[42], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1753,7 +1753,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 54 - FFV1_0( w_fp[41], w_fp[39], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[39], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1769,7 +1769,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 55 - FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1785,7 +1785,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 56 - FFV1_0( w_fp[34], w_fp[39], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1801,7 +1801,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 57 - FFV1_0( w_fp[3], w_fp[39], w_fp[28], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[28], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1818,11 +1818,11 @@ namespace mg5amcCpu // *** DIAGRAM 58 OF 1240 *** // Wavefunction(s) for diagram number 58 - FFV1_1( w_fp[2], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[47] ); - FFV1_1( w_fp[47], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[49] ); + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] ); + FFV1_1( w_fp[47], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[49] ); // Amplitude(s) for diagram number 58 - FFV1_0( w_fp[34], w_fp[49], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[49], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 58 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1833,10 +1833,10 @@ namespace mg5amcCpu // *** DIAGRAM 59 OF 1240 *** // Wavefunction(s) for diagram number 59 - FFV1_1( w_fp[47], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[50] ); + FFV1_1( w_fp[47], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[50] ); // Amplitude(s) for diagram number 59 - FFV1_0( w_fp[34], w_fp[50], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], 
w_fp[50], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1847,10 +1847,10 @@ namespace mg5amcCpu // *** DIAGRAM 60 OF 1240 *** // Wavefunction(s) for diagram number 60 - FFV1P0_3( w_fp[3], w_fp[47], COUPs[1], 0., 0., w_fp[51] ); + FFV1P0_3( w_fp[3], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[51] ); // Amplitude(s) for diagram number 60 - VVV1_0( w_fp[9], w_fp[51], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[9], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1870,7 +1870,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 61 - FFV1_0( w_fp[3], w_fp[50], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[50], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1886,7 +1886,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 62 - VVV1_0( w_fp[12], w_fp[51], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[12], w_fp[51], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1906,7 +1906,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 63 - FFV1_0( w_fp[3], w_fp[49], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[49], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1922,7 +1922,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 64 - FFV1_0( w_fp[3], w_fp[47], w_fp[15], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[15], COUPs[1], 1.0, &_fp[0] ); jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; @@ -1931,7 +1931,7 @@ namespace mg5amcCpu jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &_fp[0] ); jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1940,7 +1940,7 @@ namespace mg5amcCpu jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[17], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[17], COUPs[1], 1.0, &_fp[0] ); jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1953,10 +1953,10 @@ namespace mg5amcCpu // *** DIAGRAM 65 OF 1240 *** // Wavefunction(s) for diagram number 65 - FFV1_1( w_fp[47], w_fp[7], COUPs[1], cIPD[0], cIPD[1], w_fp[52] ); + FFV1_1( w_fp[47], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] ); // Amplitude(s) for diagram number 65 - FFV1_0( w_fp[46], w_fp[52], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( 
channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1970,7 +1970,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 66 - FFV1_0( w_fp[48], w_fp[47], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[48], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -1984,7 +1984,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 67 - FFV1_0( w_fp[46], w_fp[47], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[47], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2000,7 +2000,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 68 - FFV1_0( w_fp[38], w_fp[52], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2014,7 +2014,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 69 - FFV1_0( w_fp[40], w_fp[47], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[40], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2028,7 +2028,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 70 - FFV1_0( w_fp[38], w_fp[47], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[47], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2044,7 +2044,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 71 - FFV1_0( w_fp[3], w_fp[52], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[52], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2060,7 +2060,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 72 - FFV1_0( w_fp[34], w_fp[47], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2076,7 +2076,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 73 - FFV1_0( w_fp[3], w_fp[47], w_fp[26], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[26], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2093,11 +2093,11 @@ namespace mg5amcCpu // *** DIAGRAM 74 OF 1240 *** // Wavefunction(s) for diagram number 74 - FFV1_1( w_fp[2], w_fp[7], COUPs[1], cIPD[0], cIPD[1], w_fp[52] ); - FFV1_2( w_fp[46], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[7] ); + FFV1_1( w_fp[2], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] ); + FFV1_2( w_fp[46], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); // Amplitude(s) for diagram number 74 - FFV1_0( 
w_fp[7], w_fp[52], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[7], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 74 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2108,10 +2108,10 @@ namespace mg5amcCpu // *** DIAGRAM 75 OF 1240 *** // Wavefunction(s) for diagram number 75 - FFV1_2( w_fp[46], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[53] ); + FFV1_2( w_fp[46], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[53] ); // Amplitude(s) for diagram number 75 - FFV1_0( w_fp[53], w_fp[52], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[53], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2122,10 +2122,10 @@ namespace mg5amcCpu // *** DIAGRAM 76 OF 1240 *** // Wavefunction(s) for diagram number 76 - FFV1P0_3( w_fp[46], w_fp[2], COUPs[1], 0., 0., w_fp[54] ); + FFV1P0_3( w_fp[46], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[54] ); // Amplitude(s) for diagram number 76 - VVV1_0( w_fp[12], w_fp[54], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[12], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2145,7 +2145,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 77 - FFV1_0( w_fp[53], w_fp[2], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[53], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2161,7 +2161,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 78 - VVV1_0( w_fp[14], w_fp[54], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[14], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2181,7 +2181,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 79 - FFV1_0( w_fp[7], w_fp[2], w_fp[14], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[7], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2197,7 +2197,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 80 - FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2206,7 +2206,7 @@ namespace mg5amcCpu jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2215,7 +2215,7 @@ namespace mg5amcCpu jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], 
w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2231,7 +2231,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 81 - FFV1_0( w_fp[46], w_fp[52], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[52], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2247,7 +2247,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 82 - FFV1_0( w_fp[48], w_fp[2], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[48], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2263,7 +2263,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 83 - FFV1_0( w_fp[46], w_fp[2], w_fp[25], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[25], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2280,10 +2280,10 @@ namespace mg5amcCpu // *** DIAGRAM 84 OF 1240 *** // Wavefunction(s) for diagram number 84 - FFV1_2( w_fp[38], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[25] ); + FFV1_2( w_fp[38], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[25] ); // Amplitude(s) for diagram number 84 - FFV1_0( w_fp[25], w_fp[52], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[25], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2294,10 +2294,10 @@ namespace mg5amcCpu // *** DIAGRAM 85 OF 1240 *** // Wavefunction(s) for diagram number 85 - FFV1_2( w_fp[38], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[48] ); + FFV1_2( w_fp[38], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] ); // Amplitude(s) for diagram number 85 - FFV1_0( w_fp[48], w_fp[52], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[48], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2308,10 +2308,10 @@ namespace mg5amcCpu // *** DIAGRAM 86 OF 1240 *** // Wavefunction(s) for diagram number 86 - FFV1P0_3( w_fp[38], w_fp[2], COUPs[1], 0., 0., w_fp[23] ); + FFV1P0_3( w_fp[38], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[23] ); // Amplitude(s) for diagram number 86 - VVV1_0( w_fp[9], w_fp[23], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[9], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2331,7 +2331,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 87 - FFV1_0( w_fp[48], w_fp[2], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[48], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2347,7 +2347,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 88 - VVV1_0( w_fp[14], w_fp[23], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[14], 
w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2367,7 +2367,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 89 - FFV1_0( w_fp[25], w_fp[2], w_fp[14], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[25], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2383,7 +2383,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 90 - FFV1_0( w_fp[38], w_fp[2], w_fp[18], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2392,7 +2392,7 @@ namespace mg5amcCpu jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[19], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2401,7 +2401,7 @@ namespace mg5amcCpu jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[20], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] ); jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2417,7 +2417,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 91 - FFV1_0( w_fp[38], w_fp[52], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[52], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2433,7 +2433,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 92 - FFV1_0( w_fp[40], w_fp[2], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[40], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2449,7 +2449,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 93 - FFV1_0( w_fp[38], w_fp[2], w_fp[28], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[28], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 93 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2466,10 +2466,10 @@ namespace mg5amcCpu // *** DIAGRAM 94 OF 1240 *** // Wavefunction(s) for diagram number 94 - FFV1_2( w_fp[41], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[28] ); + FFV1_2( w_fp[41], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[28] ); // Amplitude(s) for diagram number 94 - FFV1_0( w_fp[28], w_fp[52], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[28], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2480,10 +2480,10 @@ namespace mg5amcCpu // *** DIAGRAM 95 OF 1240 *** // 
Wavefunction(s) for diagram number 95 - FFV1_2( w_fp[41], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[40] ); + FFV1_2( w_fp[41], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] ); // Amplitude(s) for diagram number 95 - FFV1_0( w_fp[40], w_fp[52], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[40], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2494,10 +2494,10 @@ namespace mg5amcCpu // *** DIAGRAM 96 OF 1240 *** // Wavefunction(s) for diagram number 96 - FFV1P0_3( w_fp[41], w_fp[2], COUPs[1], 0., 0., w_fp[20] ); + FFV1P0_3( w_fp[41], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[20] ); // Amplitude(s) for diagram number 96 - VVV1_0( w_fp[9], w_fp[20], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[9], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2517,7 +2517,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 97 - FFV1_0( w_fp[40], w_fp[2], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[40], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2533,7 +2533,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 98 - VVV1_0( w_fp[12], w_fp[20], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[12], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2553,7 +2553,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 99 - FFV1_0( w_fp[28], w_fp[2], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[28], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2569,7 +2569,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 100 - FFV1_0( w_fp[41], w_fp[2], w_fp[15], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[15], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2578,7 +2578,7 @@ namespace mg5amcCpu jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2587,7 +2587,7 @@ namespace mg5amcCpu jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[17], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2603,7 +2603,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 101 - FFV1_0( w_fp[41], w_fp[52], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[52], 
w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2619,7 +2619,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 102 - FFV1_0( w_fp[42], w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[42], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2635,7 +2635,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 103 - FFV1_0( w_fp[41], w_fp[2], w_fp[26], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[26], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2652,10 +2652,10 @@ namespace mg5amcCpu // *** DIAGRAM 104 OF 1240 *** // Wavefunction(s) for diagram number 104 - FFV1_2( w_fp[3], w_fp[24], COUPs[1], cIPD[0], cIPD[1], w_fp[26] ); + FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[26] ); // Amplitude(s) for diagram number 104 - FFV1_0( w_fp[26], w_fp[52], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[26], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2668,10 +2668,10 @@ namespace mg5amcCpu // *** DIAGRAM 105 OF 1240 *** // Wavefunction(s) for diagram number 105 - VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 0., 0., w_fp[42] ); + VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[42] ); // Amplitude(s) for diagram number 105 - FFV1_0( w_fp[3], w_fp[52], w_fp[42], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[52], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2688,10 +2688,10 @@ namespace mg5amcCpu // *** DIAGRAM 106 OF 1240 *** // Wavefunction(s) for diagram number 106 - FFV1_1( w_fp[2], w_fp[24], COUPs[1], cIPD[0], cIPD[1], w_fp[17] ); + FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); // Amplitude(s) for diagram number 106 - FFV1_0( w_fp[34], w_fp[17], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2707,7 +2707,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 107 - FFV1_0( w_fp[34], w_fp[2], w_fp[42], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 107 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2727,7 +2727,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 108 - FFV1_0( w_fp[3], w_fp[17], w_fp[14], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[17], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2747,7 +2747,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 109 - FFV1_0( w_fp[26], w_fp[2], 
w_fp[14], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[26], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2764,10 +2764,10 @@ namespace mg5amcCpu // *** DIAGRAM 110 OF 1240 *** // Wavefunction(s) for diagram number 110 - FFV1_2( w_fp[3], w_fp[27], COUPs[1], cIPD[0], cIPD[1], w_fp[14] ); + FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); // Amplitude(s) for diagram number 110 - FFV1_0( w_fp[14], w_fp[52], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[14], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2780,10 +2780,10 @@ namespace mg5amcCpu // *** DIAGRAM 111 OF 1240 *** // Wavefunction(s) for diagram number 111 - VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 0., 0., w_fp[16] ); + VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] ); // Amplitude(s) for diagram number 111 - FFV1_0( w_fp[3], w_fp[52], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[52], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2800,10 +2800,10 @@ namespace mg5amcCpu // *** DIAGRAM 112 OF 1240 *** // Wavefunction(s) for diagram number 112 - FFV1_1( w_fp[2], w_fp[27], COUPs[1], cIPD[0], cIPD[1], w_fp[15] ); + FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); // Amplitude(s) for diagram number 112 - FFV1_0( w_fp[34], w_fp[15], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[15], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2819,7 +2819,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 113 - FFV1_0( w_fp[34], w_fp[2], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2839,7 +2839,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 114 - FFV1_0( w_fp[3], w_fp[15], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[15], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 114 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2859,7 +2859,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 115 - FFV1_0( w_fp[14], w_fp[2], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[14], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 115 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2876,10 +2876,10 @@ namespace mg5amcCpu // *** DIAGRAM 116 OF 1240 *** // Wavefunction(s) for diagram number 116 - FFV1_2( w_fp[3], w_fp[29], COUPs[1], cIPD[0], cIPD[1], w_fp[12] ); + FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); // Amplitude(s) for diagram number 116 - FFV1_0( w_fp[12], w_fp[52], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 116 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2892,10 +2892,10 @@ namespace mg5amcCpu // *** DIAGRAM 117 OF 1240 *** // Wavefunction(s) for diagram number 117 - VVV1P0_1( w_fp[4], w_fp[29], COUPs[0], 0., 0., w_fp[19] ); + VVV1P0_1( w_fp[4], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[19] ); // Amplitude(s) for diagram number 117 - FFV1_0( w_fp[3], w_fp[52], w_fp[19], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[52], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 117 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2912,10 +2912,10 @@ namespace mg5amcCpu // *** DIAGRAM 118 OF 1240 *** // Wavefunction(s) for diagram number 118 - FFV1_1( w_fp[2], w_fp[29], COUPs[1], cIPD[0], cIPD[1], w_fp[18] ); + FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[18] ); // Amplitude(s) for diagram number 118 - FFV1_0( w_fp[34], w_fp[18], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[18], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 118 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2931,7 +2931,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 119 - FFV1_0( w_fp[34], w_fp[2], w_fp[19], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 119 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2951,7 +2951,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 120 - FFV1_0( w_fp[3], w_fp[18], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[18], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 120 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2971,7 +2971,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 121 - FFV1_0( w_fp[12], w_fp[2], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 121 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -2991,7 +2991,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 122 - FFV1_0( w_fp[3], w_fp[52], w_fp[30], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[52], w_fp[30], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; @@ -3000,7 +3000,7 @@ namespace mg5amcCpu jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[52], w_fp[31], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[52], w_fp[31], COUPs[1], 1.0, &_fp[0] ); jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; @@ -3009,7 +3009,7 @@ namespace mg5amcCpu jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[52], w_fp[32], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[52], w_fp[32], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) 
* amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3025,7 +3025,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 123 - FFV1_0( w_fp[34], w_fp[2], w_fp[30], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[2], w_fp[30], COUPs[1], 1.0, &_fp[0] ); jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; @@ -3034,7 +3034,7 @@ namespace mg5amcCpu jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[34], w_fp[2], w_fp[31], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[2], w_fp[31], COUPs[1], 1.0, &_fp[0] ); jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3043,7 +3043,7 @@ namespace mg5amcCpu jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[34], w_fp[2], w_fp[32], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[2], w_fp[32], COUPs[1], 1.0, &_fp[0] ); jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -3056,13 +3056,13 @@ namespace mg5amcCpu // *** DIAGRAM 124 OF 1240 *** // Wavefunction(s) for diagram number 124 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[34] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[52] ); - FFV1_1( w_fp[34], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[9] ); - FFV1_2( w_fp[52], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[22] ); + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] ); + FFV1_1( w_fp[34], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + FFV1_2( w_fp[52], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] ); // Amplitude(s) for diagram number 124 - FFV1_0( w_fp[22], w_fp[9], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[22], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 124 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -3072,10 +3072,10 @@ namespace mg5amcCpu // *** DIAGRAM 125 OF 1240 *** // Wavefunction(s) for diagram number 125 - FFV1_2( w_fp[52], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[21] ); + FFV1_2( w_fp[52], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); // Amplitude(s) for diagram number 125 - FFV1_0( w_fp[21], w_fp[9], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 125 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -3085,11 +3085,11 @@ namespace mg5amcCpu // *** DIAGRAM 126 OF 1240 *** // Wavefunction(s) for diagram number 126 - FFV1_1( w_fp[34], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[55] ); - FFV1_2( w_fp[52], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[56] ); + FFV1_1( w_fp[34], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[55] ); + FFV1_2( w_fp[52], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[56] ); // Amplitude(s) for diagram number 126 - FFV1_0( w_fp[56], w_fp[55], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[56], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 126 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( 
amp_sv[0] ); @@ -3102,7 +3102,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 127 - FFV1_0( w_fp[21], w_fp[55], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 127 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -3112,10 +3112,10 @@ namespace mg5amcCpu // *** DIAGRAM 128 OF 1240 *** // Wavefunction(s) for diagram number 128 - FFV1_1( w_fp[34], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[57] ); + FFV1_1( w_fp[34], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[57] ); // Amplitude(s) for diagram number 128 - FFV1_0( w_fp[56], w_fp[57], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[56], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 128 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -3128,7 +3128,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 129 - FFV1_0( w_fp[22], w_fp[57], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[22], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 129 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -3138,10 +3138,10 @@ namespace mg5amcCpu // *** DIAGRAM 130 OF 1240 *** // Wavefunction(s) for diagram number 130 - FFV1P0_3( w_fp[52], w_fp[34], COUPs[1], 0., 0., w_fp[58] ); + FFV1P0_3( w_fp[52], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[58] ); // Amplitude(s) for diagram number 130 - VVV1_0( w_fp[24], w_fp[6], w_fp[58], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[24], w_fp[6], w_fp[58], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 130 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -3154,10 +3154,10 @@ namespace mg5amcCpu // *** DIAGRAM 131 OF 1240 *** // Wavefunction(s) for diagram number 131 - FFV1_1( w_fp[34], w_fp[24], COUPs[1], cIPD[0], cIPD[1], w_fp[59] ); + FFV1_1( w_fp[34], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] ); // Amplitude(s) for diagram number 131 - FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 131 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -3171,7 +3171,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 132 - FFV1_0( w_fp[52], w_fp[57], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[57], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 132 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -3185,7 +3185,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 133 - VVV1_0( w_fp[27], w_fp[5], w_fp[58], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[27], w_fp[5], w_fp[58], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 133 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -3198,10 +3198,10 @@ namespace mg5amcCpu // *** DIAGRAM 134 OF 1240 *** // Wavefunction(s) for diagram number 134 - FFV1_1( w_fp[34], w_fp[27], COUPs[1], cIPD[0], cIPD[1], w_fp[60] ); + FFV1_1( w_fp[34], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); // Amplitude(s) for diagram number 134 - 
-      FFV1_0( w_fp[52], w_fp[60], w_fp[5], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[52], w_fp[60], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 134 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3215,7 +3215,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 135
-      FFV1_0( w_fp[52], w_fp[55], w_fp[27], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[52], w_fp[55], w_fp[27], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 135 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3229,7 +3229,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 136
-      VVV1_0( w_fp[4], w_fp[29], w_fp[58], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[4], w_fp[29], w_fp[58], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 136 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3245,7 +3245,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 137
-      FFV1_0( w_fp[52], w_fp[9], w_fp[29], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[52], w_fp[9], w_fp[29], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 137 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3256,10 +3256,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 138 OF 1240 ***
 
       // Wavefunction(s) for diagram number 138
-      FFV1_1( w_fp[34], w_fp[29], COUPs[1], cIPD[0], cIPD[1], w_fp[58] );
+      FFV1_1( w_fp[34], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] );
 
       // Amplitude(s) for diagram number 138
-      FFV1_0( w_fp[52], w_fp[58], w_fp[4], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[52], w_fp[58], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 138 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3273,17 +3273,17 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 139
-      FFV1_0( w_fp[52], w_fp[34], w_fp[30], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[52], w_fp[34], w_fp[30], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[9] += amp_sv[0];
       jamp_sv[11] -= amp_sv[0];
       jamp_sv[17] -= amp_sv[0];
       jamp_sv[23] += amp_sv[0];
-      FFV1_0( w_fp[52], w_fp[34], w_fp[31], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[52], w_fp[34], w_fp[31], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[11] -= amp_sv[0];
       jamp_sv[15] += amp_sv[0];
       jamp_sv[17] -= amp_sv[0];
       jamp_sv[21] += amp_sv[0];
-      FFV1_0( w_fp[52], w_fp[34], w_fp[32], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[52], w_fp[34], w_fp[32], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[9] -= amp_sv[0];
       jamp_sv[15] += amp_sv[0];
       jamp_sv[21] += amp_sv[0];
@@ -3292,12 +3292,12 @@ namespace mg5amcCpu
       // *** DIAGRAM 140 OF 1240 ***
 
       // Wavefunction(s) for diagram number 140
-      VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 0., 0., w_fp[61] );
-      FFV1P0_3( w_fp[3], w_fp[34], COUPs[1], 0., 0., w_fp[62] );
-      VVV1P0_1( w_fp[61], w_fp[5], COUPs[0], 0., 0., w_fp[63] );
+      VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[61] );
+      FFV1P0_3( w_fp[3], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[62] );
+      VVV1P0_1( w_fp[61], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[63] );
 
       // Amplitude(s) for diagram number 140
-      VVV1_0( w_fp[62], w_fp[63], w_fp[6], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[62], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 140 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3314,10 +3314,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 141 OF 1240 ***
 
       // Wavefunction(s) for diagram number 141
-      VVV1P0_1( w_fp[61], w_fp[6], COUPs[0], 0., 0., w_fp[64] );
+      VVV1P0_1( w_fp[61], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[64] );
 
       // Amplitude(s) for diagram number 141
-      VVV1_0( w_fp[62], w_fp[64], w_fp[5], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[62], w_fp[64], w_fp[5], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 141 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3337,7 +3337,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 142
-      VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], &_fp[0] );
+      VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
       jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3346,7 +3346,7 @@ namespace mg5amcCpu
       jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], &_fp[0] );
+      VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
       jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3355,7 +3355,7 @@ namespace mg5amcCpu
       jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], &_fp[0] );
+      VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3368,10 +3368,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 143 OF 1240 ***
 
       // Wavefunction(s) for diagram number 143
-      FFV1_2( w_fp[3], w_fp[61], COUPs[1], cIPD[0], cIPD[1], w_fp[65] );
+      FFV1_2( w_fp[3], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[65] );
 
       // Amplitude(s) for diagram number 143
-      FFV1_0( w_fp[65], w_fp[55], w_fp[6], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[65], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 143 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3385,7 +3385,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 144
-      FFV1_0( w_fp[3], w_fp[55], w_fp[64], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[55], w_fp[64], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 144 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3401,7 +3401,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 145
-      FFV1_0( w_fp[65], w_fp[57], w_fp[5], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[65], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 145 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3415,7 +3415,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 146
-      FFV1_0( w_fp[3], w_fp[57], w_fp[63], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[57], w_fp[63], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 146 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3428,10 +3428,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 147 OF 1240 ***
 
       // Wavefunction(s) for diagram number 147
-      FFV1_1( w_fp[34], w_fp[61], COUPs[1], cIPD[0], cIPD[1], w_fp[66] );
+      FFV1_1( w_fp[34], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] );
 
       // Amplitude(s) for diagram number 147
-      FFV1_0( w_fp[38], w_fp[66], w_fp[6], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[38], w_fp[66], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 147 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3442,10 +3442,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 148 OF 1240 ***
 
       // Wavefunction(s) for diagram number 148
-      FFV1P0_3( w_fp[38], w_fp[34], COUPs[1], 0., 0., w_fp[67] );
+      FFV1P0_3( w_fp[38], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[67] );
 
       // Amplitude(s) for diagram number 148
-      VVV1_0( w_fp[61], w_fp[6], w_fp[67], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[61], w_fp[6], w_fp[67], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 148 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3461,7 +3461,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 149
-      FFV1_0( w_fp[38], w_fp[57], w_fp[61], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[38], w_fp[57], w_fp[61], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 149 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3475,7 +3475,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 150
-      FFV1_0( w_fp[41], w_fp[66], w_fp[5], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[41], w_fp[66], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 150 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3486,10 +3486,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 151 OF 1240 ***
 
       // Wavefunction(s) for diagram number 151
-      FFV1P0_3( w_fp[41], w_fp[34], COUPs[1], 0., 0., w_fp[68] );
+      FFV1P0_3( w_fp[41], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[68] );
 
       // Amplitude(s) for diagram number 151
-      VVV1_0( w_fp[61], w_fp[5], w_fp[68], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[61], w_fp[5], w_fp[68], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 151 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3505,7 +3505,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 152
-      FFV1_0( w_fp[41], w_fp[55], w_fp[61], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[41], w_fp[55], w_fp[61], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 152 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3519,7 +3519,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 153
-      FFV1_0( w_fp[3], w_fp[66], w_fp[29], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[66], w_fp[29], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 153 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3535,7 +3535,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 154
-      VVV1_0( w_fp[61], w_fp[29], w_fp[62], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[61], w_fp[29], w_fp[62], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 154 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3555,7 +3555,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 155
-      FFV1_0( w_fp[3], w_fp[58], w_fp[61], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[58], w_fp[61], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 155 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3568,11 +3568,11 @@ namespace mg5amcCpu
       // *** DIAGRAM 156 OF 1240 ***
 
       // Wavefunction(s) for diagram number 156
-      VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 0., 0., w_fp[66] );
-      VVV1P0_1( w_fp[66], w_fp[4], COUPs[0], 0., 0., w_fp[69] );
+      VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[66] );
+      VVV1P0_1( w_fp[66], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[69] );
 
       // Amplitude(s) for diagram number 156
-      VVV1_0( w_fp[62], w_fp[69], w_fp[6], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[62], w_fp[69], w_fp[6], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 156 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3589,10 +3589,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 157 OF 1240 ***
 
       // Wavefunction(s) for diagram number 157
-      VVV1P0_1( w_fp[66], w_fp[6], COUPs[0], 0., 0., w_fp[70] );
+      VVV1P0_1( w_fp[66], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[70] );
 
       // Amplitude(s) for diagram number 157
-      VVV1_0( w_fp[62], w_fp[70], w_fp[4], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[62], w_fp[70], w_fp[4], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 157 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3612,7 +3612,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 158
-      VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], &_fp[0] );
+      VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3621,7 +3621,7 @@ namespace mg5amcCpu
       jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], &_fp[0] );
+      VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3630,7 +3630,7 @@ namespace mg5amcCpu
       jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], &_fp[0] );
+      VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
       jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3643,10 +3643,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 159 OF 1240 ***
 
       // Wavefunction(s) for diagram number 159
-      FFV1_2( w_fp[3], w_fp[66], COUPs[1], cIPD[0], cIPD[1], w_fp[71] );
+      FFV1_2( w_fp[3], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] );
 
       // Amplitude(s) for diagram number 159
-      FFV1_0( w_fp[71], w_fp[9], w_fp[6], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[71], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 159 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3660,7 +3660,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 160
-      FFV1_0( w_fp[3], w_fp[9], w_fp[70], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[9], w_fp[70], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 160 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3676,7 +3676,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 161
-      FFV1_0( w_fp[71], w_fp[57], w_fp[4], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[71], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 161 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3690,7 +3690,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 162
-      FFV1_0( w_fp[3], w_fp[57], w_fp[69], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[57], w_fp[69], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 162 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3703,10 +3703,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 163 OF 1240 ***
 
       // Wavefunction(s) for diagram number 163
-      FFV1_1( w_fp[34], w_fp[66], COUPs[1], cIPD[0], cIPD[1], w_fp[72] );
+      FFV1_1( w_fp[34], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] );
 
       // Amplitude(s) for diagram number 163
-      FFV1_0( w_fp[46], w_fp[72], w_fp[6], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[46], w_fp[72], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 163 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3717,10 +3717,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 164 OF 1240 ***
 
       // Wavefunction(s) for diagram number 164
-      FFV1P0_3( w_fp[46], w_fp[34], COUPs[1], 0., 0., w_fp[73] );
+      FFV1P0_3( w_fp[46], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[73] );
 
       // Amplitude(s) for diagram number 164
-      VVV1_0( w_fp[66], w_fp[6], w_fp[73], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[66], w_fp[6], w_fp[73], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 164 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3736,7 +3736,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 165
-      FFV1_0( w_fp[46], w_fp[57], w_fp[66], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[46], w_fp[57], w_fp[66], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 165 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3750,7 +3750,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 166
-      FFV1_0( w_fp[41], w_fp[72], w_fp[4], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[41], w_fp[72], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 166 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3764,7 +3764,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 167
-      VVV1_0( w_fp[66], w_fp[4], w_fp[68], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[66], w_fp[4], w_fp[68], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 167 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3780,7 +3780,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 168
-      FFV1_0( w_fp[41], w_fp[9], w_fp[66], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[41], w_fp[9], w_fp[66], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 168 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3794,7 +3794,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 169
-      FFV1_0( w_fp[3], w_fp[72], w_fp[27], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[72], w_fp[27], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 169 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3810,7 +3810,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 170
-      VVV1_0( w_fp[66], w_fp[27], w_fp[62], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[66], w_fp[27], w_fp[62], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 170 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3830,7 +3830,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 171
-      FFV1_0( w_fp[3], w_fp[60], w_fp[66], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[60], w_fp[66], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 171 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3843,11 +3843,11 @@ namespace mg5amcCpu
       // *** DIAGRAM 172 OF 1240 ***
 
       // Wavefunction(s) for diagram number 172
-      VVV1P0_1( w_fp[1], w_fp[6], COUPs[0], 0., 0., w_fp[72] );
-      VVV1P0_1( w_fp[72], w_fp[4], COUPs[0], 0., 0., w_fp[74] );
+      VVV1P0_1( w_fp[1], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[72] );
+      VVV1P0_1( w_fp[72], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[74] );
 
       // Amplitude(s) for diagram number 172
-      VVV1_0( w_fp[62], w_fp[74], w_fp[5], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[62], w_fp[74], w_fp[5], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 172 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3864,10 +3864,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 173 OF 1240 ***
 
       // Wavefunction(s) for diagram number 173
-      VVV1P0_1( w_fp[72], w_fp[5], COUPs[0], 0., 0., w_fp[75] );
+      VVV1P0_1( w_fp[72], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[75] );
 
       // Amplitude(s) for diagram number 173
-      VVV1_0( w_fp[62], w_fp[75], w_fp[4], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[62], w_fp[75], w_fp[4], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 173 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3887,7 +3887,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 174
-      VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], &_fp[0] );
+      VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &_fp[0] );
       jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3896,7 +3896,7 @@ namespace mg5amcCpu
       jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], &_fp[0] );
+      VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &_fp[0] );
       jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3905,7 +3905,7 @@ namespace mg5amcCpu
       jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], &_fp[0] );
+      VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &_fp[0] );
       jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3918,10 +3918,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 175 OF 1240 ***
 
       // Wavefunction(s) for diagram number 175
-      FFV1_2( w_fp[3], w_fp[72], COUPs[1], cIPD[0], cIPD[1], w_fp[76] );
+      FFV1_2( w_fp[3], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[76] );
 
       // Amplitude(s) for diagram number 175
-      FFV1_0( w_fp[76], w_fp[9], w_fp[5], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[76], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 175 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3935,7 +3935,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 176
-      FFV1_0( w_fp[3], w_fp[9], w_fp[75], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[9], w_fp[75], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 176 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3951,7 +3951,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 177
-      FFV1_0( w_fp[76], w_fp[55], w_fp[4], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[76], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 177 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3965,7 +3965,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 178
-      FFV1_0( w_fp[3], w_fp[55], w_fp[74], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[55], w_fp[74], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 178 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3978,10 +3978,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 179 OF 1240 ***
 
       // Wavefunction(s) for diagram number 179
-      FFV1_1( w_fp[34], w_fp[72], COUPs[1], cIPD[0], cIPD[1], w_fp[77] );
+      FFV1_1( w_fp[34], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
 
       // Amplitude(s) for diagram number 179
-      FFV1_0( w_fp[46], w_fp[77], w_fp[5], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[46], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 179 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -3995,7 +3995,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 180
-      VVV1_0( w_fp[72], w_fp[5], w_fp[73], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[72], w_fp[5], w_fp[73], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 180 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4011,7 +4011,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 181
-      FFV1_0( w_fp[46], w_fp[55], w_fp[72], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[46], w_fp[55], w_fp[72], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 181 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4025,7 +4025,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 182
-      FFV1_0( w_fp[38], w_fp[77], w_fp[4], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[38], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 182 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4039,7 +4039,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 183
-      VVV1_0( w_fp[72], w_fp[4], w_fp[67], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[72], w_fp[4], w_fp[67], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 183 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4055,7 +4055,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 184
-      FFV1_0( w_fp[38], w_fp[9], w_fp[72], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[38], w_fp[9], w_fp[72], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 184 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4069,7 +4069,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 185
-      FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 185 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4085,7 +4085,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 186
-      VVV1_0( w_fp[72], w_fp[24], w_fp[62], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[72], w_fp[24], w_fp[62], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 186 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4105,7 +4105,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 187
-      FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 187 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4118,10 +4118,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 188 OF 1240 ***
 
       // Wavefunction(s) for diagram number 188
-      FFV1_1( w_fp[34], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[77] );
+      FFV1_1( w_fp[34], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
 
       // Amplitude(s) for diagram number 188
-      FFV1_0( w_fp[7], w_fp[77], w_fp[6], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[7], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 188 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4134,7 +4134,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 189
-      FFV1_0( w_fp[53], w_fp[77], w_fp[5], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[53], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 189 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4144,10 +4144,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 190 OF 1240 ***
 
       // Wavefunction(s) for diagram number 190
-      FFV1_2( w_fp[46], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[78] );
+      FFV1_2( w_fp[46], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[78] );
 
       // Amplitude(s) for diagram number 190
-      FFV1_0( w_fp[78], w_fp[55], w_fp[6], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[78], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 190 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4160,7 +4160,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 191
-      FFV1_0( w_fp[53], w_fp[55], w_fp[1], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[53], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 191 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4173,7 +4173,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 192
-      FFV1_0( w_fp[78], w_fp[57], w_fp[5], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[78], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 192 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4186,7 +4186,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 193
-      FFV1_0( w_fp[7], w_fp[57], w_fp[1], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[7], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 193 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4199,7 +4199,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 194
-      FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 194 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4213,7 +4213,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 195
-      VVV1_0( w_fp[1], w_fp[29], w_fp[73], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[1], w_fp[29], w_fp[73], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 195 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4229,7 +4229,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 196
-      FFV1_0( w_fp[46], w_fp[58], w_fp[1], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[46], w_fp[58], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 196 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4243,7 +4243,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 197
-      FFV1_0( w_fp[25], w_fp[77], w_fp[6], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[25], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 197 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4256,7 +4256,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 198
-      FFV1_0( w_fp[48], w_fp[77], w_fp[4], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[48], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 198 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4266,10 +4266,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 199 OF 1240 ***
 
       // Wavefunction(s) for diagram number 199
-      FFV1_2( w_fp[38], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[58] );
+      FFV1_2( w_fp[38], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] );
 
       // Amplitude(s) for diagram number 199
-      FFV1_0( w_fp[58], w_fp[9], w_fp[6], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[58], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 199 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4282,7 +4282,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 200
-      FFV1_0( w_fp[48], w_fp[9], w_fp[1], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[48], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 200 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4295,7 +4295,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 201
-      FFV1_0( w_fp[58], w_fp[57], w_fp[4], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[58], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 201 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4308,7 +4308,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 202
-      FFV1_0( w_fp[25], w_fp[57], w_fp[1], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[25], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 202 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4321,7 +4321,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 203
-      FFV1_0( w_fp[38], w_fp[77], w_fp[27], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[38], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 203 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4335,7 +4335,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 204
-      VVV1_0( w_fp[1], w_fp[27], w_fp[67], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[1], w_fp[27], w_fp[67], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 204 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4351,7 +4351,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 205
-      FFV1_0( w_fp[38], w_fp[60], w_fp[1], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[38], w_fp[60], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 205 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4365,7 +4365,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 206
-      FFV1_0( w_fp[28], w_fp[77], w_fp[5], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[28], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 206 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4378,7 +4378,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 207
-      FFV1_0( w_fp[40], w_fp[77], w_fp[4], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[40], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 207 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4388,10 +4388,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 208 OF 1240 ***
 
       // Wavefunction(s) for diagram number 208
-      FFV1_2( w_fp[41], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[60] );
+      FFV1_2( w_fp[41], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] );
 
       // Amplitude(s) for diagram number 208
-      FFV1_0( w_fp[60], w_fp[9], w_fp[5], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[60], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 208 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4404,7 +4404,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 209
-      FFV1_0( w_fp[40], w_fp[9], w_fp[1], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[40], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 209 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4417,7 +4417,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 210
-      FFV1_0( w_fp[60], w_fp[55], w_fp[4], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[60], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 210 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4430,7 +4430,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 211
-      FFV1_0( w_fp[28], w_fp[55], w_fp[1], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[28], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 211 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4443,7 +4443,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 212
-      FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 212 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4457,7 +4457,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 213
-      VVV1_0( w_fp[1], w_fp[24], w_fp[68], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[1], w_fp[24], w_fp[68], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 213 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4473,7 +4473,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 214
-      FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 214 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4487,7 +4487,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 215
-      FFV1_0( w_fp[26], w_fp[77], w_fp[6], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[26], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 215 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4501,7 +4501,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 216
-      FFV1_0( w_fp[3], w_fp[77], w_fp[42], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[77], w_fp[42], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 216 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4514,10 +4514,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 217 OF 1240 ***
 
       // Wavefunction(s) for diagram number 217
-      VVV1P0_1( w_fp[1], w_fp[24], COUPs[0], 0., 0., w_fp[59] );
+      VVV1P0_1( w_fp[1], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[59] );
 
       // Amplitude(s) for diagram number 217
-      VVV1_0( w_fp[62], w_fp[59], w_fp[6], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[62], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 217 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4537,7 +4537,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 218
-      VVV1_0( w_fp[62], w_fp[1], w_fp[42], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[62], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 218 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4557,7 +4557,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 219
-      VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], &_fp[0] );
+      VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
       jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
@@ -4566,7 +4566,7 @@ namespace mg5amcCpu
       jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], &_fp[0] );
+      VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
       jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
@@ -4575,7 +4575,7 @@ namespace mg5amcCpu
       jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], &_fp[0] );
+      VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -4591,7 +4591,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 220
-      FFV1_0( w_fp[3], w_fp[57], w_fp[59], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[57], w_fp[59], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 220 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4607,7 +4607,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 221
-      FFV1_0( w_fp[26], w_fp[57], w_fp[1], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[26], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 221 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4621,7 +4621,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 222
-      FFV1_0( w_fp[14], w_fp[77], w_fp[5], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[14], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 222 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4635,7 +4635,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 223
-      FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 223 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4648,10 +4648,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 224 OF 1240 ***
 
       // Wavefunction(s) for diagram number 224
-      VVV1P0_1( w_fp[1], w_fp[27], COUPs[0], 0., 0., w_fp[68] );
+      VVV1P0_1( w_fp[1], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[68] );
 
       // Amplitude(s) for diagram number 224
-      VVV1_0( w_fp[62], w_fp[68], w_fp[5], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[62], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 224 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4671,7 +4671,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 225
-      VVV1_0( w_fp[62], w_fp[1], w_fp[16], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[62], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 225 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4691,7 +4691,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 226
-      VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], &_fp[0] );
+      VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &_fp[0] );
       jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
@@ -4700,7 +4700,7 @@ namespace mg5amcCpu
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], &_fp[0] );
+      VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &_fp[0] );
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
@@ -4709,7 +4709,7 @@ namespace mg5amcCpu
       jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], &_fp[0] );
+      VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &_fp[0] );
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
@@ -4725,7 +4725,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 227
-      FFV1_0( w_fp[3], w_fp[55], w_fp[68], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[55], w_fp[68], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 227 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4741,7 +4741,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 228
-      FFV1_0( w_fp[14], w_fp[55], w_fp[1], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[14], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 228 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4755,7 +4755,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 229
-      FFV1_0( w_fp[12], w_fp[77], w_fp[4], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[12], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 229 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4769,7 +4769,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 230
-      FFV1_0( w_fp[3], w_fp[77], w_fp[19], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[77], w_fp[19], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 230 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4782,10 +4782,10 @@ namespace mg5amcCpu
       // *** DIAGRAM 231 OF 1240 ***
 
       // Wavefunction(s) for diagram number 231
-      VVV1P0_1( w_fp[1], w_fp[29], COUPs[0], 0., 0., w_fp[67] );
+      VVV1P0_1( w_fp[1], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[67] );
 
       // Amplitude(s) for diagram number 231
-      VVV1_0( w_fp[62], w_fp[67], w_fp[4], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[62], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 231 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4805,7 +4805,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 232
-      VVV1_0( w_fp[62], w_fp[1], w_fp[19], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[62], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 232 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4825,7 +4825,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 233
-      VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], &_fp[0] );
+      VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &_fp[0] );
       jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
@@ -4834,7 +4834,7 @@ namespace mg5amcCpu
       jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], &_fp[0] );
+      VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &_fp[0] );
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
@@ -4843,7 +4843,7 @@ namespace mg5amcCpu
       jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], &_fp[0] );
+      VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &_fp[0] );
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -4859,7 +4859,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 234
-      FFV1_0( w_fp[3], w_fp[9], w_fp[67], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[9], w_fp[67], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 234 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4875,7 +4875,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 235
-      FFV1_0( w_fp[12], w_fp[9], w_fp[1], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[12], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       if( channelId == 235 ) numerators_sv += cxabs2( amp_sv[0] );
       if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -4886,12 +4886,12 @@ namespace mg5amcCpu
       // *** DIAGRAM 236 OF 1240 ***
 
       // Wavefunction(s) for diagram number 236
-      VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[73] );
-      VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[79] );
-      VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[80] );
+      VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[73] );
+      VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[79] );
+      VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[80] );
 
       // Amplitude(s) for diagram number 236
-      VVV1_0( w_fp[73], w_fp[6], w_fp[62], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[73], w_fp[6], w_fp[62], COUPs[0], 1.0, &_fp[0] );
       jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
@@ -4900,7 +4900,7 @@ namespace mg5amcCpu
       jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[79], w_fp[6], w_fp[62], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[79], w_fp[6], w_fp[62], COUPs[0], 1.0, &_fp[0] );
       jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
@@ -4909,7 +4909,7 @@ namespace mg5amcCpu
       jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[80], w_fp[6], w_fp[62], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[80], w_fp[6], w_fp[62], COUPs[0], 1.0, &_fp[0] );
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -4925,17 +4925,17 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 237
-      FFV1_0( w_fp[3], w_fp[57], w_fp[73], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[57], w_fp[73], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[18] += amp_sv[0];
       jamp_sv[19] -= amp_sv[0];
       jamp_sv[21] -= amp_sv[0];
       jamp_sv[23] += amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[57], w_fp[79], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[57], w_fp[79], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[19] -= amp_sv[0];
       jamp_sv[20] += amp_sv[0];
       jamp_sv[21] -= amp_sv[0];
       jamp_sv[22] += amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[57], w_fp[80], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[57], w_fp[80], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[18] -= amp_sv[0];
       jamp_sv[20] += amp_sv[0];
       jamp_sv[22] += amp_sv[0];
@@ -4947,17 +4947,17 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 238
-      FFV1_0( w_fp[41], w_fp[34], w_fp[73], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[41], w_fp[34], w_fp[73], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[0] += amp_sv[0];
       jamp_sv[2] -= amp_sv[0];
       jamp_sv[8] -= amp_sv[0];
       jamp_sv[14] += amp_sv[0];
-      FFV1_0( w_fp[41], w_fp[34], w_fp[79], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[41], w_fp[34], w_fp[79], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[2] -= amp_sv[0];
       jamp_sv[6] += amp_sv[0];
       jamp_sv[8] -= amp_sv[0];
       jamp_sv[12] += amp_sv[0];
-      FFV1_0( w_fp[41], w_fp[34], w_fp[80], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[41], w_fp[34], w_fp[80], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[0] -= amp_sv[0];
       jamp_sv[6] += amp_sv[0];
       jamp_sv[12] += amp_sv[0];
@@ -4966,12 +4966,12 @@ namespace mg5amcCpu
       // *** DIAGRAM 239 OF 1240 ***
 
       // Wavefunction(s) for diagram number 239
-      VVVV1P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[57] );
-      VVVV3P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[81] );
-      VVVV4P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[82] );
+      VVVV1P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[57] );
+      VVVV3P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[81] );
+      VVVV4P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[82] );
 
       // Amplitude(s) for diagram number 239
-      VVV1_0( w_fp[57], w_fp[5], w_fp[62], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[57], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] );
       jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
@@ -4980,7 +4980,7 @@ namespace mg5amcCpu
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[81], w_fp[5], w_fp[62], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[81], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] );
       jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
@@ -4989,7 +4989,7 @@ namespace mg5amcCpu
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[82], w_fp[5], w_fp[62], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[82], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] );
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5005,17 +5005,17 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 240
-      FFV1_0( w_fp[3], w_fp[55], w_fp[57], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[55], w_fp[57], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[12] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
       jamp_sv[15] -= amp_sv[0];
       jamp_sv[17] += amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[55], w_fp[81], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[55], w_fp[81], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[13] -= amp_sv[0];
       jamp_sv[14] += amp_sv[0];
       jamp_sv[15] -= amp_sv[0];
       jamp_sv[16] += amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[55], w_fp[82], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[55], w_fp[82], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[12] -= amp_sv[0];
       jamp_sv[14] += amp_sv[0];
       jamp_sv[16] += amp_sv[0];
@@ -5027,17 +5027,17 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 241
-      FFV1_0( w_fp[38], w_fp[34], w_fp[57], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[38], w_fp[34], w_fp[57], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[1] += amp_sv[0];
       jamp_sv[4] -= amp_sv[0];
       jamp_sv[10] -= amp_sv[0];
       jamp_sv[20] += amp_sv[0];
-      FFV1_0( w_fp[38], w_fp[34], w_fp[81], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[38], w_fp[34], w_fp[81], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[4] -= amp_sv[0];
       jamp_sv[7] += amp_sv[0];
       jamp_sv[10] -= amp_sv[0];
       jamp_sv[18] += amp_sv[0];
-      FFV1_0( w_fp[38], w_fp[34], w_fp[82], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[38], w_fp[34], w_fp[82], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[1] -= amp_sv[0];
       jamp_sv[7] += amp_sv[0];
       jamp_sv[18] += amp_sv[0];
@@ -5046,12 +5046,12 @@ namespace mg5amcCpu
       // *** DIAGRAM 242 OF 1240 ***
 
       // Wavefunction(s) for diagram number 242
-      VVVV1P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[55] );
-      VVVV3P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[83] );
-      VVVV4P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[84] );
+      VVVV1P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[55] );
+      VVVV3P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[83] );
+      VVVV4P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[84] );
 
       // Amplitude(s) for diagram number 242
-      VVV1_0( w_fp[55], w_fp[4], w_fp[62], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[55], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] );
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
@@ -5060,7 +5060,7 @@ namespace mg5amcCpu
       jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[83], w_fp[4], w_fp[62], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[83], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] );
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
@@ -5069,7 +5069,7 @@ namespace mg5amcCpu
       jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[84], w_fp[4], w_fp[62], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[84], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] );
       jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
@@ -5085,17 +5085,17 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 243
-      FFV1_0( w_fp[3], w_fp[9], w_fp[55], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[9], w_fp[55], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[6] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
      jamp_sv[9] -= amp_sv[0];
       jamp_sv[11] += amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[9], w_fp[83], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[9], w_fp[83], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[7] -= amp_sv[0];
       jamp_sv[8] += amp_sv[0];
       jamp_sv[9] -= amp_sv[0];
       jamp_sv[10] += amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[9], w_fp[84], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[9], w_fp[84], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[6] -= amp_sv[0];
       jamp_sv[8] += amp_sv[0];
       jamp_sv[10] += amp_sv[0];
@@ -5107,17 +5107,17 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 244
-      FFV1_0( w_fp[46], w_fp[34], w_fp[55], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[46], w_fp[34], w_fp[55], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[3] += amp_sv[0];
       jamp_sv[5] -= amp_sv[0];
       jamp_sv[16] -= amp_sv[0];
       jamp_sv[22] += amp_sv[0];
-      FFV1_0( w_fp[46], w_fp[34], w_fp[83], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[46], w_fp[34], w_fp[83], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[5] -= amp_sv[0];
       jamp_sv[13] += amp_sv[0];
       jamp_sv[16] -= amp_sv[0];
       jamp_sv[19] += amp_sv[0];
-      FFV1_0( w_fp[46], w_fp[34], w_fp[84], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[46], w_fp[34], w_fp[84], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[3] -= amp_sv[0];
       jamp_sv[13] += amp_sv[0];
       jamp_sv[19] += amp_sv[0];
@@ -5129,17 +5129,17 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 245
-      FFV1_0( w_fp[3], w_fp[77], w_fp[30], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[77], w_fp[30], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[0] += amp_sv[0];
       jamp_sv[1] -= amp_sv[0];
       jamp_sv[3] -= amp_sv[0];
       jamp_sv[5] += amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[77], w_fp[31], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[77], w_fp[31], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[1] -= amp_sv[0];
       jamp_sv[2] += amp_sv[0];
       jamp_sv[3] -= amp_sv[0];
       jamp_sv[4] += amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[77], w_fp[32], COUPs[1], &_fp[0] );
+      FFV1_0( w_fp[3], w_fp[77], w_fp[32], COUPs[1], 1.0, &_fp[0] );
       jamp_sv[0] -= amp_sv[0];
       jamp_sv[2] += amp_sv[0];
       jamp_sv[4] += amp_sv[0];
@@ -5151,7 +5151,7 @@ namespace mg5amcCpu
       // (none)
 
       // Amplitude(s) for diagram number 246
-      VVV1_0( w_fp[1], w_fp[30], w_fp[62], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[1], w_fp[30], w_fp[62], COUPs[0], 1.0, &_fp[0] );
       jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
@@ -5160,7 +5160,7 @@ namespace mg5amcCpu
       jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[31], w_fp[62], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[1], w_fp[31], w_fp[62], COUPs[0], 1.0, &_fp[0] );
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
@@ -5169,7 +5169,7 @@ namespace mg5amcCpu
       jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[32], w_fp[62], COUPs[0], &_fp[0] );
+      VVV1_0( w_fp[1], w_fp[32], w_fp[62], COUPs[0], 1.0, &_fp[0] );
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5182,13 +5182,13 @@ namespace mg5amcCpu // *** DIAGRAM 247 OF 1240 *** // Wavefunction(s) for diagram number 247 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[62] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[77] ); - FFV1_2( w_fp[62], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[34] ); - FFV1_1( w_fp[77], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[9] ); + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] ); + FFV1_2( w_fp[62], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] ); + FFV1_1( w_fp[77], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); // Amplitude(s) for diagram number 247 - FFV1_0( w_fp[34], w_fp[9], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 247 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -5198,10 +5198,10 @@ namespace mg5amcCpu // *** DIAGRAM 248 OF 1240 *** // Wavefunction(s) for diagram number 248 - FFV1_1( w_fp[77], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[85] ); + FFV1_1( w_fp[77], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[85] ); // Amplitude(s) for diagram number 248 - FFV1_0( w_fp[34], w_fp[85], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 248 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -5211,11 +5211,11 @@ namespace mg5amcCpu // *** DIAGRAM 249 OF 1240 *** // Wavefunction(s) for diagram number 249 - FFV1_2( w_fp[62], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[86] ); - FFV1_1( w_fp[77], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[87] ); + FFV1_2( w_fp[62], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] ); + FFV1_1( w_fp[77], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[87] ); // Amplitude(s) for diagram number 249 - FFV1_0( w_fp[86], w_fp[87], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 249 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -5228,7 +5228,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 250 - FFV1_0( w_fp[86], w_fp[85], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 250 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -5238,10 +5238,10 @@ namespace mg5amcCpu // *** DIAGRAM 251 OF 1240 *** // Wavefunction(s) for diagram number 251 - FFV1_2( w_fp[62], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[88] ); + FFV1_2( w_fp[62], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] ); // Amplitude(s) for diagram number 251 - FFV1_0( w_fp[88], w_fp[87], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[88], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 251 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -5254,7 +5254,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 252 - FFV1_0( w_fp[88], w_fp[9], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[88], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 252 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -5264,10 +5264,10 @@ namespace mg5amcCpu // *** DIAGRAM 253 OF 1240 *** // Wavefunction(s) for diagram number 253 - FFV1P0_3( w_fp[62], w_fp[77], COUPs[1], 0., 0., w_fp[89] ); + FFV1P0_3( w_fp[62], w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[89] ); // Amplitude(s) for diagram number 253 - VVV1_0( w_fp[24], w_fp[6], w_fp[89], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[24], w_fp[6], w_fp[89], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 253 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -5280,10 +5280,10 @@ namespace mg5amcCpu // *** DIAGRAM 254 OF 1240 *** // Wavefunction(s) for diagram number 254 - FFV1_2( w_fp[62], w_fp[24], COUPs[1], cIPD[0], cIPD[1], w_fp[90] ); + FFV1_2( w_fp[62], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] ); // Amplitude(s) for diagram number 254 - FFV1_0( w_fp[90], w_fp[77], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 254 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -5297,7 +5297,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 255 - FFV1_0( w_fp[88], w_fp[77], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[88], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 255 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -5311,7 +5311,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 256 - VVV1_0( w_fp[27], w_fp[5], w_fp[89], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[27], w_fp[5], w_fp[89], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 256 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -5324,10 +5324,10 @@ namespace mg5amcCpu // *** DIAGRAM 257 OF 1240 *** // Wavefunction(s) for diagram number 257 - FFV1_2( w_fp[62], w_fp[27], COUPs[1], cIPD[0], cIPD[1], w_fp[91] ); + FFV1_2( w_fp[62], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] ); // Amplitude(s) for diagram number 257 - FFV1_0( w_fp[91], w_fp[77], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[91], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 257 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -5341,7 +5341,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 258 - FFV1_0( w_fp[86], w_fp[77], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 258 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -5355,7 +5355,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 259 - VVV1_0( w_fp[4], w_fp[29], w_fp[89], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[4], w_fp[29], w_fp[89], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 259 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -5371,7 +5371,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 260 - FFV1_0( w_fp[34], w_fp[77], w_fp[29], COUPs[1], &_fp[0] ); + 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 260 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5382,10 +5382,10 @@ namespace mg5amcCpu
 // *** DIAGRAM 261 OF 1240 ***

 // Wavefunction(s) for diagram number 261
- FFV1_2( w_fp[62], w_fp[29], COUPs[1], cIPD[0], cIPD[1], w_fp[89] );
+ FFV1_2( w_fp[62], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] );

 // Amplitude(s) for diagram number 261
- FFV1_0( w_fp[89], w_fp[77], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[89], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 261 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5399,17 +5399,17 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 262
- FFV1_0( w_fp[62], w_fp[77], w_fp[30], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[62], w_fp[77], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[33] += amp_sv[0];
 jamp_sv[35] -= amp_sv[0];
 jamp_sv[41] -= amp_sv[0];
 jamp_sv[47] += amp_sv[0];
- FFV1_0( w_fp[62], w_fp[77], w_fp[31], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[62], w_fp[77], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[35] -= amp_sv[0];
 jamp_sv[39] += amp_sv[0];
 jamp_sv[41] -= amp_sv[0];
 jamp_sv[45] += amp_sv[0];
- FFV1_0( w_fp[62], w_fp[77], w_fp[32], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[62], w_fp[77], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[33] -= amp_sv[0];
 jamp_sv[39] += amp_sv[0];
 jamp_sv[45] += amp_sv[0];
@@ -5418,10 +5418,10 @@ namespace mg5amcCpu
 // *** DIAGRAM 263 OF 1240 ***

 // Wavefunction(s) for diagram number 263
- FFV1P0_3( w_fp[62], w_fp[2], COUPs[1], 0., 0., w_fp[92] );
+ FFV1P0_3( w_fp[62], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[92] );

 // Amplitude(s) for diagram number 263
- VVV1_0( w_fp[92], w_fp[63], w_fp[6], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[92], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 263 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5441,7 +5441,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 264
- VVV1_0( w_fp[92], w_fp[64], w_fp[5], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[92], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 264 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5461,7 +5461,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 265
- VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], &amp_fp[0] );
+ VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
@@ -5470,7 +5470,7 @@ namespace mg5amcCpu
 jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], &amp_fp[0] );
+ VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
@@ -5479,7 +5479,7 @@ namespace mg5amcCpu
 jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], &amp_fp[0] );
+ VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5492,10 +5492,10 @@ namespace mg5amcCpu
 // *** DIAGRAM 266 OF 1240 ***

 // Wavefunction(s) for diagram number 266
- FFV1_1( w_fp[2], w_fp[61], COUPs[1], cIPD[0], cIPD[1], w_fp[93] );
+ FFV1_1( w_fp[2], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[93] );

 // Amplitude(s) for diagram number 266
- FFV1_0( w_fp[86], w_fp[93], w_fp[6], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[86], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 266 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5509,7 +5509,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 267
- FFV1_0( w_fp[86], w_fp[2], w_fp[64], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[86], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 267 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5525,7 +5525,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 268
- FFV1_0( w_fp[88], w_fp[93], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[88], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 268 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5539,7 +5539,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 269
- FFV1_0( w_fp[88], w_fp[2], w_fp[63], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[88], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 269 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5552,10 +5552,10 @@ namespace mg5amcCpu
 // *** DIAGRAM 270 OF 1240 ***

 // Wavefunction(s) for diagram number 270
- FFV1_2( w_fp[62], w_fp[61], COUPs[1], cIPD[0], cIPD[1], w_fp[94] );
+ FFV1_2( w_fp[62], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] );

 // Amplitude(s) for diagram number 270
- FFV1_0( w_fp[94], w_fp[39], w_fp[6], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[94], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 270 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5566,10 +5566,10 @@ namespace mg5amcCpu
 // *** DIAGRAM 271 OF 1240 ***

 // Wavefunction(s) for diagram number 271
- FFV1P0_3( w_fp[62], w_fp[39], COUPs[1], 0., 0., w_fp[95] );
+ FFV1P0_3( w_fp[62], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[95] );

 // Amplitude(s) for diagram number 271
- VVV1_0( w_fp[61], w_fp[6], w_fp[95], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[61], w_fp[6], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 271 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5585,7 +5585,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 272
- FFV1_0( w_fp[88], w_fp[39], w_fp[61], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[88], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 272 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5599,7 +5599,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 273
- FFV1_0( w_fp[94], w_fp[47], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[94], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 273 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5610,10 +5610,10 @@ namespace mg5amcCpu
 // *** DIAGRAM 274 OF 1240 ***

 // Wavefunction(s) for diagram number 274
- FFV1P0_3( w_fp[62], w_fp[47], COUPs[1], 0., 0., w_fp[96] );
+ FFV1P0_3( w_fp[62], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[96] );

 // Amplitude(s) for diagram number 274
- VVV1_0( w_fp[61], w_fp[5], w_fp[96], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[61], w_fp[5], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 274 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5629,7 +5629,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 275
- FFV1_0( w_fp[86], w_fp[47], w_fp[61], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[86], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 275 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5643,7 +5643,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 276
- FFV1_0( w_fp[94], w_fp[2], w_fp[29], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[94], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 276 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5659,7 +5659,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 277
- VVV1_0( w_fp[61], w_fp[29], w_fp[92], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[61], w_fp[29], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 277 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5679,7 +5679,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 278
- FFV1_0( w_fp[89], w_fp[2], w_fp[61], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[89], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 278 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5695,7 +5695,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 279
- VVV1_0( w_fp[92], w_fp[69], w_fp[6], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[92], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 279 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5715,7 +5715,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 280
- VVV1_0( w_fp[92], w_fp[70], w_fp[4], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[92], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 280 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5735,7 +5735,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 281
- VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], &amp_fp[0] );
+ VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5744,7 +5744,7 @@ namespace mg5amcCpu
 jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], &amp_fp[0] );
+ VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5753,7 +5753,7 @@ namespace mg5amcCpu
 jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], &amp_fp[0] );
+ VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5766,10 +5766,10 @@ namespace mg5amcCpu
 // *** DIAGRAM 282 OF 1240 ***

 // Wavefunction(s) for diagram number 282
- FFV1_1( w_fp[2], w_fp[66], COUPs[1], cIPD[0], cIPD[1], w_fp[94] );
+ FFV1_1( w_fp[2], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] );

 // Amplitude(s) for diagram number 282
- FFV1_0( w_fp[34], w_fp[94], w_fp[6], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[34], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 282 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5783,7 +5783,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 283
- FFV1_0( w_fp[34], w_fp[2], w_fp[70], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[34], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 283 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5799,7 +5799,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 284
- FFV1_0( w_fp[88], w_fp[94], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[88], w_fp[94], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 284 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5813,7 +5813,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 285
- FFV1_0( w_fp[88], w_fp[2], w_fp[69], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[88], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 285 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5826,10 +5826,10 @@ namespace mg5amcCpu
 // *** DIAGRAM 286 OF 1240 ***

 // Wavefunction(s) for diagram number 286
- FFV1_2( w_fp[62], w_fp[66], COUPs[1], cIPD[0], cIPD[1], w_fp[97] );
+ FFV1_2( w_fp[62], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] );

 // Amplitude(s) for diagram number 286
- FFV1_0( w_fp[97], w_fp[33], w_fp[6], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[97], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 286 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5840,10 +5840,10 @@ namespace mg5amcCpu
 // *** DIAGRAM 287 OF 1240 ***

 // Wavefunction(s) for diagram number 287
- FFV1P0_3( w_fp[62], w_fp[33], COUPs[1], 0., 0., w_fp[98] );
+ FFV1P0_3( w_fp[62], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[98] );

 // Amplitude(s) for diagram number 287
- VVV1_0( w_fp[66], w_fp[6], w_fp[98], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[66], w_fp[6], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 287 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5859,7 +5859,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 288
- FFV1_0( w_fp[88], w_fp[33], w_fp[66], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[88], w_fp[33], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 288 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5873,7 +5873,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 289
- FFV1_0( w_fp[97], w_fp[47], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[97], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 289 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5887,7 +5887,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 290
- VVV1_0( w_fp[66], w_fp[4], w_fp[96], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[66], w_fp[4], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 290 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5903,7 +5903,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 291
- FFV1_0( w_fp[34], w_fp[47], w_fp[66], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[34], w_fp[47], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 291 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5917,7 +5917,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 292
- FFV1_0( w_fp[97], w_fp[2], w_fp[27], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[97], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 292 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5933,7 +5933,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 293
- VVV1_0( w_fp[66], w_fp[27], w_fp[92], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[66], w_fp[27], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 293 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5953,7 +5953,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 294
- FFV1_0( w_fp[91], w_fp[2], w_fp[66], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[91], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 294 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5969,7 +5969,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 295
- VVV1_0( w_fp[92], w_fp[74], w_fp[5], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[92], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 295 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -5989,7 +5989,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 296
- VVV1_0( w_fp[92], w_fp[75], w_fp[4], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[92], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 296 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6009,7 +6009,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 297
- VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], &amp_fp[0] );
+ VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6018,7 +6018,7 @@ namespace mg5amcCpu
 jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], &amp_fp[0] );
+ VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6027,7 +6027,7 @@ namespace mg5amcCpu
 jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], &amp_fp[0] );
+ VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6040,10 +6040,10 @@ namespace mg5amcCpu
 // *** DIAGRAM 298 OF 1240 ***

 // Wavefunction(s) for diagram number 298
- FFV1_1( w_fp[2], w_fp[72], COUPs[1], cIPD[0], cIPD[1], w_fp[97] );
+ FFV1_1( w_fp[2], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] );

 // Amplitude(s) for diagram number 298
- FFV1_0( w_fp[34], w_fp[97], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[34], w_fp[97], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 298 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6057,7 +6057,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 299
- FFV1_0( w_fp[34], w_fp[2], w_fp[75], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[34], w_fp[2], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 299 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6073,7 +6073,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 300
- FFV1_0( w_fp[86], w_fp[97], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[86], w_fp[97], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 300 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6087,7 +6087,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 301
- FFV1_0( w_fp[86], w_fp[2], w_fp[74], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[86], w_fp[2], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 301 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6100,10 +6100,10 @@ namespace mg5amcCpu
 // *** DIAGRAM 302 OF 1240 ***

 // Wavefunction(s) for diagram number 302
- FFV1_2( w_fp[62], w_fp[72], COUPs[1], cIPD[0], cIPD[1], w_fp[99] );
+ FFV1_2( w_fp[62], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );

 // Amplitude(s) for diagram number 302
- FFV1_0( w_fp[99], w_fp[33], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 302 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6117,7 +6117,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 303
- VVV1_0( w_fp[72], w_fp[5], w_fp[98], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[72], w_fp[5], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 303 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6133,7 +6133,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 304
- FFV1_0( w_fp[86], w_fp[33], w_fp[72], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[86], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 304 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6147,7 +6147,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 305
- FFV1_0( w_fp[99], w_fp[39], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 305 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6161,7 +6161,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 306
- VVV1_0( w_fp[72], w_fp[4], w_fp[95], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[72], w_fp[4], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 306 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6177,7 +6177,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 307
- FFV1_0( w_fp[34], w_fp[39], w_fp[72], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[34], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 307 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6191,7 +6191,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 308
- FFV1_0( w_fp[99], w_fp[2], w_fp[24], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 308 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6207,7 +6207,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 309
- VVV1_0( w_fp[72], w_fp[24], w_fp[92], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[72], w_fp[24], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 309 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6227,7 +6227,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 310
- FFV1_0( w_fp[90], w_fp[2], w_fp[72], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[90], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 310 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6240,10 +6240,10 @@ namespace mg5amcCpu
 // *** DIAGRAM 311 OF 1240 ***

 // Wavefunction(s) for diagram number 311
- FFV1_2( w_fp[62], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[99] );
+ FFV1_2( w_fp[62], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );

 // Amplitude(s) for diagram number 311
- FFV1_0( w_fp[99], w_fp[35], w_fp[6], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 311 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6256,7 +6256,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 312
- FFV1_0( w_fp[99], w_fp[36], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 312 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6266,10 +6266,10 @@ namespace mg5amcCpu
 // *** DIAGRAM 313 OF 1240 ***

 // Wavefunction(s) for diagram number 313
- FFV1_1( w_fp[33], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[100] );
+ FFV1_1( w_fp[33], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[100] );

 // Amplitude(s) for diagram number 313
- FFV1_0( w_fp[86], w_fp[100], w_fp[6], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[86], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 313 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6282,7 +6282,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 314
- FFV1_0( w_fp[86], w_fp[36], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[86], w_fp[36], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 314 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6295,7 +6295,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 315
- FFV1_0( w_fp[88], w_fp[100], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[88], w_fp[100], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 315 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6308,7 +6308,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 316
- FFV1_0( w_fp[88], w_fp[35], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[88], w_fp[35], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 316 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6321,7 +6321,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 317
- FFV1_0( w_fp[99], w_fp[33], w_fp[29], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 317 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6335,7 +6335,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 318
- VVV1_0( w_fp[1], w_fp[29], w_fp[98], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[1], w_fp[29], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 318 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6351,7 +6351,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 319
- FFV1_0( w_fp[89], w_fp[33], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[89], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 319 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6365,7 +6365,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 320
- FFV1_0( w_fp[99], w_fp[43], w_fp[6], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 320 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6378,7 +6378,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 321
- FFV1_0( w_fp[99], w_fp[44], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 321 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6388,10 +6388,10 @@ namespace mg5amcCpu
 // *** DIAGRAM 322 OF 1240 ***

 // Wavefunction(s) for diagram number 322
- FFV1_1( w_fp[39], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[89] );
+ FFV1_1( w_fp[39], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] );

 // Amplitude(s) for diagram number 322
- FFV1_0( w_fp[34], w_fp[89], w_fp[6], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[34], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 322 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6404,7 +6404,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 323
- FFV1_0( w_fp[34], w_fp[44], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[34], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 323 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6417,7 +6417,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 324
- FFV1_0( w_fp[88], w_fp[89], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[88], w_fp[89], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 324 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6430,7 +6430,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 325
- FFV1_0( w_fp[88], w_fp[43], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[88], w_fp[43], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 325 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6443,7 +6443,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 326
- FFV1_0( w_fp[99], w_fp[39], w_fp[27], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 326 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6457,7 +6457,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 327
- VVV1_0( w_fp[1], w_fp[27], w_fp[95], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[1], w_fp[27], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 327 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6473,7 +6473,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 328
- FFV1_0( w_fp[91], w_fp[39], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[91], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 328 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6487,7 +6487,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 329
- FFV1_0( w_fp[99], w_fp[49], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 329 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6500,7 +6500,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 330
- FFV1_0( w_fp[99], w_fp[50], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 330 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6510,10 +6510,10 @@ namespace mg5amcCpu
 // *** DIAGRAM 331 OF 1240 ***

 // Wavefunction(s) for diagram number 331
- FFV1_1( w_fp[47], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[91] );
+ FFV1_1( w_fp[47], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] );

 // Amplitude(s) for diagram number 331
- FFV1_0( w_fp[34], w_fp[91], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[34], w_fp[91], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 331 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6526,7 +6526,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 332
- FFV1_0( w_fp[34], w_fp[50], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[34], w_fp[50], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 332 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6539,7 +6539,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 333
- FFV1_0( w_fp[86], w_fp[91], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[86], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 333 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6552,7 +6552,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 334
- FFV1_0( w_fp[86], w_fp[49], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[86], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 334 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6565,7 +6565,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 335
- FFV1_0( w_fp[99], w_fp[47], w_fp[24], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 335 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6579,7 +6579,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 336
- VVV1_0( w_fp[1], w_fp[24], w_fp[96], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[1], w_fp[24], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 336 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6595,7 +6595,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 337
- FFV1_0( w_fp[90], w_fp[47], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[90], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 337 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6609,7 +6609,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 338
- FFV1_0( w_fp[99], w_fp[17], w_fp[6], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 338 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6623,7 +6623,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 339
- FFV1_0( w_fp[99], w_fp[2], w_fp[42], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 339 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6639,7 +6639,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 340
- VVV1_0( w_fp[92], w_fp[59], w_fp[6], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[92], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 340 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6659,7 +6659,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 341
- VVV1_0( w_fp[92], w_fp[1], w_fp[42], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[92], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 341 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6679,7 +6679,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 342
- VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], &amp_fp[0] );
+ VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6688,7 +6688,7 @@ namespace mg5amcCpu
 jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], &amp_fp[0] );
+ VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6697,7 +6697,7 @@ namespace mg5amcCpu
 jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], &amp_fp[0] );
+ VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6713,7 +6713,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 343
- FFV1_0( w_fp[88], w_fp[2], w_fp[59], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[88], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 343 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6729,7 +6729,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 344
- FFV1_0( w_fp[88], w_fp[17], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[88], w_fp[17], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 344 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6743,7 +6743,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 345
- FFV1_0( w_fp[99], w_fp[15], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 345 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6757,7 +6757,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 346
- FFV1_0( w_fp[99], w_fp[2], w_fp[16], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 346 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6773,7 +6773,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 347
- VVV1_0( w_fp[92], w_fp[68], w_fp[5], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[92], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 347 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6793,7 +6793,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 348
- VVV1_0( w_fp[92], w_fp[1], w_fp[16], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[92], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 348 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6813,7 +6813,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 349
- VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], &amp_fp[0] );
+ VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6822,7 +6822,7 @@ namespace mg5amcCpu
 jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], &amp_fp[0] );
+ VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6831,7 +6831,7 @@ namespace mg5amcCpu
 jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], &amp_fp[0] );
+ VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6847,7 +6847,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 350
- FFV1_0( w_fp[86], w_fp[2], w_fp[68], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[86], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 350 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6863,7 +6863,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 351
- FFV1_0( w_fp[86], w_fp[15], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[86], w_fp[15], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 351 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6877,7 +6877,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 352
- FFV1_0( w_fp[99], w_fp[18], w_fp[4], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 352 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6891,7 +6891,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 353
- FFV1_0( w_fp[99], w_fp[2], w_fp[19], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 353 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6907,7 +6907,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 354
- VVV1_0( w_fp[92], w_fp[67], w_fp[4], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[92], w_fp[67], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 354 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6927,7 +6927,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 355
- VVV1_0( w_fp[92], w_fp[1], w_fp[19], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[92], w_fp[1], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 355 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6947,7 +6947,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 356
- VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], &amp_fp[0] );
+ VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6956,7 +6956,7 @@ namespace mg5amcCpu
 jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], &amp_fp[0] );
+ VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6965,7 +6965,7 @@ namespace mg5amcCpu
 jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], &amp_fp[0] );
+ VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6981,7 +6981,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 357
- FFV1_0( w_fp[34], w_fp[2], w_fp[67], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[34], w_fp[2], w_fp[67], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 357 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -6997,7 +6997,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 358
- FFV1_0( w_fp[34], w_fp[18], w_fp[1], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[34], w_fp[18], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 358 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7011,7 +7011,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 359
- VVV1_0( w_fp[73], w_fp[6], w_fp[92], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[73], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
@@ -7020,7 +7020,7 @@ namespace mg5amcCpu
 jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[79], w_fp[6], w_fp[92], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[79], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
@@ -7029,7 +7029,7 @@ namespace mg5amcCpu
 jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[80], w_fp[6], w_fp[92], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[80], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7045,17 +7045,17 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 360
- FFV1_0( w_fp[88], w_fp[2], w_fp[73], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[88], w_fp[2], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[33] += amp_sv[0];
 jamp_sv[39] -= amp_sv[0];
 jamp_sv[63] -= amp_sv[0];
 jamp_sv[87] += amp_sv[0];
- FFV1_0( w_fp[88], w_fp[2], w_fp[79], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[88], w_fp[2], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[39] -= amp_sv[0];
 jamp_sv[57] += amp_sv[0];
 jamp_sv[63] -= amp_sv[0];
 jamp_sv[81] += amp_sv[0];
- FFV1_0( w_fp[88], w_fp[2], w_fp[80], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[88], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[33] -= amp_sv[0];
 jamp_sv[57] += amp_sv[0];
 jamp_sv[81] += amp_sv[0];
@@ -7067,17 +7067,17 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 361
- FFV1_0( w_fp[62], w_fp[47], w_fp[73], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[62], w_fp[47], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[105] += amp_sv[0];
 jamp_sv[107] -= amp_sv[0];
 jamp_sv[113] -= amp_sv[0];
 jamp_sv[119] += amp_sv[0];
- FFV1_0( w_fp[62], w_fp[47], w_fp[79], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[62], w_fp[47], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[107] -= amp_sv[0];
 jamp_sv[111] += amp_sv[0];
 jamp_sv[113] -= amp_sv[0];
 jamp_sv[117] += amp_sv[0];
- FFV1_0( w_fp[62], w_fp[47], w_fp[80], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[62], w_fp[47], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[105] -= amp_sv[0];
 jamp_sv[111] += amp_sv[0];
 jamp_sv[117] += amp_sv[0];
@@ -7089,7 +7089,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 362
- VVV1_0( w_fp[57], w_fp[5], w_fp[92], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[57], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
@@ -7098,7 +7098,7 @@ namespace mg5amcCpu
 jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[81], w_fp[5], w_fp[92], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[81], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
@@ -7107,7 +7107,7 @@ namespace mg5amcCpu
 jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[82], w_fp[5], w_fp[92], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[82], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7123,17 +7123,17 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 363
- FFV1_0( w_fp[86], w_fp[2], w_fp[57], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[86], w_fp[2], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[35] += amp_sv[0];
 jamp_sv[45] -= amp_sv[0];
 jamp_sv[69] -= amp_sv[0];
 jamp_sv[111] += amp_sv[0];
- FFV1_0( w_fp[86], w_fp[2], w_fp[81], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[86], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[45] -= amp_sv[0];
 jamp_sv[59] += amp_sv[0];
 jamp_sv[69] -= amp_sv[0];
 jamp_sv[105] += amp_sv[0];
- FFV1_0( w_fp[86], w_fp[2], w_fp[82], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[86], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[35] -= amp_sv[0];
 jamp_sv[59] += amp_sv[0];
 jamp_sv[105] += amp_sv[0];
@@ -7145,17 +7145,17 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 364
- FFV1_0( w_fp[62], w_fp[39], w_fp[57], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[62], w_fp[39], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[81] += amp_sv[0];
 jamp_sv[83] -= amp_sv[0];
 jamp_sv[89] -= amp_sv[0];
 jamp_sv[95] += amp_sv[0];
- FFV1_0( w_fp[62], w_fp[39], w_fp[81], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[62], w_fp[39], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[83] -= amp_sv[0];
 jamp_sv[87] += amp_sv[0];
 jamp_sv[89] -= amp_sv[0];
 jamp_sv[93] += amp_sv[0];
- FFV1_0( w_fp[62], w_fp[39], w_fp[82], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[62], w_fp[39], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[81] -= amp_sv[0];
 jamp_sv[87] += amp_sv[0];
 jamp_sv[93] += amp_sv[0];
@@ -7167,7 +7167,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 365
- VVV1_0( w_fp[55], w_fp[4], w_fp[92], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[55], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
@@ -7176,7 +7176,7 @@ namespace mg5amcCpu
 jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[83], w_fp[4], w_fp[92], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[83], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
@@ -7185,7 +7185,7 @@ namespace mg5amcCpu
 jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[84], w_fp[4], w_fp[92], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[84], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
@@ -7201,17 +7201,17 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 366
- FFV1_0( w_fp[34], w_fp[2], w_fp[55], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[34], w_fp[2], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[41] += amp_sv[0];
 jamp_sv[47] -= amp_sv[0];
 jamp_sv[93] -= amp_sv[0];
 jamp_sv[117] += amp_sv[0];
- FFV1_0( w_fp[34], w_fp[2], w_fp[83], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[34], w_fp[2], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[47] -= amp_sv[0];
 jamp_sv[83] += amp_sv[0];
 jamp_sv[93] -= amp_sv[0];
 jamp_sv[107] += amp_sv[0];
- FFV1_0( w_fp[34], w_fp[2], w_fp[84], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[34], w_fp[2], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[41] -= amp_sv[0];
 jamp_sv[83] += amp_sv[0];
 jamp_sv[107] += amp_sv[0];
@@ -7223,17 +7223,17 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 367
- FFV1_0( w_fp[62], w_fp[33], w_fp[55], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[62], w_fp[33], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[57] += amp_sv[0];
 jamp_sv[59] -= amp_sv[0];
 jamp_sv[65] -= amp_sv[0];
 jamp_sv[71] += amp_sv[0];
- FFV1_0( w_fp[62], w_fp[33], w_fp[83], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[62], w_fp[33], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[59] -= amp_sv[0];
 jamp_sv[63] += amp_sv[0];
 jamp_sv[65] -= amp_sv[0];
 jamp_sv[69] += amp_sv[0];
- FFV1_0( w_fp[62], w_fp[33], w_fp[84], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[62], w_fp[33], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[57] -= amp_sv[0];
 jamp_sv[63] += amp_sv[0];
 jamp_sv[69] += amp_sv[0];
@@ -7245,17 +7245,17 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 368
- FFV1_0( w_fp[99], w_fp[2], w_fp[30], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[2], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[65] += amp_sv[0];
 jamp_sv[71] -= amp_sv[0];
 jamp_sv[95] -= amp_sv[0];
 jamp_sv[119] += amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[31], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[2], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[71] -= amp_sv[0];
 jamp_sv[89] += amp_sv[0];
 jamp_sv[95] -= amp_sv[0];
 jamp_sv[113] += amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[32], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[2], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[65] -= amp_sv[0];
 jamp_sv[89] += amp_sv[0];
 jamp_sv[113] += amp_sv[0];
@@ -7267,7 +7267,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 369
- VVV1_0( w_fp[1], w_fp[30], w_fp[92], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[1], w_fp[30], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
@@ -7276,7 +7276,7 @@ namespace mg5amcCpu
 jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[1], w_fp[31], w_fp[92], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[1], w_fp[31], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
@@ -7285,7 +7285,7 @@ namespace mg5amcCpu
 jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[1], w_fp[32], w_fp[92], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[1], w_fp[32], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7298,11 +7298,11 @@ namespace mg5amcCpu
 // *** DIAGRAM 370 OF 1240 ***

 // Wavefunction(s) for diagram number 370
- VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 0., 0., w_fp[92] );
- FFV1_2( w_fp[3], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[99] );
+ VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[92] );
+ FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );

 // Amplitude(s) for diagram number 370
- FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 370 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7316,7 +7316,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 371
- FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 371 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7327,11 +7327,11 @@ namespace mg5amcCpu
 // *** DIAGRAM 372 OF 1240 ***

 // Wavefunction(s) for diagram number 372
- VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 0., 0., w_fp[62] );
- FFV1P0_3( w_fp[3], w_fp[77], COUPs[1], 0., 0., w_fp[34] );
+ VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[62] );
+ FFV1P0_3( w_fp[3], w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[34] );

 // Amplitude(s) for diagram number 372
- VVV1_0( w_fp[62], w_fp[34], w_fp[6], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[62], w_fp[34], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 372 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7351,7 +7351,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 373
- FFV1_0( w_fp[3], w_fp[85], w_fp[62], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[85], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 373 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7364,10 +7364,10 @@ namespace mg5amcCpu
 // *** DIAGRAM 374 OF 1240 ***

 // Wavefunction(s) for diagram number 374
- VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 0., 0., w_fp[86] );
+ VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[86] );

 // Amplitude(s) for diagram number 374
- VVV1_0( w_fp[86], w_fp[34], w_fp[5], COUPs[0], &amp_fp[0] );
+ VVV1_0( w_fp[86], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 374 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7387,7 +7387,7 @@ namespace mg5amcCpu
 // (none)

 // Amplitude(s) for diagram number 375
- FFV1_0( w_fp[3], w_fp[9], w_fp[86], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[9], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 if( channelId == 375 ) numerators_sv += cxabs2( amp_sv[0] );
 if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7400,12 +7400,12 @@ namespace mg5amcCpu
 // *** DIAGRAM 376 OF 1240 ***

 // Wavefunction(s) for diagram number 376
- VVVV1P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[88] );
- VVVV3P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[90] );
- VVVV4P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[96] );
+ VVVV1P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] );
+ VVVV3P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] );
+ VVVV4P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] );

 // Amplitude(s) for diagram number 376
- FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
@@ -7414,7 +7414,7 @@ namespace mg5amcCpu
 jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[77], w_fp[90], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[3], w_fp[77], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
 jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7423,7 +7423,7 @@ namespace mg5amcCpu
 jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
amp_sv[0]; jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[96], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[96], COUPs[1], 1.0, &_fp[0] ); jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7436,10 +7436,10 @@ namespace mg5amcCpu // *** DIAGRAM 377 OF 1240 *** // Wavefunction(s) for diagram number 377 - FFV1_1( w_fp[77], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[95] ); + FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[95] ); // Amplitude(s) for diagram number 377 - FFV1_0( w_fp[38], w_fp[95], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[95], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 377 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -7450,10 +7450,10 @@ namespace mg5amcCpu // *** DIAGRAM 378 OF 1240 *** // Wavefunction(s) for diagram number 378 - FFV1_2( w_fp[38], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[98] ); + FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); // Amplitude(s) for diagram number 378 - FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 378 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -7467,7 +7467,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 379 - FFV1_0( w_fp[38], w_fp[77], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[77], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 379 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -7483,7 +7483,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 380 - FFV1_0( w_fp[41], w_fp[95], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[95], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 380 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -7494,10 +7494,10 @@ namespace mg5amcCpu // *** DIAGRAM 381 OF 1240 *** // Wavefunction(s) for diagram number 381 - FFV1_2( w_fp[41], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[101] ); + FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[101] ); // Amplitude(s) for diagram number 381 - FFV1_0( w_fp[101], w_fp[77], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[101], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 381 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -7511,7 +7511,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 382 - FFV1_0( w_fp[41], w_fp[77], w_fp[62], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[77], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 382 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -7527,7 +7527,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 383 - FFV1_0( w_fp[3], w_fp[95], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[95], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 383 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += 
@@ -7543,7 +7543,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 384
-    FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 384 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7556,10 +7556,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 385 OF 1240 ***

     // Wavefunction(s) for diagram number 385
-    VVV1P0_1( w_fp[92], w_fp[29], COUPs[0], 0., 0., w_fp[95] );
+    VVV1P0_1( w_fp[92], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[95] );

     // Amplitude(s) for diagram number 385
-    FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 385 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7576,10 +7576,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 386 OF 1240 ***

     // Wavefunction(s) for diagram number 386
-    FFV1_1( w_fp[2], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[102] );
+    FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );

     // Amplitude(s) for diagram number 386
-    FFV1_0( w_fp[22], w_fp[102], w_fp[6], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[22], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 386 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7593,7 +7593,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 387
-    FFV1_0( w_fp[21], w_fp[102], w_fp[5], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[21], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 387 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7604,10 +7604,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 388 OF 1240 ***

     // Wavefunction(s) for diagram number 388
-    FFV1P0_3( w_fp[52], w_fp[2], COUPs[1], 0., 0., w_fp[103] );
+    FFV1P0_3( w_fp[52], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[103] );

     // Amplitude(s) for diagram number 388
-    VVV1_0( w_fp[62], w_fp[103], w_fp[6], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[62], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 388 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7627,7 +7627,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 389
-    FFV1_0( w_fp[21], w_fp[2], w_fp[62], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[21], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 389 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7643,7 +7643,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 390
-    VVV1_0( w_fp[86], w_fp[103], w_fp[5], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[86], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 390 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7663,7 +7663,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 391
-    FFV1_0( w_fp[22], w_fp[2], w_fp[86], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[22], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 391 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7679,7 +7679,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 392
-    FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
@@ -7688,7 +7688,7 @@ namespace mg5amcCpu
     jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[90], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7697,7 +7697,7 @@ namespace mg5amcCpu
     jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[96], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7710,10 +7710,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 393 OF 1240 ***

     // Wavefunction(s) for diagram number 393
-    FFV1_2( w_fp[52], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[104] );
+    FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] );

     // Amplitude(s) for diagram number 393
-    FFV1_0( w_fp[104], w_fp[39], w_fp[6], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[104], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 393 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7724,10 +7724,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 394 OF 1240 ***

     // Wavefunction(s) for diagram number 394
-    FFV1_1( w_fp[39], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[105] );
+    FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[105] );

     // Amplitude(s) for diagram number 394
-    FFV1_0( w_fp[52], w_fp[105], w_fp[6], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[52], w_fp[105], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 394 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7741,7 +7741,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 395
-    FFV1_0( w_fp[52], w_fp[39], w_fp[86], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[52], w_fp[39], w_fp[86], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 395 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7757,7 +7757,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 396
-    FFV1_0( w_fp[104], w_fp[47], w_fp[5], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[104], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 396 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7768,10 +7768,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 397 OF 1240 ***

     // Wavefunction(s) for diagram number 397
-    FFV1_1( w_fp[47], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[106] );
+    FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] );

     // Amplitude(s) for diagram number 397
-    FFV1_0( w_fp[52], w_fp[106], w_fp[5], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[52], w_fp[106], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 397 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7785,7 +7785,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 398
-    FFV1_0( w_fp[52], w_fp[47], w_fp[62], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[52], w_fp[47], w_fp[62], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 398 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7801,7 +7801,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 399
-    FFV1_0( w_fp[104], w_fp[2], w_fp[29], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[104], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 399 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7817,7 +7817,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 400
-    FFV1_0( w_fp[52], w_fp[102], w_fp[29], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[52], w_fp[102], w_fp[29], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 400 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7833,7 +7833,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 401
-    FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 401 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7853,7 +7853,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 402
-    FFV1_0( w_fp[71], w_fp[102], w_fp[6], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[71], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 402 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7869,7 +7869,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 403
-    FFV1_0( w_fp[3], w_fp[102], w_fp[70], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[102], w_fp[70], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 403 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7889,7 +7889,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 404
-    FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 404 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7905,7 +7905,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 405
-    FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 405 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7925,7 +7925,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 406
-    FFV1_0( w_fp[3], w_fp[94], w_fp[86], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[94], w_fp[86], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 406 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7945,7 +7945,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 407
-    FFV1_0( w_fp[71], w_fp[2], w_fp[86], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[71], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 407 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -7965,7 +7965,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 408
-    VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], &_fp[0] );
+    VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
     jamp_sv[6] += amp_sv[0];
     jamp_sv[8] -= amp_sv[0];
     jamp_sv[36] -= amp_sv[0];
@@ -7982,7 +7982,7 @@ namespace mg5amcCpu
     jamp_sv[109] -= amp_sv[0];
     jamp_sv[116] -= amp_sv[0];
     jamp_sv[117] += amp_sv[0];
-    VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], &_fp[0] );
+    VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
     jamp_sv[6] += amp_sv[0];
     jamp_sv[8] -= amp_sv[0];
     jamp_sv[10] -= amp_sv[0];
@@ -7999,7 +7999,7 @@ namespace mg5amcCpu
     jamp_sv[107] -= amp_sv[0];
     jamp_sv[116] -= amp_sv[0];
     jamp_sv[117] += amp_sv[0];
-    VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], &_fp[0] );
+    VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
     jamp_sv[10] -= amp_sv[0];
     jamp_sv[11] += amp_sv[0];
     jamp_sv[36] += amp_sv[0];
@@ -8020,10 +8020,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 409 OF 1240 ***

     // Wavefunction(s) for diagram number 409
-    VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 0., 0., w_fp[104] );
+    VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] );

     // Amplitude(s) for diagram number 409
-    VVV1_0( w_fp[8], w_fp[6], w_fp[104], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[8], w_fp[6], w_fp[104], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 409 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8048,10 +8048,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 410 OF 1240 ***

     // Wavefunction(s) for diagram number 410
-    VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 0., 0., w_fp[107] );
+    VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[107] );

     // Amplitude(s) for diagram number 410
-    VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 410 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8079,7 +8079,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 411
-    VVV1_0( w_fp[66], w_fp[8], w_fp[86], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[66], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 411 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8107,7 +8107,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 412
-    FFV1_0( w_fp[3], w_fp[47], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 412 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8127,7 +8127,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 413
-    FFV1_0( w_fp[3], w_fp[106], w_fp[66], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[106], w_fp[66], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 413 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8143,7 +8143,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 414
-    FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 414 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8159,7 +8159,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 415
-    FFV1_0( w_fp[41], w_fp[2], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[41], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 415 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8179,7 +8179,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 416
-    FFV1_0( w_fp[41], w_fp[102], w_fp[66], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[41], w_fp[102], w_fp[66], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 416 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8195,7 +8195,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 417
-    FFV1_0( w_fp[101], w_fp[2], w_fp[66], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[101], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 417 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8211,7 +8211,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 418
-    FFV1_0( w_fp[76], w_fp[102], w_fp[5], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[76], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 418 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8227,7 +8227,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 419
-    FFV1_0( w_fp[3], w_fp[102], w_fp[75], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[102], w_fp[75], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 419 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8247,7 +8247,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 420
-    FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 420 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8263,7 +8263,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 421
-    FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 421 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8283,7 +8283,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 422
-    FFV1_0( w_fp[3], w_fp[97], w_fp[62], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[97], w_fp[62], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 422 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8303,7 +8303,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 423
-    FFV1_0( w_fp[76], w_fp[2], w_fp[62], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[76], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 423 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8323,7 +8323,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 424
-    VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], &_fp[0] );
+    VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] );
     jamp_sv[7] += amp_sv[0];
     jamp_sv[10] -= amp_sv[0];
     jamp_sv[42] -= amp_sv[0];
@@ -8340,7 +8340,7 @@ namespace mg5amcCpu
     jamp_sv[93] += amp_sv[0];
     jamp_sv[102] += amp_sv[0];
     jamp_sv[104] -= amp_sv[0];
-    VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], &_fp[0] );
+    VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] );
     jamp_sv[7] += amp_sv[0];
     jamp_sv[8] -= amp_sv[0];
     jamp_sv[9] += amp_sv[0];
@@ -8357,7 +8357,7 @@ namespace mg5amcCpu
     jamp_sv[93] += amp_sv[0];
     jamp_sv[106] += amp_sv[0];
     jamp_sv[107] -= amp_sv[0];
-    VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], &_fp[0] );
+    VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] );
     jamp_sv[8] -= amp_sv[0];
     jamp_sv[9] += amp_sv[0];
     jamp_sv[42] += amp_sv[0];
@@ -8378,10 +8378,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 425 OF 1240 ***

     // Wavefunction(s) for diagram number 425
-    VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 0., 0., w_fp[104] );
+    VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[104] );

     // Amplitude(s) for diagram number 425
-    VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 425 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8409,7 +8409,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 426
-    VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 426 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8437,7 +8437,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 427
-    VVV1_0( w_fp[72], w_fp[8], w_fp[62], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[72], w_fp[8], w_fp[62], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 427 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8465,7 +8465,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 428
-    FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 428 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8485,7 +8485,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 429
-    FFV1_0( w_fp[3], w_fp[105], w_fp[72], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[105], w_fp[72], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 429 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8501,7 +8501,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 430
-    FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 430 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8517,7 +8517,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 431
-    FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 431 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8537,7 +8537,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 432
-    FFV1_0( w_fp[38], w_fp[102], w_fp[72], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[38], w_fp[102], w_fp[72], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 432 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8553,7 +8553,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 433
-    FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 433 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8566,10 +8566,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 434 OF 1240 ***

     // Wavefunction(s) for diagram number 434
-    VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 0., 0., w_fp[104] );
+    VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] );

     // Amplitude(s) for diagram number 434
-    VVV1_0( w_fp[104], w_fp[10], w_fp[6], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[104], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 434 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8597,7 +8597,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 435
-    VVV1_0( w_fp[104], w_fp[11], w_fp[5], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[104], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 435 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8625,7 +8625,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 436
-    VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], &_fp[0] );
+    VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] );
     jamp_sv[7] -= amp_sv[0];
     jamp_sv[25] += amp_sv[0];
     jamp_sv[31] -= amp_sv[0];
@@ -8642,7 +8642,7 @@ namespace mg5amcCpu
     jamp_sv[102] -= amp_sv[0];
     jamp_sv[104] += amp_sv[0];
     jamp_sv[108] -= amp_sv[0];
-    VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], &_fp[0] );
+    VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] );
     jamp_sv[6] -= amp_sv[0];
     jamp_sv[24] += amp_sv[0];
     jamp_sv[30] -= amp_sv[0];
@@ -8659,7 +8659,7 @@ namespace mg5amcCpu
     jamp_sv[116] += amp_sv[0];
     jamp_sv[117] -= amp_sv[0];
     jamp_sv[118] += amp_sv[0];
-    VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], &_fp[0] );
+    VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] );
     jamp_sv[6] -= amp_sv[0];
     jamp_sv[7] += amp_sv[0];
     jamp_sv[24] += amp_sv[0];
@@ -8680,10 +8680,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 437 OF 1240 ***

     // Wavefunction(s) for diagram number 437
-    VVV1P0_1( w_fp[1], w_fp[8], COUPs[0], 0., 0., w_fp[108] );
+    VVV1P0_1( w_fp[1], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[108] );

     // Amplitude(s) for diagram number 437
-    VVV1_0( w_fp[62], w_fp[108], w_fp[6], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[62], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 437 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8711,7 +8711,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 438
-    VVV1_0( w_fp[62], w_fp[1], w_fp[11], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[62], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 438 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8739,7 +8739,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 439
-    VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], &_fp[0] );
+    VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
     jamp_sv[9] += amp_sv[0];
     jamp_sv[24] -= amp_sv[0];
     jamp_sv[30] += amp_sv[0];
@@ -8756,7 +8756,7 @@ namespace mg5amcCpu
     jamp_sv[109] += amp_sv[0];
     jamp_sv[115] += amp_sv[0];
     jamp_sv[118] -= amp_sv[0];
-    VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], &_fp[0] );
+    VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
     jamp_sv[8] -= amp_sv[0];
     jamp_sv[9] += amp_sv[0];
     jamp_sv[42] += amp_sv[0];
@@ -8773,7 +8773,7 @@ namespace mg5amcCpu
     jamp_sv[104] += amp_sv[0];
     jamp_sv[106] += amp_sv[0];
     jamp_sv[107] -= amp_sv[0];
-    VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], &_fp[0] );
+    VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
     jamp_sv[8] -= amp_sv[0];
     jamp_sv[24] += amp_sv[0];
     jamp_sv[30] -= amp_sv[0];
@@ -8797,7 +8797,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 440
-    VVV1_0( w_fp[86], w_fp[108], w_fp[5], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[86], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 440 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8825,7 +8825,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 441
-    VVV1_0( w_fp[86], w_fp[1], w_fp[10], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[86], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 441 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -8853,7 +8853,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 442
-    VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], &_fp[0] );
+    VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &_fp[0] );
     jamp_sv[11] += amp_sv[0];
     jamp_sv[25] -= amp_sv[0];
     jamp_sv[31] += amp_sv[0];
@@ -8870,7 +8870,7 @@ namespace mg5amcCpu
     jamp_sv[94] -= amp_sv[0];
     jamp_sv[99] -= amp_sv[0];
     jamp_sv[109] += amp_sv[0];
-    VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], &_fp[0] );
+    VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &_fp[0] );
     jamp_sv[10] -= amp_sv[0];
     jamp_sv[11] += amp_sv[0];
     jamp_sv[36] += amp_sv[0];
@@ -8887,7 +8887,7 @@ namespace mg5amcCpu
     jamp_sv[99] -= amp_sv[0];
     jamp_sv[108] -= amp_sv[0];
     jamp_sv[109] += amp_sv[0];
-    VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], &_fp[0] );
+    VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &_fp[0] );
     jamp_sv[10] -= amp_sv[0];
     jamp_sv[25] += amp_sv[0];
     jamp_sv[31] -= amp_sv[0];
@@ -8908,12 +8908,12 @@ namespace mg5amcCpu
     // *** DIAGRAM 443 OF 1240 ***

     // Wavefunction(s) for diagram number 443
-    VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[109] );
-    VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[110] );
-    VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[111] );
+    VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
+    VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
+    VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] );

     // Amplitude(s) for diagram number 443
-    VVV1_0( w_fp[8], w_fp[6], w_fp[109], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[8], w_fp[6], w_fp[109], COUPs[0], 1.0, &_fp[0] );
     jamp_sv[6] += amp_sv[0];
     jamp_sv[8] -= amp_sv[0];
     jamp_sv[36] -= amp_sv[0];
@@ -8930,7 +8930,7 @@ namespace mg5amcCpu
     jamp_sv[109] -= amp_sv[0];
     jamp_sv[116] -= amp_sv[0];
     jamp_sv[117] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[6], w_fp[110], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[8], w_fp[6], w_fp[110], COUPs[0], 1.0, &_fp[0] );
     jamp_sv[8] -= amp_sv[0];
     jamp_sv[24] += amp_sv[0];
     jamp_sv[30] -= amp_sv[0];
@@ -8947,7 +8947,7 @@ namespace mg5amcCpu
     jamp_sv[109] -= amp_sv[0];
     jamp_sv[115] -= amp_sv[0];
     jamp_sv[118] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[6], w_fp[111], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[8], w_fp[6], w_fp[111], COUPs[0], 1.0, &_fp[0] );
     jamp_sv[6] -= amp_sv[0];
     jamp_sv[24] += amp_sv[0];
     jamp_sv[30] -= amp_sv[0];
@@ -8968,12 +8968,12 @@ namespace mg5amcCpu
     // *** DIAGRAM 444 OF 1240 ***

     // Wavefunction(s) for diagram number 444
-    VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 0., 0., w_fp[112] );
-    VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 0., 0., w_fp[113] );
-    VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 0., 0., w_fp[114] );
+    VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] );
+    VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[113] );
+    VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[114] );

     // Amplitude(s) for diagram number 444
-    VVV1_0( w_fp[8], w_fp[5], w_fp[112], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[8], w_fp[5], w_fp[112], COUPs[0], 1.0, &_fp[0] );
     jamp_sv[7] += amp_sv[0];
     jamp_sv[10] -= amp_sv[0];
     jamp_sv[42] -= amp_sv[0];
@@ -8990,7 +8990,7 @@ namespace mg5amcCpu
     jamp_sv[93] += amp_sv[0];
     jamp_sv[102] += amp_sv[0];
     jamp_sv[104] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[5], w_fp[113], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[8], w_fp[5], w_fp[113], COUPs[0], 1.0, &_fp[0] );
     jamp_sv[10] -= amp_sv[0];
     jamp_sv[25] += amp_sv[0];
     jamp_sv[31] -= amp_sv[0];
@@ -9007,7 +9007,7 @@ namespace mg5amcCpu
     jamp_sv[94] += amp_sv[0];
     jamp_sv[98] += amp_sv[0];
     jamp_sv[108] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[5], w_fp[114], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[8], w_fp[5], w_fp[114], COUPs[0], 1.0, &_fp[0] );
     jamp_sv[7] -= amp_sv[0];
     jamp_sv[25] += amp_sv[0];
     jamp_sv[31] -= amp_sv[0];
@@ -9031,7 +9031,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 445
-    VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &_fp[0] );
     jamp_sv[9] -= amp_sv[0];
     jamp_sv[11] += amp_sv[0];
     jamp_sv[24] += amp_sv[0];
@@ -9048,7 +9048,7 @@ namespace mg5amcCpu
     jamp_sv[94] -= amp_sv[0];
     jamp_sv[115] -= amp_sv[0];
     jamp_sv[118] += amp_sv[0];
-    VVV1_0( w_fp[1], w_fp[8], w_fp[90], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[1], w_fp[8], w_fp[90], COUPs[0], 1.0, &_fp[0] );
     jamp_sv[11] += amp_sv[0];
     jamp_sv[25] -= amp_sv[0];
     jamp_sv[31] += amp_sv[0];
@@ -9065,7 +9065,7 @@ namespace mg5amcCpu
     jamp_sv[94] -= amp_sv[0];
     jamp_sv[99] -= amp_sv[0];
     jamp_sv[109] += amp_sv[0];
-    VVV1_0( w_fp[1], w_fp[8], w_fp[96], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[1], w_fp[8], w_fp[96], COUPs[0], 1.0, &_fp[0] );
     jamp_sv[9] += amp_sv[0];
     jamp_sv[24] -= amp_sv[0];
     jamp_sv[30] += amp_sv[0];
@@ -9089,7 +9089,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 446
-    VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], &_fp[0] );
+    VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] );
     jamp_sv[6] += amp_sv[0];
     jamp_sv[7] -= amp_sv[0];
     jamp_sv[24] -= amp_sv[0];
@@ -9106,7 +9106,7 @@ namespace mg5amcCpu
     jamp_sv[116] -= amp_sv[0];
     jamp_sv[117] += amp_sv[0];
     jamp_sv[118] -= amp_sv[0];
-    VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], &_fp[0] );
+    VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] );
     jamp_sv[6] += amp_sv[0];
     jamp_sv[7] -= amp_sv[0];
     jamp_sv[9] -= amp_sv[0];
@@ -9123,7 +9123,7 @@ namespace mg5amcCpu
     jamp_sv[93] -= amp_sv[0];
     jamp_sv[116] -= amp_sv[0];
     jamp_sv[117] += amp_sv[0];
-    VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], &_fp[0] );
+    VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] );
     jamp_sv[9] -= amp_sv[0];
     jamp_sv[11] += amp_sv[0];
     jamp_sv[24] += amp_sv[0];
@@ -9147,7 +9147,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 447
-    VVV1_0( w_fp[8], w_fp[29], w_fp[104], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[8], w_fp[29], w_fp[104], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 447 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9175,7 +9175,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 448
-    VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 448 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9203,7 +9203,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 449
-    VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 449 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9231,7 +9231,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 450
-    VVV1_0( w_fp[104], w_fp[45], w_fp[6], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[104], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 450 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9251,7 +9251,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 451
-    FFV1_0( w_fp[3], w_fp[44], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[44], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 451 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9267,7 +9267,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 452
-    FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 452 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9281,7 +9281,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 453
-    FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 453 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9295,7 +9295,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 454
-    FFV1_0( w_fp[3], w_fp[89], w_fp[86], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[89], w_fp[86], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 454 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9311,7 +9311,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 455
-    VVV1_0( w_fp[86], w_fp[1], w_fp[45], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[86], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 455 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9331,7 +9331,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 456
-    FFV1_0( w_fp[3], w_fp[39], w_fp[112], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[39], w_fp[112], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
@@ -9340,7 +9340,7 @@ namespace mg5amcCpu
     jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[113], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[39], w_fp[113], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
@@ -9349,7 +9349,7 @@ namespace mg5amcCpu
     jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[114], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[39], w_fp[114], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
@@ -9365,7 +9365,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 457
-    FFV1_0( w_fp[41], w_fp[39], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[41], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 457 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9381,7 +9381,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 458
-    FFV1_0( w_fp[41], w_fp[105], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[41], w_fp[105], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 458 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9395,7 +9395,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 459
-    FFV1_0( w_fp[101], w_fp[39], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[101], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 459 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9409,7 +9409,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 460
-    VVV1_0( w_fp[104], w_fp[51], w_fp[5], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[104], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 460 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9429,7 +9429,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 461
-    FFV1_0( w_fp[3], w_fp[50], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[50], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 461 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9445,7 +9445,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 462
-    FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 462 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9459,7 +9459,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 463
-    FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 463 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9473,7 +9473,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 464
-    FFV1_0( w_fp[3], w_fp[91], w_fp[62], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[91], w_fp[62], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 464 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9489,7 +9489,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 465
-    VVV1_0( w_fp[62], w_fp[1], w_fp[51], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[62], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 465 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9509,7 +9509,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 466
-    FFV1_0( w_fp[3], w_fp[47], w_fp[109], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[47], w_fp[109], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
@@ -9518,7 +9518,7 @@ namespace mg5amcCpu
     jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[110], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[47], w_fp[110], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
@@ -9527,7 +9527,7 @@ namespace mg5amcCpu
     jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[111], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[47], w_fp[111], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
@@ -9543,7 +9543,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 467
-    FFV1_0( w_fp[38], w_fp[47], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[38], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 467 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9559,7 +9559,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 468
-    FFV1_0( w_fp[38], w_fp[106], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[38], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 468 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9573,7 +9573,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 469
-    FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 469 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9587,7 +9587,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 470
-    VVV1_0( w_fp[104], w_fp[23], w_fp[6], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[104], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 470 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9607,7 +9607,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 471
-    FFV1_0( w_fp[48], w_fp[2], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[48], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 471 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9623,7 +9623,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 472
-    FFV1_0( w_fp[58], w_fp[102], w_fp[6], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[58], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 472 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9637,7 +9637,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 473
-    FFV1_0( w_fp[48], w_fp[102], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[48], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 473 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9651,7 +9651,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 474
-    FFV1_0( w_fp[58], w_fp[2], w_fp[86], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[58], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 474 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9667,7 +9667,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 475
-    VVV1_0( w_fp[86], w_fp[1], w_fp[23], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[86], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 475 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9687,7 +9687,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 476
-    FFV1_0( w_fp[38], w_fp[2], w_fp[112], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[38], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
@@ -9696,7 +9696,7 @@ namespace mg5amcCpu
     jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[113], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[38], w_fp[2], w_fp[113], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
@@ -9705,7 +9705,7 @@ namespace mg5amcCpu
     jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[114], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[38], w_fp[2], w_fp[114], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
@@ -9721,7 +9721,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 477
-    VVV1_0( w_fp[104], w_fp[20], w_fp[5], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[104], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 477 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9741,7 +9741,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 478
-    FFV1_0( w_fp[40], w_fp[2], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[40], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 478 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9757,7 +9757,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 479
-    FFV1_0( w_fp[60], w_fp[102], w_fp[5], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[60], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 479 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9771,7 +9771,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 480
-    FFV1_0( w_fp[40], w_fp[102], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[40], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 480 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9785,7 +9785,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 481
-    FFV1_0( w_fp[60], w_fp[2], w_fp[62], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[60], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 481 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9801,7 +9801,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 482
-    VVV1_0( w_fp[62], w_fp[1], w_fp[20], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[62], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 482 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9821,7 +9821,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 483
-    FFV1_0( w_fp[41], w_fp[2], w_fp[109], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[41], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
@@ -9830,7 +9830,7 @@ namespace mg5amcCpu
     jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[2], w_fp[110], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[41], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
@@ -9839,7 +9839,7 @@ namespace mg5amcCpu
     jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[2], w_fp[111], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[41], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
@@ -9855,7 +9855,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 484
-    FFV1_0( w_fp[3], w_fp[18], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[18], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 484 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9875,7 +9875,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 485
-    FFV1_0( w_fp[12], w_fp[2], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[12], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 485 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9895,7 +9895,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 486
-    FFV1_0( w_fp[3], w_fp[102], w_fp[67], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[102], w_fp[67], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 486 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9915,7 +9915,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 487
-    FFV1_0( w_fp[12], w_fp[102], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[12], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 487 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9931,7 +9931,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 488
-    FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 488 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9951,7 +9951,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 489
-    FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 489 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -9967,7 +9967,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 490
-    FFV1_0( w_fp[3], w_fp[102], w_fp[55], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[102], w_fp[55], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
@@ -9976,7 +9976,7 @@ namespace mg5amcCpu
     jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[102], w_fp[83], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[102], w_fp[83], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
@@ -9985,7 +9985,7 @@ namespace mg5amcCpu
     jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[102], w_fp[84], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[102], w_fp[84], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -10001,7 +10001,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 491
-    FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
@@ -10010,7 +10010,7 @@ namespace mg5amcCpu
     jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -10019,7 +10019,7 @@ namespace mg5amcCpu
     jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &_fp[0] );
     jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -10035,7 +10035,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 492
-    VVV1_0( w_fp[92], w_fp[55], w_fp[8], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[92], w_fp[55], w_fp[8], COUPs[0], 1.0, &_fp[0] );
     jamp_sv[6] -= amp_sv[0];
     jamp_sv[7] += amp_sv[0];
     jamp_sv[9] += amp_sv[0];
@@ -10052,7 +10052,7 @@ namespace mg5amcCpu
     jamp_sv[93] += amp_sv[0];
     jamp_sv[116] += amp_sv[0];
     jamp_sv[117] -= amp_sv[0];
-    VVV1_0( w_fp[92], w_fp[83], w_fp[8], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[92], w_fp[83], w_fp[8], COUPs[0], 1.0, &_fp[0] );
     jamp_sv[7] += amp_sv[0];
     jamp_sv[8] -= amp_sv[0];
     jamp_sv[9] += amp_sv[0];
@@ -10069,7 +10069,7 @@ namespace mg5amcCpu
     jamp_sv[93] += amp_sv[0];
     jamp_sv[106] += amp_sv[0];
     jamp_sv[107] -= amp_sv[0];
-    VVV1_0( w_fp[92], w_fp[84], w_fp[8], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[92], w_fp[84], w_fp[8], COUPs[0], 1.0, &_fp[0] );
     jamp_sv[6] += amp_sv[0];
     jamp_sv[8] -= amp_sv[0];
     jamp_sv[10] -= amp_sv[0];
@@ -10090,11 +10090,11 @@ namespace mg5amcCpu
     // *** DIAGRAM 493 OF 1240 ***

     // Wavefunction(s) for diagram number 493
-    VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 0., 0., w_fp[92] );
-    FFV1_2( w_fp[3], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[99] );
+    VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[92] );
+    FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );

     // Amplitude(s) for diagram number 493
-    FFV1_0( w_fp[99], w_fp[87], w_fp[6], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 493 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -10108,7 +10108,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 494
diagram number 494 - FFV1_0( w_fp[99], w_fp[85], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 494 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10119,10 +10119,10 @@ namespace mg5amcCpu // *** DIAGRAM 495 OF 1240 *** // Wavefunction(s) for diagram number 495 - VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 0., 0., w_fp[102] ); + VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[102] ); // Amplitude(s) for diagram number 495 - VVV1_0( w_fp[102], w_fp[34], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[102], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 495 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10142,7 +10142,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 496 - FFV1_0( w_fp[3], w_fp[85], w_fp[102], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[85], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 496 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10155,10 +10155,10 @@ namespace mg5amcCpu // *** DIAGRAM 497 OF 1240 *** // Wavefunction(s) for diagram number 497 - VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 0., 0., w_fp[104] ); + VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[104] ); // Amplitude(s) for diagram number 497 - VVV1_0( w_fp[104], w_fp[34], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[104], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 497 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10178,7 +10178,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 498 - FFV1_0( w_fp[3], w_fp[87], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[87], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 498 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10191,12 +10191,12 @@ namespace mg5amcCpu // *** DIAGRAM 499 OF 1240 *** // Wavefunction(s) for diagram number 499 - VVVV1P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[111] ); - VVVV3P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[110] ); - VVVV4P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[109] ); + VVVV1P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] ); + VVVV3P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[110] ); + VVVV4P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[109] ); // Amplitude(s) for diagram number 499 - FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &_fp[0] ); jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; @@ -10205,7 +10205,7 @@ namespace mg5amcCpu jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], 1.0, &_fp[0] ); jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; @@ -10214,7 +10214,7 @@ namespace mg5amcCpu jamp_sv[37] 
-= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], 1.0, &_fp[0] ); jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; @@ -10227,10 +10227,10 @@ namespace mg5amcCpu // *** DIAGRAM 500 OF 1240 *** // Wavefunction(s) for diagram number 500 - FFV1_1( w_fp[77], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[62] ); + FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] ); // Amplitude(s) for diagram number 500 - FFV1_0( w_fp[46], w_fp[62], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[62], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 500 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10241,10 +10241,10 @@ namespace mg5amcCpu // *** DIAGRAM 501 OF 1240 *** // Wavefunction(s) for diagram number 501 - FFV1_2( w_fp[46], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[114] ); + FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] ); // Amplitude(s) for diagram number 501 - FFV1_0( w_fp[114], w_fp[77], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[114], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 501 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10258,7 +10258,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 502 - FFV1_0( w_fp[46], w_fp[77], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[77], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 502 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10274,7 +10274,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 503 - FFV1_0( w_fp[41], w_fp[62], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[62], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 503 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10285,10 +10285,10 @@ namespace mg5amcCpu // *** DIAGRAM 504 OF 1240 *** // Wavefunction(s) for diagram number 504 - FFV1_2( w_fp[41], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[113] ); + FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] ); // Amplitude(s) for diagram number 504 - FFV1_0( w_fp[113], w_fp[77], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[113], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 504 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10302,7 +10302,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 505 - FFV1_0( w_fp[41], w_fp[77], w_fp[102], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[77], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 505 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10318,7 +10318,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 506 - FFV1_0( w_fp[3], w_fp[62], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[62], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 
506 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10334,7 +10334,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 507 - FFV1_0( w_fp[99], w_fp[77], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 507 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10347,10 +10347,10 @@ namespace mg5amcCpu // *** DIAGRAM 508 OF 1240 *** // Wavefunction(s) for diagram number 508 - VVV1P0_1( w_fp[92], w_fp[27], COUPs[0], 0., 0., w_fp[62] ); + VVV1P0_1( w_fp[92], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[62] ); // Amplitude(s) for diagram number 508 - FFV1_0( w_fp[3], w_fp[77], w_fp[62], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 508 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10367,10 +10367,10 @@ namespace mg5amcCpu // *** DIAGRAM 509 OF 1240 *** // Wavefunction(s) for diagram number 509 - FFV1_1( w_fp[2], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[112] ); + FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[112] ); // Amplitude(s) for diagram number 509 - FFV1_0( w_fp[56], w_fp[112], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[56], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 509 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10384,7 +10384,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 510 - FFV1_0( w_fp[21], w_fp[112], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[112], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 510 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10398,7 +10398,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 511 - VVV1_0( w_fp[102], w_fp[103], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[102], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 511 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10418,7 +10418,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 512 - FFV1_0( w_fp[21], w_fp[2], w_fp[102], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 512 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10434,7 +10434,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 513 - VVV1_0( w_fp[104], w_fp[103], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[104], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 513 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10454,7 +10454,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 514 - FFV1_0( w_fp[56], w_fp[2], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[56], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 514 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10470,7 
+10470,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 515 - FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; @@ -10479,7 +10479,7 @@ namespace mg5amcCpu jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; @@ -10488,7 +10488,7 @@ namespace mg5amcCpu jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; @@ -10501,10 +10501,10 @@ namespace mg5amcCpu // *** DIAGRAM 516 OF 1240 *** // Wavefunction(s) for diagram number 516 - FFV1_2( w_fp[52], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[86] ); + FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] ); // Amplitude(s) for diagram number 516 - FFV1_0( w_fp[86], w_fp[33], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 516 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10515,10 +10515,10 @@ namespace mg5amcCpu // *** DIAGRAM 517 OF 1240 *** // Wavefunction(s) for diagram number 517 - FFV1_1( w_fp[33], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[98] ); + FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); // Amplitude(s) for diagram number 517 - FFV1_0( w_fp[52], w_fp[98], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[98], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 517 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10532,7 +10532,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 518 - FFV1_0( w_fp[52], w_fp[33], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 518 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10548,7 +10548,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 519 - FFV1_0( w_fp[86], w_fp[47], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 519 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10559,10 +10559,10 @@ namespace mg5amcCpu // *** DIAGRAM 520 OF 1240 *** // Wavefunction(s) for diagram number 520 - FFV1_1( w_fp[47], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[106] ); + FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] ); // Amplitude(s) for diagram number 520 - FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, 
&_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 520 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10576,7 +10576,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 521 - FFV1_0( w_fp[52], w_fp[47], w_fp[102], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[47], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 521 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10592,7 +10592,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 522 - FFV1_0( w_fp[86], w_fp[2], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 522 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10608,7 +10608,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 523 - FFV1_0( w_fp[52], w_fp[112], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[112], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 523 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10624,7 +10624,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 524 - FFV1_0( w_fp[52], w_fp[2], w_fp[62], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 524 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10644,7 +10644,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 525 - FFV1_0( w_fp[65], w_fp[112], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 525 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10660,7 +10660,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 526 - FFV1_0( w_fp[3], w_fp[112], w_fp[64], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[112], w_fp[64], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 526 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10680,7 +10680,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 527 - FFV1_0( w_fp[99], w_fp[93], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 527 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10696,7 +10696,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 528 - FFV1_0( w_fp[99], w_fp[2], w_fp[64], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 528 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10716,7 +10716,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 529 - FFV1_0( w_fp[3], w_fp[93], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[93], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 529 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv 
+= cxabs2( amp_sv[0] ); @@ -10736,7 +10736,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 530 - FFV1_0( w_fp[65], w_fp[2], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 530 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10756,7 +10756,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 531 - VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[12] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; jamp_sv[30] -= amp_sv[0]; @@ -10773,7 +10773,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[114] += amp_sv[0]; jamp_sv[115] -= amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[12] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; jamp_sv[16] -= amp_sv[0]; @@ -10790,7 +10790,7 @@ namespace mg5amcCpu jamp_sv[105] -= amp_sv[0]; jamp_sv[110] -= amp_sv[0]; jamp_sv[111] += amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[16] -= amp_sv[0]; jamp_sv[17] += amp_sv[0]; jamp_sv[30] += amp_sv[0]; @@ -10811,10 +10811,10 @@ namespace mg5amcCpu // *** DIAGRAM 532 OF 1240 *** // Wavefunction(s) for diagram number 532 - VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 0., 0., w_fp[86] ); + VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[86] ); // Amplitude(s) for diagram number 532 - VVV1_0( w_fp[8], w_fp[6], w_fp[86], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[86], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 532 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10839,10 +10839,10 @@ namespace mg5amcCpu // *** DIAGRAM 533 OF 1240 *** // Wavefunction(s) for diagram number 533 - VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 0., 0., w_fp[101] ); + VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[101] ); // Amplitude(s) for diagram number 533 - VVV1_0( w_fp[61], w_fp[6], w_fp[101], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[6], w_fp[101], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 533 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10870,7 +10870,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 534 - VVV1_0( w_fp[61], w_fp[8], w_fp[104], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[8], w_fp[104], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 534 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10898,7 +10898,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 535 - FFV1_0( w_fp[3], w_fp[47], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 535 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10918,7 +10918,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 536 - FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 536 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10934,7 +10934,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 537 - FFV1_0( w_fp[99], w_fp[47], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 537 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10950,7 +10950,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 538 - FFV1_0( w_fp[41], w_fp[2], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 538 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10970,7 +10970,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 539 - FFV1_0( w_fp[41], w_fp[112], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[112], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 539 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -10986,7 +10986,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 540 - FFV1_0( w_fp[113], w_fp[2], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[113], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 540 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11002,7 +11002,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 541 - FFV1_0( w_fp[76], w_fp[112], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[76], w_fp[112], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 541 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11018,7 +11018,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 542 - FFV1_0( w_fp[3], w_fp[112], w_fp[74], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[112], w_fp[74], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 542 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11038,7 +11038,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 543 - FFV1_0( w_fp[99], w_fp[97], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 543 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11054,7 +11054,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 544 - FFV1_0( w_fp[99], w_fp[2], w_fp[74], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 544 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11074,7 +11074,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 545 - FFV1_0( w_fp[3], w_fp[97], w_fp[102], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[97], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 545 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( 
amp_sv[0] ); @@ -11094,7 +11094,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 546 - FFV1_0( w_fp[76], w_fp[2], w_fp[102], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[76], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 546 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11114,7 +11114,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 547 - VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[13] += amp_sv[0]; jamp_sv[16] -= amp_sv[0]; jamp_sv[43] -= amp_sv[0]; @@ -11131,7 +11131,7 @@ namespace mg5amcCpu jamp_sv[76] += amp_sv[0]; jamp_sv[103] += amp_sv[0]; jamp_sv[106] -= amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[13] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; jamp_sv[15] += amp_sv[0]; @@ -11148,7 +11148,7 @@ namespace mg5amcCpu jamp_sv[76] += amp_sv[0]; jamp_sv[104] += amp_sv[0]; jamp_sv[105] -= amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[14] -= amp_sv[0]; jamp_sv[15] += amp_sv[0]; jamp_sv[43] += amp_sv[0]; @@ -11169,10 +11169,10 @@ namespace mg5amcCpu // *** DIAGRAM 548 OF 1240 *** // Wavefunction(s) for diagram number 548 - VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 0., 0., w_fp[86] ); + VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[86] ); // Amplitude(s) for diagram number 548 - VVV1_0( w_fp[8], w_fp[4], w_fp[86], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[86], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 548 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11200,7 +11200,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 549 - VVV1_0( w_fp[72], w_fp[4], w_fp[101], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[72], w_fp[4], w_fp[101], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 549 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11228,7 +11228,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 550 - VVV1_0( w_fp[72], w_fp[8], w_fp[102], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[72], w_fp[8], w_fp[102], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 550 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11256,7 +11256,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 551 - FFV1_0( w_fp[3], w_fp[33], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 551 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11276,7 +11276,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 552 - FFV1_0( w_fp[3], w_fp[98], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[98], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 552 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11292,7 +11292,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) 
for diagram number 553 - FFV1_0( w_fp[99], w_fp[33], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 553 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11308,7 +11308,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 554 - FFV1_0( w_fp[46], w_fp[2], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 554 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11328,7 +11328,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 555 - FFV1_0( w_fp[46], w_fp[112], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[112], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 555 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11344,7 +11344,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 556 - FFV1_0( w_fp[114], w_fp[2], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[114], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 556 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11357,10 +11357,10 @@ namespace mg5amcCpu // *** DIAGRAM 557 OF 1240 *** // Wavefunction(s) for diagram number 557 - VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 0., 0., w_fp[86] ); + VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[86] ); // Amplitude(s) for diagram number 557 - VVV1_0( w_fp[86], w_fp[13], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[86], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 557 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11388,7 +11388,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 558 - VVV1_0( w_fp[86], w_fp[11], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[86], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 558 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11416,7 +11416,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 559 - VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &_fp[0] ); jamp_sv[13] -= amp_sv[0]; jamp_sv[27] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -11433,7 +11433,7 @@ namespace mg5amcCpu jamp_sv[103] -= amp_sv[0]; jamp_sv[106] += amp_sv[0]; jamp_sv[114] -= amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &_fp[0] ); jamp_sv[12] -= amp_sv[0]; jamp_sv[26] += amp_sv[0]; jamp_sv[36] -= amp_sv[0]; @@ -11450,7 +11450,7 @@ namespace mg5amcCpu jamp_sv[111] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[114] -= amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &_fp[0] ); jamp_sv[12] -= amp_sv[0]; jamp_sv[13] += amp_sv[0]; jamp_sv[26] += amp_sv[0]; @@ -11474,7 +11474,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 560 - VVV1_0( w_fp[102], w_fp[108], w_fp[6], COUPs[0], &_fp[0] ); + 
VVV1_0( w_fp[102], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 560 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11502,7 +11502,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 561 - VVV1_0( w_fp[102], w_fp[1], w_fp[11], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[102], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 561 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11530,7 +11530,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 562 - VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &_fp[0] ); jamp_sv[15] += amp_sv[0]; jamp_sv[26] -= amp_sv[0]; jamp_sv[30] += amp_sv[0]; @@ -11547,7 +11547,7 @@ namespace mg5amcCpu jamp_sv[109] += amp_sv[0]; jamp_sv[112] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &_fp[0] ); jamp_sv[14] -= amp_sv[0]; jamp_sv[15] += amp_sv[0]; jamp_sv[43] += amp_sv[0]; @@ -11564,7 +11564,7 @@ namespace mg5amcCpu jamp_sv[104] += amp_sv[0]; jamp_sv[105] -= amp_sv[0]; jamp_sv[106] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &_fp[0] ); jamp_sv[14] -= amp_sv[0]; jamp_sv[26] += amp_sv[0]; jamp_sv[30] -= amp_sv[0]; @@ -11588,7 +11588,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 563 - VVV1_0( w_fp[104], w_fp[108], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[104], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 563 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11616,7 +11616,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 564 - VVV1_0( w_fp[104], w_fp[1], w_fp[13], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[104], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 564 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11644,7 +11644,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 565 - VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &_fp[0] ); jamp_sv[17] += amp_sv[0]; jamp_sv[27] -= amp_sv[0]; jamp_sv[30] += amp_sv[0]; @@ -11661,7 +11661,7 @@ namespace mg5amcCpu jamp_sv[77] -= amp_sv[0]; jamp_sv[101] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &_fp[0] ); jamp_sv[16] -= amp_sv[0]; jamp_sv[17] += amp_sv[0]; jamp_sv[30] += amp_sv[0]; @@ -11678,7 +11678,7 @@ namespace mg5amcCpu jamp_sv[101] -= amp_sv[0]; jamp_sv[114] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &_fp[0] ); jamp_sv[16] -= amp_sv[0]; jamp_sv[27] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -11699,12 +11699,12 @@ namespace mg5amcCpu // *** DIAGRAM 566 OF 1240 *** // Wavefunction(s) for diagram number 566 - VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], 
COUPs[2], 0., 0., w_fp[105] ); - VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[95] ); - VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[107] ); + VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] ); + VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] ); + VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] ); // Amplitude(s) for diagram number 566 - VVV1_0( w_fp[8], w_fp[6], w_fp[105], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[105], COUPs[0], 1.0, &_fp[0] ); jamp_sv[12] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; jamp_sv[30] -= amp_sv[0]; @@ -11721,7 +11721,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[114] += amp_sv[0]; jamp_sv[115] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[95], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[95], COUPs[0], 1.0, &_fp[0] ); jamp_sv[14] -= amp_sv[0]; jamp_sv[26] += amp_sv[0]; jamp_sv[30] -= amp_sv[0]; @@ -11738,7 +11738,7 @@ namespace mg5amcCpu jamp_sv[109] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[115] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[107], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[107], COUPs[0], 1.0, &_fp[0] ); jamp_sv[12] -= amp_sv[0]; jamp_sv[26] += amp_sv[0]; jamp_sv[36] -= amp_sv[0]; @@ -11759,12 +11759,12 @@ namespace mg5amcCpu // *** DIAGRAM 567 OF 1240 *** // Wavefunction(s) for diagram number 567 - VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 0., 0., w_fp[96] ); - VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 0., 0., w_fp[90] ); - VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 0., 0., w_fp[88] ); + VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] ); + VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] ); + VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] ); // Amplitude(s) for diagram number 567 - VVV1_0( w_fp[8], w_fp[4], w_fp[96], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[96], COUPs[0], 1.0, &_fp[0] ); jamp_sv[13] += amp_sv[0]; jamp_sv[16] -= amp_sv[0]; jamp_sv[43] -= amp_sv[0]; @@ -11781,7 +11781,7 @@ namespace mg5amcCpu jamp_sv[76] += amp_sv[0]; jamp_sv[103] += amp_sv[0]; jamp_sv[106] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &_fp[0] ); jamp_sv[16] -= amp_sv[0]; jamp_sv[27] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -11798,7 +11798,7 @@ namespace mg5amcCpu jamp_sv[76] += amp_sv[0]; jamp_sv[100] += amp_sv[0]; jamp_sv[114] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[88], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[88], COUPs[0], 1.0, &_fp[0] ); jamp_sv[13] -= amp_sv[0]; jamp_sv[27] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -11822,7 +11822,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 568 - VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &_fp[0] ); jamp_sv[15] -= amp_sv[0]; jamp_sv[17] += amp_sv[0]; jamp_sv[26] += amp_sv[0]; @@ -11839,7 +11839,7 @@ namespace mg5amcCpu jamp_sv[77] -= amp_sv[0]; jamp_sv[109] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &_fp[0] ); jamp_sv[17] += amp_sv[0]; jamp_sv[27] -= amp_sv[0]; jamp_sv[30] += amp_sv[0]; @@ -11856,7 +11856,7 @@ namespace mg5amcCpu jamp_sv[77] -= amp_sv[0]; jamp_sv[101] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], &_fp[0] 
); + VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &_fp[0] ); jamp_sv[15] += amp_sv[0]; jamp_sv[26] -= amp_sv[0]; jamp_sv[30] += amp_sv[0]; @@ -11880,7 +11880,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 569 - VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); jamp_sv[12] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; jamp_sv[26] -= amp_sv[0]; @@ -11897,7 +11897,7 @@ namespace mg5amcCpu jamp_sv[110] -= amp_sv[0]; jamp_sv[111] += amp_sv[0]; jamp_sv[112] -= amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); jamp_sv[12] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; jamp_sv[15] -= amp_sv[0]; @@ -11914,7 +11914,7 @@ namespace mg5amcCpu jamp_sv[77] -= amp_sv[0]; jamp_sv[110] -= amp_sv[0]; jamp_sv[111] += amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); jamp_sv[15] -= amp_sv[0]; jamp_sv[17] += amp_sv[0]; jamp_sv[26] += amp_sv[0]; @@ -11938,7 +11938,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 570 - VVV1_0( w_fp[8], w_fp[27], w_fp[86], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[27], w_fp[86], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 570 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11966,7 +11966,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 571 - VVV1_0( w_fp[1], w_fp[27], w_fp[101], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[27], w_fp[101], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 571 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -11994,7 +11994,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 572 - VVV1_0( w_fp[1], w_fp[8], w_fp[62], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[62], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 572 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12022,7 +12022,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 573 - VVV1_0( w_fp[86], w_fp[37], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[86], w_fp[37], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 573 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12042,7 +12042,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 574 - FFV1_0( w_fp[3], w_fp[36], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[36], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 574 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12058,7 +12058,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 575 - FFV1_0( w_fp[99], w_fp[100], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 575 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12072,7 +12072,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 576 - FFV1_0( w_fp[99], w_fp[36], w_fp[1], 
COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 576 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12086,7 +12086,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 577 - FFV1_0( w_fp[3], w_fp[100], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[100], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 577 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12102,7 +12102,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 578 - VVV1_0( w_fp[104], w_fp[1], w_fp[37], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[104], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 578 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12122,7 +12122,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 579 - FFV1_0( w_fp[3], w_fp[33], w_fp[96], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[96], COUPs[1], 1.0, &_fp[0] ); jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12131,7 +12131,7 @@ namespace mg5amcCpu jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &_fp[0] ); jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12140,7 +12140,7 @@ namespace mg5amcCpu jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[88], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[88], COUPs[1], 1.0, &_fp[0] ); jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12156,7 +12156,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 580 - FFV1_0( w_fp[41], w_fp[33], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[33], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 580 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12172,7 +12172,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 581 - FFV1_0( w_fp[41], w_fp[98], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[98], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 581 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12186,7 +12186,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 582 - FFV1_0( w_fp[113], w_fp[33], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[113], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 582 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12200,7 +12200,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 583 - VVV1_0( w_fp[86], w_fp[51], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[86], w_fp[51], w_fp[4], COUPs[0], 1.0, 
&_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 583 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12220,7 +12220,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 584 - FFV1_0( w_fp[3], w_fp[49], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[49], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 584 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12236,7 +12236,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 585 - FFV1_0( w_fp[99], w_fp[91], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[91], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 585 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12250,7 +12250,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 586 - FFV1_0( w_fp[99], w_fp[49], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[49], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 586 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12264,7 +12264,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 587 - FFV1_0( w_fp[3], w_fp[91], w_fp[102], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[91], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 587 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12280,7 +12280,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 588 - VVV1_0( w_fp[102], w_fp[1], w_fp[51], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[102], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 588 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12300,7 +12300,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 589 - FFV1_0( w_fp[3], w_fp[47], w_fp[105], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[105], COUPs[1], 1.0, &_fp[0] ); jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12309,7 +12309,7 @@ namespace mg5amcCpu jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[95], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[95], COUPs[1], 1.0, &_fp[0] ); jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12318,7 +12318,7 @@ namespace mg5amcCpu jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[107], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[107], COUPs[1], 1.0, &_fp[0] ); jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12334,7 +12334,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 590 - FFV1_0( w_fp[46], w_fp[47], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[47], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 
590 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12350,7 +12350,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 591 - FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 591 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12364,7 +12364,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 592 - FFV1_0( w_fp[114], w_fp[47], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[114], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 592 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12378,7 +12378,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 593 - VVV1_0( w_fp[86], w_fp[54], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[86], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 593 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12398,7 +12398,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 594 - FFV1_0( w_fp[53], w_fp[2], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[53], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 594 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12414,7 +12414,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 595 - FFV1_0( w_fp[78], w_fp[112], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[78], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 595 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12428,7 +12428,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 596 - FFV1_0( w_fp[53], w_fp[112], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[53], w_fp[112], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 596 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12442,7 +12442,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 597 - FFV1_0( w_fp[78], w_fp[2], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[78], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 597 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12458,7 +12458,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 598 - VVV1_0( w_fp[104], w_fp[1], w_fp[54], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[104], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 598 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12478,7 +12478,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 599 - FFV1_0( w_fp[46], w_fp[2], w_fp[96], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12487,7 +12487,7 @@ namespace mg5amcCpu jamp_sv[76] -= cxtype( 0, 1 ) 
* amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12496,7 +12496,7 @@ namespace mg5amcCpu jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[88], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[88], COUPs[1], 1.0, &_fp[0] ); jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12512,7 +12512,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 600 - VVV1_0( w_fp[86], w_fp[20], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[86], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 600 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12532,7 +12532,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 601 - FFV1_0( w_fp[28], w_fp[2], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[28], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 601 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12548,7 +12548,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 602 - FFV1_0( w_fp[60], w_fp[112], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[112], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 602 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12562,7 +12562,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 603 - FFV1_0( w_fp[28], w_fp[112], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[28], w_fp[112], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 603 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12576,7 +12576,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 604 - FFV1_0( w_fp[60], w_fp[2], w_fp[102], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 604 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12592,7 +12592,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 605 - VVV1_0( w_fp[102], w_fp[1], w_fp[20], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[102], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 605 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12612,7 +12612,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 606 - FFV1_0( w_fp[41], w_fp[2], w_fp[105], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[105], COUPs[1], 1.0, &_fp[0] ); jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12621,7 +12621,7 @@ namespace mg5amcCpu jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] -= 
cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[95], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12630,7 +12630,7 @@ namespace mg5amcCpu jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[107], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[107], COUPs[1], 1.0, &_fp[0] ); jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12646,7 +12646,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 607 - FFV1_0( w_fp[3], w_fp[15], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[15], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 607 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12666,7 +12666,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 608 - FFV1_0( w_fp[14], w_fp[2], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[14], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 608 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12686,7 +12686,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 609 - FFV1_0( w_fp[3], w_fp[112], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[112], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 609 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12706,7 +12706,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 610 - FFV1_0( w_fp[14], w_fp[112], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[14], w_fp[112], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 610 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12722,7 +12722,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 611 - FFV1_0( w_fp[99], w_fp[2], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 611 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12742,7 +12742,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 612 - FFV1_0( w_fp[99], w_fp[15], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 612 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12758,7 +12758,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 613 - FFV1_0( w_fp[3], w_fp[112], w_fp[57], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[112], w_fp[57], COUPs[1], 1.0, &_fp[0] ); jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12767,7 +12767,7 @@ namespace mg5amcCpu jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[112], w_fp[81], COUPs[1], 
&_fp[0] ); + FFV1_0( w_fp[3], w_fp[112], w_fp[81], COUPs[1], 1.0, &_fp[0] ); jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12776,7 +12776,7 @@ namespace mg5amcCpu jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[112], w_fp[82], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[112], w_fp[82], COUPs[1], 1.0, &_fp[0] ); jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -12792,7 +12792,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 614 - FFV1_0( w_fp[99], w_fp[2], w_fp[57], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[57], COUPs[1], 1.0, &_fp[0] ); jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12801,7 +12801,7 @@ namespace mg5amcCpu jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[81], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[81], COUPs[1], 1.0, &_fp[0] ); jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -12810,7 +12810,7 @@ namespace mg5amcCpu jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[82], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[82], COUPs[1], 1.0, &_fp[0] ); jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -12826,7 +12826,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 615 - VVV1_0( w_fp[92], w_fp[57], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[92], w_fp[57], w_fp[8], COUPs[0], 1.0, &_fp[0] ); jamp_sv[12] -= amp_sv[0]; jamp_sv[13] += amp_sv[0]; jamp_sv[15] += amp_sv[0]; @@ -12843,7 +12843,7 @@ namespace mg5amcCpu jamp_sv[77] += amp_sv[0]; jamp_sv[110] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; - VVV1_0( w_fp[92], w_fp[81], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[92], w_fp[81], w_fp[8], COUPs[0], 1.0, &_fp[0] ); jamp_sv[13] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; jamp_sv[15] += amp_sv[0]; @@ -12860,7 +12860,7 @@ namespace mg5amcCpu jamp_sv[76] += amp_sv[0]; jamp_sv[104] += amp_sv[0]; jamp_sv[105] -= amp_sv[0]; - VVV1_0( w_fp[92], w_fp[82], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[92], w_fp[82], w_fp[8], COUPs[0], 1.0, &_fp[0] ); jamp_sv[12] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; jamp_sv[16] -= amp_sv[0]; @@ -12881,11 +12881,11 @@ namespace mg5amcCpu // *** DIAGRAM 616 OF 1240 *** // Wavefunction(s) for diagram number 616 - VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 0., 0., w_fp[92] ); - FFV1_2( w_fp[3], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[99] ); + VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[92] ); + FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); // Amplitude(s) for diagram number 616 - FFV1_0( w_fp[99], w_fp[87], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 616 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12899,7 +12899,7 @@ namespace mg5amcCpu // (none) 
// Amplitude(s) for diagram number 617 - FFV1_0( w_fp[99], w_fp[9], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 617 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12910,10 +12910,10 @@ namespace mg5amcCpu // *** DIAGRAM 618 OF 1240 *** // Wavefunction(s) for diagram number 618 - VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 0., 0., w_fp[112] ); + VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[112] ); // Amplitude(s) for diagram number 618 - VVV1_0( w_fp[112], w_fp[34], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[112], w_fp[34], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 618 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12933,7 +12933,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 619 - FFV1_0( w_fp[3], w_fp[9], w_fp[112], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[9], w_fp[112], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 619 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12946,10 +12946,10 @@ namespace mg5amcCpu // *** DIAGRAM 620 OF 1240 *** // Wavefunction(s) for diagram number 620 - VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 0., 0., w_fp[86] ); + VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[86] ); // Amplitude(s) for diagram number 620 - VVV1_0( w_fp[86], w_fp[34], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[86], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 620 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12969,7 +12969,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 621 - FFV1_0( w_fp[3], w_fp[87], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[87], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 621 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -12982,12 +12982,12 @@ namespace mg5amcCpu // *** DIAGRAM 622 OF 1240 *** // Wavefunction(s) for diagram number 622 - VVVV1P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[107] ); - VVVV3P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[95] ); - VVVV4P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[105] ); + VVVV1P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[107] ); + VVVV3P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[95] ); + VVVV4P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[105] ); // Amplitude(s) for diagram number 622 - FFV1_0( w_fp[3], w_fp[77], w_fp[107], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[107], COUPs[1], 1.0, &_fp[0] ); jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; @@ -12996,7 +12996,7 @@ namespace mg5amcCpu jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &_fp[0] ); jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; @@ -13005,7 +13005,7 @@ namespace mg5amcCpu 
jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[105], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[105], COUPs[1], 1.0, &_fp[0] ); jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; @@ -13018,10 +13018,10 @@ namespace mg5amcCpu // *** DIAGRAM 623 OF 1240 *** // Wavefunction(s) for diagram number 623 - FFV1_1( w_fp[77], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[102] ); + FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] ); // Amplitude(s) for diagram number 623 - FFV1_0( w_fp[46], w_fp[102], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 623 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13032,10 +13032,10 @@ namespace mg5amcCpu // *** DIAGRAM 624 OF 1240 *** // Wavefunction(s) for diagram number 624 - FFV1_2( w_fp[46], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[88] ); + FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] ); // Amplitude(s) for diagram number 624 - FFV1_0( w_fp[88], w_fp[77], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[88], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 624 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13049,7 +13049,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 625 - FFV1_0( w_fp[46], w_fp[77], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[77], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 625 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13065,7 +13065,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 626 - FFV1_0( w_fp[38], w_fp[102], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 626 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13076,10 +13076,10 @@ namespace mg5amcCpu // *** DIAGRAM 627 OF 1240 *** // Wavefunction(s) for diagram number 627 - FFV1_2( w_fp[38], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[90] ); + FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] ); // Amplitude(s) for diagram number 627 - FFV1_0( w_fp[90], w_fp[77], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 627 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13093,7 +13093,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 628 - FFV1_0( w_fp[38], w_fp[77], w_fp[112], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[77], w_fp[112], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 628 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13109,7 +13109,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 629 - FFV1_0( w_fp[3], w_fp[102], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[102], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( 
channelId == 629 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13125,7 +13125,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 630 - FFV1_0( w_fp[99], w_fp[77], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 630 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13138,10 +13138,10 @@ namespace mg5amcCpu // *** DIAGRAM 631 OF 1240 *** // Wavefunction(s) for diagram number 631 - VVV1P0_1( w_fp[92], w_fp[24], COUPs[0], 0., 0., w_fp[102] ); + VVV1P0_1( w_fp[92], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[102] ); // Amplitude(s) for diagram number 631 - FFV1_0( w_fp[3], w_fp[77], w_fp[102], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 631 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13158,10 +13158,10 @@ namespace mg5amcCpu // *** DIAGRAM 632 OF 1240 *** // Wavefunction(s) for diagram number 632 - FFV1_1( w_fp[2], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[96] ); + FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[96] ); // Amplitude(s) for diagram number 632 - FFV1_0( w_fp[56], w_fp[96], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[56], w_fp[96], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 632 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13175,7 +13175,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 633 - FFV1_0( w_fp[22], w_fp[96], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[22], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 633 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13189,7 +13189,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 634 - VVV1_0( w_fp[112], w_fp[103], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[112], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 634 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13209,7 +13209,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 635 - FFV1_0( w_fp[22], w_fp[2], w_fp[112], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[22], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 635 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13225,7 +13225,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 636 - VVV1_0( w_fp[86], w_fp[103], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[86], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 636 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13245,7 +13245,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 637 - FFV1_0( w_fp[56], w_fp[2], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[56], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 637 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ 
-13261,7 +13261,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 638 - FFV1_0( w_fp[52], w_fp[2], w_fp[107], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[107], COUPs[1], 1.0, &_fp[0] ); jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; @@ -13270,7 +13270,7 @@ namespace mg5amcCpu jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; @@ -13279,7 +13279,7 @@ namespace mg5amcCpu jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[105], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[105], COUPs[1], 1.0, &_fp[0] ); jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; @@ -13292,10 +13292,10 @@ namespace mg5amcCpu // *** DIAGRAM 639 OF 1240 *** // Wavefunction(s) for diagram number 639 - FFV1_2( w_fp[52], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[104] ); + FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] ); // Amplitude(s) for diagram number 639 - FFV1_0( w_fp[104], w_fp[33], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 639 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13306,10 +13306,10 @@ namespace mg5amcCpu // *** DIAGRAM 640 OF 1240 *** // Wavefunction(s) for diagram number 640 - FFV1_1( w_fp[33], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[114] ); + FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] ); // Amplitude(s) for diagram number 640 - FFV1_0( w_fp[52], w_fp[114], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 640 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13323,7 +13323,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 641 - FFV1_0( w_fp[52], w_fp[33], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[33], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 641 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13339,7 +13339,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 642 - FFV1_0( w_fp[104], w_fp[39], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 642 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13350,10 +13350,10 @@ namespace mg5amcCpu // *** DIAGRAM 643 OF 1240 *** // Wavefunction(s) for diagram number 643 - FFV1_1( w_fp[39], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[106] ); + FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] ); // Amplitude(s) for diagram number 643 - FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[106], w_fp[4], 
COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 643 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13367,7 +13367,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 644 - FFV1_0( w_fp[52], w_fp[39], w_fp[112], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[39], w_fp[112], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 644 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13383,7 +13383,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 645 - FFV1_0( w_fp[104], w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 645 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13399,7 +13399,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 646 - FFV1_0( w_fp[52], w_fp[96], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[96], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 646 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13415,7 +13415,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 647 - FFV1_0( w_fp[52], w_fp[2], w_fp[102], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 647 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13435,7 +13435,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 648 - FFV1_0( w_fp[65], w_fp[96], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[96], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 648 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13451,7 +13451,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 649 - FFV1_0( w_fp[3], w_fp[96], w_fp[63], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[96], w_fp[63], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 649 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13471,7 +13471,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 650 - FFV1_0( w_fp[99], w_fp[93], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 650 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13487,7 +13487,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 651 - FFV1_0( w_fp[99], w_fp[2], w_fp[63], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 651 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13507,7 +13507,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 652 - FFV1_0( w_fp[3], w_fp[93], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[93], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 652 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) 
denominators_sv += cxabs2( amp_sv[0] ); @@ -13527,7 +13527,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 653 - FFV1_0( w_fp[65], w_fp[2], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 653 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13547,7 +13547,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 654 - VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[18] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; jamp_sv[31] -= amp_sv[0]; @@ -13564,7 +13564,7 @@ namespace mg5amcCpu jamp_sv[91] -= amp_sv[0]; jamp_sv[96] -= amp_sv[0]; jamp_sv[98] += amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[18] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; jamp_sv[22] -= amp_sv[0]; @@ -13581,7 +13581,7 @@ namespace mg5amcCpu jamp_sv[98] += amp_sv[0]; jamp_sv[100] += amp_sv[0]; jamp_sv[101] -= amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[22] -= amp_sv[0]; jamp_sv[23] += amp_sv[0]; jamp_sv[31] += amp_sv[0]; @@ -13602,10 +13602,10 @@ namespace mg5amcCpu // *** DIAGRAM 655 OF 1240 *** // Wavefunction(s) for diagram number 655 - VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 0., 0., w_fp[104] ); + VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[104] ); // Amplitude(s) for diagram number 655 - VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 655 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13630,10 +13630,10 @@ namespace mg5amcCpu // *** DIAGRAM 656 OF 1240 *** // Wavefunction(s) for diagram number 656 - VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 0., 0., w_fp[113] ); + VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[113] ); // Amplitude(s) for diagram number 656 - VVV1_0( w_fp[61], w_fp[5], w_fp[113], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[5], w_fp[113], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 656 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13661,7 +13661,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 657 - VVV1_0( w_fp[61], w_fp[8], w_fp[86], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 657 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13689,7 +13689,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 658 - FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 658 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13709,7 +13709,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 659 - FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 659 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13725,7 +13725,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 660 - FFV1_0( w_fp[99], w_fp[39], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 660 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13741,7 +13741,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 661 - FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 661 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13761,7 +13761,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 662 - FFV1_0( w_fp[38], w_fp[96], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[96], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 662 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13777,7 +13777,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 663 - FFV1_0( w_fp[90], w_fp[2], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 663 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13793,7 +13793,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 664 - FFV1_0( w_fp[71], w_fp[96], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[71], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 664 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13809,7 +13809,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 665 - FFV1_0( w_fp[3], w_fp[96], w_fp[69], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[96], w_fp[69], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 665 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13829,7 +13829,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 666 - FFV1_0( w_fp[99], w_fp[94], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 666 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13845,7 +13845,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 667 - FFV1_0( w_fp[99], w_fp[2], w_fp[69], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 667 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13865,7 +13865,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 668 - FFV1_0( w_fp[3], w_fp[94], w_fp[112], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[94], w_fp[112], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 668 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); 
@@ -13885,7 +13885,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 669 - FFV1_0( w_fp[71], w_fp[2], w_fp[112], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[71], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 669 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13905,7 +13905,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 670 - VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[19] += amp_sv[0]; jamp_sv[22] -= amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -13922,7 +13922,7 @@ namespace mg5amcCpu jamp_sv[82] -= amp_sv[0]; jamp_sv[97] -= amp_sv[0]; jamp_sv[100] += amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[19] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; jamp_sv[21] += amp_sv[0]; @@ -13939,7 +13939,7 @@ namespace mg5amcCpu jamp_sv[98] += amp_sv[0]; jamp_sv[99] -= amp_sv[0]; jamp_sv[100] += amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[20] -= amp_sv[0]; jamp_sv[21] += amp_sv[0]; jamp_sv[37] += amp_sv[0]; @@ -13960,10 +13960,10 @@ namespace mg5amcCpu // *** DIAGRAM 671 OF 1240 *** // Wavefunction(s) for diagram number 671 - VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 0., 0., w_fp[104] ); + VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] ); // Amplitude(s) for diagram number 671 - VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 671 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -13991,7 +13991,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 672 - VVV1_0( w_fp[66], w_fp[4], w_fp[113], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[66], w_fp[4], w_fp[113], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 672 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14019,7 +14019,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 673 - VVV1_0( w_fp[66], w_fp[8], w_fp[112], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[66], w_fp[8], w_fp[112], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 673 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14047,7 +14047,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 674 - FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 674 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14067,7 +14067,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 675 - FFV1_0( w_fp[3], w_fp[114], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[114], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 675 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14083,7 +14083,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for 
diagram number 676 - FFV1_0( w_fp[99], w_fp[33], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 676 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14099,7 +14099,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 677 - FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 677 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14119,7 +14119,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 678 - FFV1_0( w_fp[46], w_fp[96], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[96], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 678 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14135,7 +14135,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 679 - FFV1_0( w_fp[88], w_fp[2], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[88], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 679 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14148,10 +14148,10 @@ namespace mg5amcCpu // *** DIAGRAM 680 OF 1240 *** // Wavefunction(s) for diagram number 680 - VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 0., 0., w_fp[104] ); + VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] ); // Amplitude(s) for diagram number 680 - VVV1_0( w_fp[104], w_fp[13], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[104], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 680 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14179,7 +14179,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 681 - VVV1_0( w_fp[104], w_fp[10], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[104], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 681 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14207,7 +14207,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 682 - VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] ); jamp_sv[19] -= amp_sv[0]; jamp_sv[29] += amp_sv[0]; jamp_sv[43] -= amp_sv[0]; @@ -14224,7 +14224,7 @@ namespace mg5amcCpu jamp_sv[82] += amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[97] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] ); jamp_sv[18] -= amp_sv[0]; jamp_sv[28] += amp_sv[0]; jamp_sv[42] -= amp_sv[0]; @@ -14241,7 +14241,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[96] += amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] ); jamp_sv[18] -= amp_sv[0]; jamp_sv[19] += amp_sv[0]; jamp_sv[28] += amp_sv[0]; @@ -14265,7 +14265,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 683 - VVV1_0( w_fp[112], w_fp[108], w_fp[5], COUPs[0], &_fp[0] ); + 
VVV1_0( w_fp[112], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 683 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14293,7 +14293,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 684 - VVV1_0( w_fp[112], w_fp[1], w_fp[10], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[112], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 684 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14321,7 +14321,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 685 - VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] ); jamp_sv[21] += amp_sv[0]; jamp_sv[28] -= amp_sv[0]; jamp_sv[31] += amp_sv[0]; @@ -14338,7 +14338,7 @@ namespace mg5amcCpu jamp_sv[88] -= amp_sv[0]; jamp_sv[91] += amp_sv[0]; jamp_sv[99] -= amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] ); jamp_sv[20] -= amp_sv[0]; jamp_sv[21] += amp_sv[0]; jamp_sv[37] += amp_sv[0]; @@ -14355,7 +14355,7 @@ namespace mg5amcCpu jamp_sv[82] += amp_sv[0]; jamp_sv[98] += amp_sv[0]; jamp_sv[99] -= amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] ); jamp_sv[20] -= amp_sv[0]; jamp_sv[28] += amp_sv[0]; jamp_sv[31] -= amp_sv[0]; @@ -14379,7 +14379,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 686 - VVV1_0( w_fp[86], w_fp[108], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[86], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 686 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14407,7 +14407,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 687 - VVV1_0( w_fp[86], w_fp[1], w_fp[13], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[86], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 687 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14435,7 +14435,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 688 - VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] ); jamp_sv[23] += amp_sv[0]; jamp_sv[29] -= amp_sv[0]; jamp_sv[31] += amp_sv[0]; @@ -14452,7 +14452,7 @@ namespace mg5amcCpu jamp_sv[77] -= amp_sv[0]; jamp_sv[91] += amp_sv[0]; jamp_sv[101] -= amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] ); jamp_sv[22] -= amp_sv[0]; jamp_sv[23] += amp_sv[0]; jamp_sv[31] += amp_sv[0]; @@ -14469,7 +14469,7 @@ namespace mg5amcCpu jamp_sv[91] += amp_sv[0]; jamp_sv[100] += amp_sv[0]; jamp_sv[101] -= amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] ); jamp_sv[22] -= amp_sv[0]; jamp_sv[29] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -14490,12 +14490,12 @@ namespace mg5amcCpu // *** DIAGRAM 689 OF 1240 *** // Wavefunction(s) for diagram number 689 - VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[98] 
); - VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[62] ); - VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[101] ); + VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[98] ); + VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[62] ); + VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[101] ); // Amplitude(s) for diagram number 689 - VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] ); jamp_sv[18] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; jamp_sv[31] -= amp_sv[0]; @@ -14512,7 +14512,7 @@ namespace mg5amcCpu jamp_sv[91] -= amp_sv[0]; jamp_sv[96] -= amp_sv[0]; jamp_sv[98] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); jamp_sv[20] -= amp_sv[0]; jamp_sv[28] += amp_sv[0]; jamp_sv[31] -= amp_sv[0]; @@ -14529,7 +14529,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[91] -= amp_sv[0]; jamp_sv[98] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[101], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[101], COUPs[0], 1.0, &_fp[0] ); jamp_sv[18] -= amp_sv[0]; jamp_sv[28] += amp_sv[0]; jamp_sv[42] -= amp_sv[0]; @@ -14550,12 +14550,12 @@ namespace mg5amcCpu // *** DIAGRAM 690 OF 1240 *** // Wavefunction(s) for diagram number 690 - VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[109] ); - VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[110] ); - VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[111] ); + VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] ); + VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] ); + VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] ); // Amplitude(s) for diagram number 690 - VVV1_0( w_fp[8], w_fp[4], w_fp[109], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[109], COUPs[0], 1.0, &_fp[0] ); jamp_sv[19] += amp_sv[0]; jamp_sv[22] -= amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -14572,7 +14572,7 @@ namespace mg5amcCpu jamp_sv[82] -= amp_sv[0]; jamp_sv[97] -= amp_sv[0]; jamp_sv[100] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[110], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[110], COUPs[0], 1.0, &_fp[0] ); jamp_sv[22] -= amp_sv[0]; jamp_sv[29] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -14589,7 +14589,7 @@ namespace mg5amcCpu jamp_sv[76] += amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[100] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[111], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[111], COUPs[0], 1.0, &_fp[0] ); jamp_sv[19] -= amp_sv[0]; jamp_sv[29] += amp_sv[0]; jamp_sv[43] -= amp_sv[0]; @@ -14613,7 +14613,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 691 - VVV1_0( w_fp[1], w_fp[8], w_fp[107], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[107], COUPs[0], 1.0, &_fp[0] ); jamp_sv[21] -= amp_sv[0]; jamp_sv[23] += amp_sv[0]; jamp_sv[28] += amp_sv[0]; @@ -14630,7 +14630,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[99] += amp_sv[0]; jamp_sv[101] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &_fp[0] ); jamp_sv[23] += amp_sv[0]; jamp_sv[29] -= amp_sv[0]; jamp_sv[31] += amp_sv[0]; @@ -14647,7 +14647,7 @@ namespace mg5amcCpu jamp_sv[77] -= amp_sv[0]; jamp_sv[91] += amp_sv[0]; jamp_sv[101] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[105], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], 
w_fp[105], COUPs[0], 1.0, &_fp[0] ); jamp_sv[21] += amp_sv[0]; jamp_sv[28] -= amp_sv[0]; jamp_sv[31] += amp_sv[0]; @@ -14671,7 +14671,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 692 - VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] ); jamp_sv[18] += amp_sv[0]; jamp_sv[19] -= amp_sv[0]; jamp_sv[28] -= amp_sv[0]; @@ -14688,7 +14688,7 @@ namespace mg5amcCpu jamp_sv[88] -= amp_sv[0]; jamp_sv[96] -= amp_sv[0]; jamp_sv[97] += amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] ); jamp_sv[18] += amp_sv[0]; jamp_sv[19] -= amp_sv[0]; jamp_sv[21] -= amp_sv[0]; @@ -14705,7 +14705,7 @@ namespace mg5amcCpu jamp_sv[97] += amp_sv[0]; jamp_sv[99] += amp_sv[0]; jamp_sv[101] -= amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] ); jamp_sv[21] -= amp_sv[0]; jamp_sv[23] += amp_sv[0]; jamp_sv[28] += amp_sv[0]; @@ -14729,7 +14729,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 693 - VVV1_0( w_fp[8], w_fp[24], w_fp[104], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[24], w_fp[104], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 693 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14757,7 +14757,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 694 - VVV1_0( w_fp[1], w_fp[24], w_fp[113], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[24], w_fp[113], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 694 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14785,7 +14785,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 695 - VVV1_0( w_fp[1], w_fp[8], w_fp[102], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[102], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 695 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14813,7 +14813,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 696 - VVV1_0( w_fp[104], w_fp[37], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[104], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 696 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14833,7 +14833,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 697 - FFV1_0( w_fp[3], w_fp[35], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[35], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 697 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14849,7 +14849,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 698 - FFV1_0( w_fp[99], w_fp[100], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 698 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14863,7 +14863,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 699 - FFV1_0( w_fp[99], w_fp[35], w_fp[1], COUPs[1], &_fp[0] ); + 
FFV1_0( w_fp[99], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 699 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14877,7 +14877,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 700 - FFV1_0( w_fp[3], w_fp[100], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[100], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 700 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14893,7 +14893,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 701 - VVV1_0( w_fp[86], w_fp[1], w_fp[37], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[86], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 701 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14913,7 +14913,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 702 - FFV1_0( w_fp[3], w_fp[33], w_fp[109], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[109], COUPs[1], 1.0, &_fp[0] ); jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; @@ -14922,7 +14922,7 @@ namespace mg5amcCpu jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[110], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[110], COUPs[1], 1.0, &_fp[0] ); jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; @@ -14931,7 +14931,7 @@ namespace mg5amcCpu jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[111], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[111], COUPs[1], 1.0, &_fp[0] ); jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; @@ -14947,7 +14947,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 703 - FFV1_0( w_fp[38], w_fp[33], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 703 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14963,7 +14963,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 704 - FFV1_0( w_fp[38], w_fp[114], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 704 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14977,7 +14977,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 705 - FFV1_0( w_fp[90], w_fp[33], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 705 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -14991,7 +14991,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 706 - VVV1_0( w_fp[104], w_fp[45], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[104], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 706 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -15011,7 +15011,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 707 - FFV1_0( w_fp[3], w_fp[43], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[43], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 707 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -15027,7 +15027,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 708 - FFV1_0( w_fp[99], w_fp[89], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 708 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -15041,7 +15041,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 709 - FFV1_0( w_fp[99], w_fp[43], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 709 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -15055,7 +15055,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 710 - FFV1_0( w_fp[3], w_fp[89], w_fp[112], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[89], w_fp[112], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 710 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -15071,7 +15071,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 711 - VVV1_0( w_fp[112], w_fp[1], w_fp[45], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[112], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 711 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -15091,7 +15091,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 712 - FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &_fp[0] ); jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; @@ -15100,7 +15100,7 @@ namespace mg5amcCpu jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], 1.0, &_fp[0] ); jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; @@ -15109,7 +15109,7 @@ namespace mg5amcCpu jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[101], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[101], COUPs[1], 1.0, &_fp[0] ); jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; @@ -15125,7 +15125,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 713 - FFV1_0( w_fp[46], w_fp[39], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 713 ) numerators_sv += cxabs2( 
amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15141,7 +15141,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 714
-    FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 714 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15155,7 +15155,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 715
-    FFV1_0( w_fp[88], w_fp[39], w_fp[1], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[88], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 715 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15169,7 +15169,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 716
-    VVV1_0( w_fp[104], w_fp[54], w_fp[5], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[104], w_fp[54], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 716 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15189,7 +15189,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 717
-    FFV1_0( w_fp[7], w_fp[2], w_fp[104], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[7], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 717 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15205,7 +15205,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 718
-    FFV1_0( w_fp[78], w_fp[96], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[78], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 718 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15219,7 +15219,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 719
-    FFV1_0( w_fp[7], w_fp[96], w_fp[1], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[7], w_fp[96], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 719 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15233,7 +15233,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 720
-    FFV1_0( w_fp[78], w_fp[2], w_fp[86], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[78], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 720 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15249,7 +15249,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 721
-    VVV1_0( w_fp[86], w_fp[1], w_fp[54], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[86], w_fp[1], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 721 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15269,7 +15269,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 722
-    FFV1_0( w_fp[46], w_fp[2], w_fp[109], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[46], w_fp[2], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
@@ -15278,7 +15278,7 @@ namespace mg5amcCpu
     jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[110], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[46], w_fp[2], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
@@ -15287,7 +15287,7 @@ namespace mg5amcCpu
     jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[111], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[46], w_fp[2], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
@@ -15303,7 +15303,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 723
-    VVV1_0( w_fp[104], w_fp[23], w_fp[4], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[104], w_fp[23], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 723 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15323,7 +15323,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 724
-    FFV1_0( w_fp[25], w_fp[2], w_fp[104], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[25], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 724 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15339,7 +15339,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 725
-    FFV1_0( w_fp[58], w_fp[96], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[58], w_fp[96], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 725 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15353,7 +15353,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 726
-    FFV1_0( w_fp[25], w_fp[96], w_fp[1], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[25], w_fp[96], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 726 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15367,7 +15367,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 727
-    FFV1_0( w_fp[58], w_fp[2], w_fp[112], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[58], w_fp[2], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 727 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15383,7 +15383,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 728
-    VVV1_0( w_fp[112], w_fp[1], w_fp[23], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[112], w_fp[1], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 728 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15403,7 +15403,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 729
-    FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
@@ -15412,7 +15412,7 @@ namespace mg5amcCpu
     jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
@@ -15421,7 +15421,7 @@ namespace mg5amcCpu
     jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[101], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[38], w_fp[2], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
@@ -15437,7 +15437,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 730
-    FFV1_0( w_fp[3], w_fp[17], w_fp[104], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[17], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 730 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15457,7 +15457,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 731
-    FFV1_0( w_fp[26], w_fp[2], w_fp[104], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[26], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 731 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15477,7 +15477,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 732
-    FFV1_0( w_fp[3], w_fp[96], w_fp[59], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[96], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 732 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15497,7 +15497,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 733
-    FFV1_0( w_fp[26], w_fp[96], w_fp[1], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[26], w_fp[96], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 733 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15513,7 +15513,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 734
-    FFV1_0( w_fp[99], w_fp[2], w_fp[59], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[99], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 734 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15533,7 +15533,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 735
-    FFV1_0( w_fp[99], w_fp[17], w_fp[1], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[99], w_fp[17], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 735 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15549,7 +15549,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 736
-    FFV1_0( w_fp[3], w_fp[96], w_fp[73], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[96], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
@@ -15558,7 +15558,7 @@ namespace mg5amcCpu
     jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[96], w_fp[79], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[96], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
@@ -15567,7 +15567,7 @@ namespace mg5amcCpu
     jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[96], w_fp[80], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[96], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15583,7 +15583,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 737
-    FFV1_0( w_fp[99], w_fp[2], w_fp[73], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[99], w_fp[2], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
@@ -15592,7 +15592,7 @@ namespace mg5amcCpu
     jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[99], w_fp[2], w_fp[79], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[99], w_fp[2], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15601,7 +15601,7 @@ namespace mg5amcCpu
     jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[99], w_fp[2], w_fp[80], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[99], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -15617,7 +15617,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 738
-    VVV1_0( w_fp[92], w_fp[73], w_fp[8], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[92], w_fp[73], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
     jamp_sv[18] -= amp_sv[0];
     jamp_sv[19] += amp_sv[0];
     jamp_sv[21] += amp_sv[0];
@@ -15634,7 +15634,7 @@ namespace mg5amcCpu
     jamp_sv[97] -= amp_sv[0];
     jamp_sv[99] -= amp_sv[0];
     jamp_sv[101] += amp_sv[0];
-    VVV1_0( w_fp[92], w_fp[79], w_fp[8], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[92], w_fp[79], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
     jamp_sv[19] += amp_sv[0];
     jamp_sv[20] -= amp_sv[0];
     jamp_sv[21] += amp_sv[0];
@@ -15651,7 +15651,7 @@ namespace mg5amcCpu
     jamp_sv[98] += amp_sv[0];
     jamp_sv[99] -= amp_sv[0];
     jamp_sv[100] += amp_sv[0];
-    VVV1_0( w_fp[92], w_fp[80], w_fp[8], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[92], w_fp[80], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
     jamp_sv[18] += amp_sv[0];
     jamp_sv[20] -= amp_sv[0];
     jamp_sv[22] -= amp_sv[0];
@@ -15672,10 +15672,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 739 OF 1240 ***
 
     // Wavefunction(s) for diagram number 739
-    FFV1_1( w_fp[77], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[92] );
+    FFV1_1( w_fp[77], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[92] );
 
     // Amplitude(s) for diagram number 739
-    FFV1_0( w_fp[7], w_fp[92], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[7], w_fp[92], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 739 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15688,7 +15688,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 740
-    FFV1_0( w_fp[53], w_fp[92], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[53], w_fp[92], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 740 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15698,10 +15698,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 741 OF 1240 ***
 
     // Wavefunction(s) for diagram number 741
-    FFV1_2( w_fp[46], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[99] );
+    FFV1_2( w_fp[46], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
 
     // Amplitude(s) for diagram number 741
-    FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 741 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15714,7 +15714,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 742
-    FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 742 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15727,7 +15727,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 743
-    FFV1_0( w_fp[53], w_fp[9], w_fp[0], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[53], w_fp[9], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 743 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15740,7 +15740,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 744
-    FFV1_0( w_fp[7], w_fp[85], w_fp[0], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[7], w_fp[85], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 744 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15753,7 +15753,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 745
-    FFV1_0( w_fp[46], w_fp[92], w_fp[29], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[46], w_fp[92], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 745 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15767,7 +15767,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 746
-    FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 746 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15778,10 +15778,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 747 OF 1240 ***
 
     // Wavefunction(s) for diagram number 747
-    VVV1P0_1( w_fp[0], w_fp[29], COUPs[0], 0., 0., w_fp[96] );
+    VVV1P0_1( w_fp[0], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[96] );
 
     // Amplitude(s) for diagram number 747
-    FFV1_0( w_fp[46], w_fp[77], w_fp[96], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[46], w_fp[77], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 747 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15797,7 +15797,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 748
-    FFV1_0( w_fp[25], w_fp[92], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[25], w_fp[92], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 748 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15810,7 +15810,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 749
-    FFV1_0( w_fp[48], w_fp[92], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[48], w_fp[92], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 749 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15820,10 +15820,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 750 OF 1240 ***
 
     // Wavefunction(s) for diagram number 750
-    FFV1_2( w_fp[38], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[104] );
+    FFV1_2( w_fp[38], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] );
 
     // Amplitude(s) for diagram number 750
-    FFV1_0( w_fp[104], w_fp[87], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[104], w_fp[87], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 750 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15836,7 +15836,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 751
-    FFV1_0( w_fp[104], w_fp[85], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[104], w_fp[85], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 751 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15849,7 +15849,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 752
-    FFV1_0( w_fp[48], w_fp[87], w_fp[0], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[48], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 752 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15862,7 +15862,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 753
-    FFV1_0( w_fp[25], w_fp[85], w_fp[0], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[25], w_fp[85], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 753 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15875,7 +15875,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 754
-    FFV1_0( w_fp[38], w_fp[92], w_fp[27], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[38], w_fp[92], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 754 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15889,7 +15889,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 755
-    FFV1_0( w_fp[104], w_fp[77], w_fp[27], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[104], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 755 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15900,10 +15900,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 756 OF 1240 ***
 
     // Wavefunction(s) for diagram number 756
-    VVV1P0_1( w_fp[0], w_fp[27], COUPs[0], 0., 0., w_fp[101] );
+    VVV1P0_1( w_fp[0], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[101] );
 
     // Amplitude(s) for diagram number 756
-    FFV1_0( w_fp[38], w_fp[77], w_fp[101], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[38], w_fp[77], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 756 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15919,7 +15919,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 757
-    FFV1_0( w_fp[28], w_fp[92], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[28], w_fp[92], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 757 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15932,7 +15932,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 758
-    FFV1_0( w_fp[40], w_fp[92], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[40], w_fp[92], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 758 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15942,10 +15942,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 759 OF 1240 ***
 
     // Wavefunction(s) for diagram number 759
-    FFV1_2( w_fp[41], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[62] );
+    FFV1_2( w_fp[41], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
 
     // Amplitude(s) for diagram number 759
-    FFV1_0( w_fp[62], w_fp[87], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[62], w_fp[87], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 759 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15958,7 +15958,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 760
-    FFV1_0( w_fp[62], w_fp[9], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[62], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 760 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15971,7 +15971,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 761
-    FFV1_0( w_fp[40], w_fp[87], w_fp[0], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[40], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 761 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15984,7 +15984,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 762
-    FFV1_0( w_fp[28], w_fp[9], w_fp[0], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[28], w_fp[9], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 762 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -15997,7 +15997,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 763
-    FFV1_0( w_fp[41], w_fp[92], w_fp[24], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[41], w_fp[92], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 763 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16011,7 +16011,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 764
-    FFV1_0( w_fp[62], w_fp[77], w_fp[24], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[62], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 764 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16022,10 +16022,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 765 OF 1240 ***
 
     // Wavefunction(s) for diagram number 765
-    VVV1P0_1( w_fp[0], w_fp[24], COUPs[0], 0., 0., w_fp[98] );
+    VVV1P0_1( w_fp[0], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[98] );
 
     // Amplitude(s) for diagram number 765
-    FFV1_0( w_fp[41], w_fp[77], w_fp[98], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[41], w_fp[77], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 765 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16041,7 +16041,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 766
-    FFV1_0( w_fp[26], w_fp[92], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[26], w_fp[92], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 766 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16055,7 +16055,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 767
-    FFV1_0( w_fp[3], w_fp[92], w_fp[42], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[92], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 767 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16071,7 +16071,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 768
-    VVV1_0( w_fp[98], w_fp[34], w_fp[6], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[98], w_fp[34], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 768 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16091,7 +16091,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 769
-    FFV1_0( w_fp[3], w_fp[85], w_fp[98], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[85], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 769 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16107,7 +16107,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 770
-    VVV1_0( w_fp[0], w_fp[34], w_fp[42], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[0], w_fp[34], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 770 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16127,7 +16127,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 771
-    FFV1_0( w_fp[26], w_fp[85], w_fp[0], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[26], w_fp[85], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 771 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16138,12 +16138,12 @@ namespace mg5amcCpu
     // *** DIAGRAM 772 OF 1240 ***
 
     // Wavefunction(s) for diagram number 772
-    VVVV1P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 0., 0., w_fp[85] );
-    VVVV3P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 0., 0., w_fp[112] );
-    VVVV4P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 0., 0., w_fp[111] );
+    VVVV1P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[85] );
+    VVVV3P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] );
+    VVVV4P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
 
     // Amplitude(s) for diagram number 772
-    FFV1_0( w_fp[3], w_fp[77], w_fp[85], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[85], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
@@ -16152,7 +16152,7 @@ namespace mg5amcCpu
     jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[112], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
    jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16161,7 +16161,7 @@ namespace mg5amcCpu
     jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16177,7 +16177,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 773
-    FFV1_0( w_fp[14], w_fp[92], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[14], w_fp[92], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 773 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16191,7 +16191,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 774
-    FFV1_0( w_fp[3], w_fp[92], w_fp[16], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[92], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 774 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16207,7 +16207,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 775
-    VVV1_0( w_fp[101], w_fp[34], w_fp[5], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[101], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 775 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16227,7 +16227,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 776
-    FFV1_0( w_fp[3], w_fp[9], w_fp[101], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[9], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 776 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16243,7 +16243,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 777
-    VVV1_0( w_fp[0], w_fp[34], w_fp[16], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[0], w_fp[34], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 777 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16263,7 +16263,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 778
-    FFV1_0( w_fp[14], w_fp[9], w_fp[0], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[14], w_fp[9], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 778 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16274,12 +16274,12 @@ namespace mg5amcCpu
     // *** DIAGRAM 779 OF 1240 ***
 
     // Wavefunction(s) for diagram number 779
-    VVVV1P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 0., 0., w_fp[9] );
-    VVVV3P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 0., 0., w_fp[110] );
-    VVVV4P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 0., 0., w_fp[109] );
+    VVVV1P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[9] );
+    VVVV3P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
+    VVVV4P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
 
     // Amplitude(s) for diagram number 779
-    FFV1_0( w_fp[3], w_fp[77], w_fp[9], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16288,7 +16288,7 @@ namespace mg5amcCpu
     jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16297,7 +16297,7 @@ namespace mg5amcCpu
     jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16313,7 +16313,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 780
-    FFV1_0( w_fp[12], w_fp[92], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[12], w_fp[92], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 780 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16327,7 +16327,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 781
-    FFV1_0( w_fp[3], w_fp[92], w_fp[19], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[92], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 781 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16343,7 +16343,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 782
-    VVV1_0( w_fp[96], w_fp[34], w_fp[4], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[96], w_fp[34], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 782 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16363,7 +16363,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 783
-    FFV1_0( w_fp[3], w_fp[87], w_fp[96], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[87], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 783 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16379,7 +16379,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 784
-    VVV1_0( w_fp[0], w_fp[34], w_fp[19], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[0], w_fp[34], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 784 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16399,7 +16399,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 785
-    FFV1_0( w_fp[12], w_fp[87], w_fp[0], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[12], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 785 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16410,12 +16410,12 @@ namespace mg5amcCpu
     // *** DIAGRAM 786 OF 1240 ***
 
     // Wavefunction(s) for diagram number 786
-    VVVV1P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 0., 0., w_fp[87] );
-    VVVV3P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 0., 0., w_fp[34] );
-    VVVV4P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 0., 0., w_fp[86] );
+    VVVV1P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[87] );
+    VVVV3P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[34] );
+    VVVV4P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[86] );
 
     // Amplitude(s) for diagram number 786
-    FFV1_0( w_fp[3], w_fp[77], w_fp[87], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[87], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
@@ -16424,7 +16424,7 @@ namespace mg5amcCpu
     jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[34], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[34], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16433,7 +16433,7 @@ namespace mg5amcCpu
     jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[86], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16449,17 +16449,17 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 787
-    FFV1_0( w_fp[3], w_fp[92], w_fp[30], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[92], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[24] += amp_sv[0];
     jamp_sv[25] -= amp_sv[0];
     jamp_sv[27] -= amp_sv[0];
     jamp_sv[29] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[92], w_fp[31], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[92], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[25] -= amp_sv[0];
     jamp_sv[26] += amp_sv[0];
     jamp_sv[27] -= amp_sv[0];
     jamp_sv[28] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[92], w_fp[32], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[92], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[24] -= amp_sv[0];
     jamp_sv[26] += amp_sv[0];
     jamp_sv[28] += amp_sv[0];
@@ -16468,12 +16468,12 @@ namespace mg5amcCpu
    // *** DIAGRAM 788 OF 1240 ***

    // Wavefunction(s) for diagram number 788
-    VVV1P0_1( w_fp[0], w_fp[30], COUPs[0], 0., 0., w_fp[92] );
-    VVV1P0_1( w_fp[0], w_fp[31], COUPs[0], 0., 0., w_fp[88] );
-    VVV1P0_1( w_fp[0], w_fp[32], COUPs[0], 0., 0., w_fp[106] );
+    VVV1P0_1( w_fp[0], w_fp[30], COUPs[0], 1.0, 0., 0., w_fp[92] );
+    VVV1P0_1( w_fp[0], w_fp[31], COUPs[0], 1.0, 0., 0., w_fp[88] );
+    VVV1P0_1( w_fp[0], w_fp[32], COUPs[0], 1.0, 0., 0., w_fp[106] );
 
     // Amplitude(s) for diagram number 788
-    FFV1_0( w_fp[3], w_fp[77], w_fp[92], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[92], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
@@ -16482,7 +16482,7 @@ namespace mg5amcCpu
     jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
@@ -16491,7 +16491,7 @@ namespace mg5amcCpu
     jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[106], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[106], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16504,10 +16504,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 789 OF 1240 ***
 
     // Wavefunction(s) for diagram number 789
-    FFV1_2( w_fp[52], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[90] );
+    FFV1_2( w_fp[52], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] );
 
     // Amplitude(s) for diagram number 789
-    FFV1_0( w_fp[90], w_fp[35], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[90], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 789 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16520,7 +16520,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 790
-    FFV1_0( w_fp[90], w_fp[36], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[90], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 790 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16530,10 +16530,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 791 OF 1240 ***
 
     // Wavefunction(s) for diagram number 791
-    FFV1_1( w_fp[33], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[114] );
+    FFV1_1( w_fp[33], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
 
     // Amplitude(s) for diagram number 791
-    FFV1_0( w_fp[22], w_fp[114], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[22], w_fp[114], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 791 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16546,7 +16546,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 792
-    FFV1_0( w_fp[21], w_fp[114], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[21], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 792 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16559,7 +16559,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 793
-    FFV1_0( w_fp[22], w_fp[36], w_fp[0], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[22], w_fp[36], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 793 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16572,7 +16572,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 794
-    FFV1_0( w_fp[21], w_fp[35], w_fp[0], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[21], w_fp[35], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 794 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16585,7 +16585,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 795
-    FFV1_0( w_fp[90], w_fp[33], w_fp[29], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[90], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 795 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16599,7 +16599,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 796
-    FFV1_0( w_fp[52], w_fp[114], w_fp[29], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[52], w_fp[114], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 796 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16613,7 +16613,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 797
-    FFV1_0( w_fp[52], w_fp[33], w_fp[96], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[52], w_fp[33], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 797 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16629,7 +16629,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 798
-    FFV1_0( w_fp[90], w_fp[43], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[90], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 798 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16642,7 +16642,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 799
-    FFV1_0( w_fp[90], w_fp[44], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[90], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 799 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16652,10 +16652,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 800 OF 1240 ***
 
     // Wavefunction(s) for diagram number 800
-    FFV1_1( w_fp[39], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[102] );
+    FFV1_1( w_fp[39], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
 
     // Amplitude(s) for diagram number 800
-    FFV1_0( w_fp[56], w_fp[102], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[56], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 800 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16668,7 +16668,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 801
-    FFV1_0( w_fp[21], w_fp[102], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[21], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 801 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16681,7 +16681,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 802
-    FFV1_0( w_fp[56], w_fp[44], w_fp[0], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[56], w_fp[44], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 802 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16694,7 +16694,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 803
-    FFV1_0( w_fp[21], w_fp[43], w_fp[0], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[21], w_fp[43], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 803 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16707,7 +16707,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 804
-    FFV1_0( w_fp[90], w_fp[39], w_fp[27], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[90], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 804 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16721,7 +16721,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 805
-    FFV1_0( w_fp[52], w_fp[102], w_fp[27], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[52], w_fp[102], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 805 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16735,7 +16735,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 806
-    FFV1_0( w_fp[52], w_fp[39], w_fp[101], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[52], w_fp[39], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 806 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16751,7 +16751,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 807
-    FFV1_0( w_fp[90], w_fp[49], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[90], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 807 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16764,7 +16764,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 808
-    FFV1_0( w_fp[90], w_fp[50], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[90], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 808 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16774,10 +16774,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 809 OF 1240 ***
 
     // Wavefunction(s) for diagram number 809
-    FFV1_1( w_fp[47], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[113] );
+    FFV1_1( w_fp[47], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] );
 
     // Amplitude(s) for diagram number 809
-    FFV1_0( w_fp[56], w_fp[113], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[56], w_fp[113], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 809 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16790,7 +16790,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 810
-    FFV1_0( w_fp[22], w_fp[113], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[22], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 810 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16803,7 +16803,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 811
-    FFV1_0( w_fp[56], w_fp[50], w_fp[0], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[56], w_fp[50], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 811 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16816,7 +16816,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 812
-    FFV1_0( w_fp[22], w_fp[49], w_fp[0], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[22], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 812 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16829,7 +16829,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 813
-    FFV1_0( w_fp[90], w_fp[47], w_fp[24], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[90], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 813 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16843,7 +16843,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 814
-    FFV1_0( w_fp[52], w_fp[113], w_fp[24], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[52], w_fp[113], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 814 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16857,7 +16857,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 815
-    FFV1_0( w_fp[52], w_fp[47], w_fp[98], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[52], w_fp[47], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 815 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16873,7 +16873,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 816
-    FFV1_0( w_fp[90], w_fp[17], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[90], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 816 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16887,7 +16887,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 817
-    FFV1_0( w_fp[90], w_fp[2], w_fp[42], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[90], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 817 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16903,7 +16903,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 818
-    VVV1_0( w_fp[98], w_fp[103], w_fp[6], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[98], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 818 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16923,7 +16923,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 819
-    FFV1_0( w_fp[21], w_fp[2], w_fp[98], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[21], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 819 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16939,7 +16939,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 820
-    VVV1_0( w_fp[0], w_fp[103], w_fp[42], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[0], w_fp[103], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 820 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16959,7 +16959,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 821
-    FFV1_0( w_fp[21], w_fp[17], w_fp[0], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[21], w_fp[17], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 821 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -16973,7 +16973,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 822
-    FFV1_0( w_fp[52], w_fp[2], w_fp[85], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[85], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
@@ -16982,7 +16982,7 @@ namespace mg5amcCpu
     jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[112], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16991,7 +16991,7 @@ namespace mg5amcCpu
     jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17007,7 +17007,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 823
-    FFV1_0( w_fp[90], w_fp[15], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[90], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 823 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -17021,7 +17021,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 824
-    FFV1_0( w_fp[90], w_fp[2], w_fp[16], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[90], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 824 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -17037,7 +17037,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 825
-    VVV1_0( w_fp[101], w_fp[103], w_fp[5], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[101], w_fp[103], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 825 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -17057,7 +17057,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 826
-    FFV1_0( w_fp[22], w_fp[2], w_fp[101], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[22], w_fp[2], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 826 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -17073,7 +17073,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 827
-    VVV1_0( w_fp[0], w_fp[103], w_fp[16], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[0], w_fp[103], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 827 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -17093,7 +17093,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 828
-    FFV1_0( w_fp[22], w_fp[15], w_fp[0], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[22], w_fp[15], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 828 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -17107,7 +17107,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 829
-    FFV1_0( w_fp[52], w_fp[2], w_fp[9], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17116,7 +17116,7 @@ namespace mg5amcCpu
     jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17125,7 +17125,7 @@ namespace mg5amcCpu
     jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17141,7 +17141,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 830
-    FFV1_0( w_fp[90], w_fp[18], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[90], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 830 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -17155,7 +17155,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 831
-    FFV1_0( w_fp[90], w_fp[2], w_fp[19], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[90], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 831 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -17171,7 +17171,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 832
-    VVV1_0( w_fp[96], w_fp[103], w_fp[4], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[96], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 832 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -17191,7 +17191,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 833
-    FFV1_0( w_fp[56], w_fp[2], w_fp[96], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[56], w_fp[2], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 833 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -17207,7 +17207,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 834
-    VVV1_0( w_fp[0], w_fp[103], w_fp[19], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[0], w_fp[103], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 834 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -17227,7 +17227,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 835
-    FFV1_0( w_fp[56], w_fp[18], w_fp[0], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[56], w_fp[18], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 835 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -17241,7 +17241,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 836
-    FFV1_0( w_fp[52], w_fp[2], w_fp[87], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[87], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
@@ -17250,7 +17250,7 @@ namespace mg5amcCpu
     jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[34], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[34], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17259,7 +17259,7 @@ namespace mg5amcCpu
     jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[86], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17275,17 +17275,17 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 837
-    FFV1_0( w_fp[90], w_fp[2], w_fp[30], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[90], w_fp[2], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[64] += amp_sv[0];
     jamp_sv[70] -= amp_sv[0];
     jamp_sv[94] -= amp_sv[0];
     jamp_sv[118] += amp_sv[0];
-    FFV1_0( w_fp[90], w_fp[2], w_fp[31], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[90], w_fp[2], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[70] -= amp_sv[0];
     jamp_sv[88] += amp_sv[0];
     jamp_sv[94] -= amp_sv[0];
     jamp_sv[112] += amp_sv[0];
-    FFV1_0( w_fp[90], w_fp[2], w_fp[32], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[90], w_fp[2], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[64] -= amp_sv[0];
     jamp_sv[88] += amp_sv[0];
     jamp_sv[112] += amp_sv[0];
@@ -17297,7 +17297,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 838
-    FFV1_0( w_fp[52], w_fp[2], w_fp[92], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[92], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
@@ -17306,7 +17306,7 @@ namespace mg5amcCpu
     jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
@@ -17315,7 +17315,7 @@ namespace mg5amcCpu
     jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[106], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[106], COUPs[1], 1.0, &amp_fp[0] );
     jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17328,10 +17328,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 839 OF 1240 ***
 
     // Wavefunction(s) for diagram number 839
-    VVV1P0_1( w_fp[0], w_fp[61], COUPs[0], 0., 0., w_fp[90] );
+    VVV1P0_1( w_fp[0], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[90] );
 
     // Amplitude(s) for diagram number 839
-    VVV1_0( w_fp[90], w_fp[10], w_fp[6], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[90], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 839 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -17359,7 +17359,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 840
-    VVV1_0( w_fp[90], w_fp[11], w_fp[5], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[90], w_fp[11], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 840 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -17387,7 +17387,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 841
-    VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], &amp_fp[0] );
+    VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &amp_fp[0] );
     jamp_sv[1] -= amp_sv[0];
     jamp_sv[7] += amp_sv[0];
     jamp_sv[31] += amp_sv[0];
@@ -17404,7 +17404,7 @@ namespace mg5amcCpu
     jamp_sv[98] -= amp_sv[0];
     jamp_sv[104] -= amp_sv[0];
     jamp_sv[110] += amp_sv[0];
-    VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], &amp_fp[0] );
+    VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &amp_fp[0] );
     jamp_sv[0] -= amp_sv[0];
     jamp_sv[6] += amp_sv[0];
     jamp_sv[30] += amp_sv[0];
@@ -17421,7 +17421,7 @@ namespace mg5amcCpu
     jamp_sv[115] += amp_sv[0];
     jamp_sv[117] += amp_sv[0];
    jamp_sv[119] -= amp_sv[0];
-    VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], &amp_fp[0] );
+    VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &amp_fp[0] );
     jamp_sv[0] -= amp_sv[0];
     jamp_sv[1] += amp_sv[0];
     jamp_sv[6] += amp_sv[0];
@@ -17442,10 +17442,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 842 OF 1240 ***
 
     // Wavefunction(s) for diagram number 842
-    VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 0., 0., w_fp[56] );
+    VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[56] );
 
     // Amplitude(s) for diagram number 842
-    VVV1_0( w_fp[56], w_fp[63], w_fp[6], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[56], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 842 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -17473,7 +17473,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 843
-    VVV1_0( w_fp[56], w_fp[64], w_fp[5], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[56], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 843 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -17501,7 +17501,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 844
-    VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], &amp_fp[0] );
+    VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
     jamp_sv[0] += amp_sv[0];
     jamp_sv[6] -= amp_sv[0];
     jamp_sv[12] -= amp_sv[0];
@@ -17518,7 +17518,7 @@ namespace mg5amcCpu
     jamp_sv[111] -= amp_sv[0];
     jamp_sv[117] -= amp_sv[0];
     jamp_sv[119] += amp_sv[0];
-    VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], &amp_fp[0] );
+    VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
     jamp_sv[1] += amp_sv[0];
     jamp_sv[7] -= amp_sv[0];
     jamp_sv[12] -= amp_sv[0];
@@ -17535,7 +17535,7 @@ namespace mg5amcCpu
     jamp_sv[95] += amp_sv[0];
     jamp_sv[105] += amp_sv[0];
     jamp_sv[111] -= amp_sv[0];
-    VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], &amp_fp[0] );
+    VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
     jamp_sv[0] -= amp_sv[0];
     jamp_sv[1] += amp_sv[0];
     jamp_sv[6] += amp_sv[0];
@@ -17559,7 +17559,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 845
-    VVV1_0( w_fp[0], w_fp[63], w_fp[11], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[0], w_fp[63], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 845 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -17587,7 +17587,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 846
-    VVV1_0( w_fp[0], w_fp[64], w_fp[10], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[0], w_fp[64], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId == 846 ) numerators_sv += cxabs2( amp_sv[0] );
     if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -17612,12 +17612,12 @@ namespace mg5amcCpu
     // *** DIAGRAM 847 OF 1240 ***
 
     // Wavefunction(s) for diagram number 847
-    VVVV1P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 0., 0., w_fp[103] );
-    VVVV3P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 0., 0., w_fp[22] );
-    VVVV4P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 0., 0., w_fp[21] );
+    VVVV1P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[103] );
+    VVVV3P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[22] );
+    VVVV4P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] );
 
     // Amplitude(s) for diagram number 847
-    VVV1_0( w_fp[8], w_fp[6], w_fp[103], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[8], w_fp[6], w_fp[103], COUPs[0], 1.0, &amp_fp[0] );
     jamp_sv[0] += amp_sv[0];
     jamp_sv[6] -= amp_sv[0];
     jamp_sv[12] -= amp_sv[0];
@@ -17634,7 +17634,7 @@ namespace mg5amcCpu
     jamp_sv[111] -= amp_sv[0];
     jamp_sv[117] -= amp_sv[0];
     jamp_sv[119] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[6], w_fp[22], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[8], w_fp[6], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
     jamp_sv[12] -= amp_sv[0];
     jamp_sv[14] += amp_sv[0];
     jamp_sv[30] += amp_sv[0];
@@ -17651,7 +17651,7 @@ namespace mg5amcCpu
     jamp_sv[111] -= amp_sv[0];
     jamp_sv[114] -= amp_sv[0];
     jamp_sv[115] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
     jamp_sv[0] -= amp_sv[0];
     jamp_sv[6] += amp_sv[0];
     jamp_sv[30] += amp_sv[0];
@@ -17672,12 +17672,12 @@ namespace mg5amcCpu
     // *** DIAGRAM 848 OF 1240 ***
 
     // Wavefunction(s) for diagram number 848
-    VVVV1P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 0., 0., w_fp[105] );
-    VVVV3P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 0., 0., w_fp[95] );
-    VVVV4P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 0., 0., w_fp[107] );
+    VVVV1P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[105] );
+    VVVV3P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[95] );
+    VVVV4P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[107] );
 
     // Amplitude(s) for diagram number 848
-    VVV1_0( w_fp[8], w_fp[5], w_fp[105], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[8], w_fp[5], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
     jamp_sv[1] += amp_sv[0];
     jamp_sv[7] -= amp_sv[0];
     jamp_sv[18] -= amp_sv[0];
@@ -17694,7 +17694,7 @@ namespace mg5amcCpu
     jamp_sv[95] += amp_sv[0];
     jamp_sv[104] += amp_sv[0];
     jamp_sv[110] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[5], w_fp[95], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[8], w_fp[5], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
     jamp_sv[18] -= amp_sv[0];
     jamp_sv[20] += amp_sv[0];
     jamp_sv[31] += amp_sv[0];
@@ -17711,7 +17711,7 @@ namespace mg5amcCpu
     jamp_sv[91] += amp_sv[0];
     jamp_sv[96] += amp_sv[0];
     jamp_sv[98] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[5], w_fp[107], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[8], w_fp[5], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
     jamp_sv[1] -= amp_sv[0];
     jamp_sv[7] += amp_sv[0];
     jamp_sv[31] += amp_sv[0];
@@ -17732,12 +17732,12 @@ namespace mg5amcCpu
     // *** DIAGRAM 849 OF 1240 ***
 
     // Wavefunction(s) for diagram number 849
-    VVVV1P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 0., 0., w_fp[115] );
-    VVVV3P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 0., 0., w_fp[116] );
-    VVVV4P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 0., 0., w_fp[117] );
+    VVVV1P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[115] );
+    VVVV3P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[116] );
+    VVVV4P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[117] );
 
     // Amplitude(s) for diagram number 849
-    VVV1_0( w_fp[61], w_fp[6], w_fp[115], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[61], w_fp[6], w_fp[115], COUPs[0], 1.0, &amp_fp[0]
); jamp_sv[1] -= amp_sv[0]; jamp_sv[7] += amp_sv[0]; jamp_sv[18] += amp_sv[0]; @@ -17754,7 +17754,7 @@ namespace mg5amcCpu jamp_sv[95] -= amp_sv[0]; jamp_sv[104] -= amp_sv[0]; jamp_sv[110] += amp_sv[0]; - VVV1_0( w_fp[61], w_fp[6], w_fp[116], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[6], w_fp[116], COUPs[0], 1.0, &_fp[0] ); jamp_sv[12] -= amp_sv[0]; jamp_sv[14] += amp_sv[0]; jamp_sv[16] += amp_sv[0]; @@ -17771,7 +17771,7 @@ namespace mg5amcCpu jamp_sv[105] += amp_sv[0]; jamp_sv[110] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; - VVV1_0( w_fp[61], w_fp[6], w_fp[117], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[6], w_fp[117], COUPs[0], 1.0, &_fp[0] ); jamp_sv[1] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; jamp_sv[12] -= amp_sv[0]; @@ -17792,12 +17792,12 @@ namespace mg5amcCpu // *** DIAGRAM 850 OF 1240 *** // Wavefunction(s) for diagram number 850 - VVVV1P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 0., 0., w_fp[118] ); - VVVV3P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 0., 0., w_fp[119] ); - VVVV4P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 0., 0., w_fp[120] ); + VVVV1P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[118] ); + VVVV3P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[119] ); + VVVV4P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[120] ); // Amplitude(s) for diagram number 850 - VVV1_0( w_fp[61], w_fp[5], w_fp[118], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[5], w_fp[118], COUPs[0], 1.0, &_fp[0] ); jamp_sv[0] -= amp_sv[0]; jamp_sv[6] += amp_sv[0]; jamp_sv[12] += amp_sv[0]; @@ -17814,7 +17814,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[117] += amp_sv[0]; jamp_sv[119] -= amp_sv[0]; - VVV1_0( w_fp[61], w_fp[5], w_fp[119], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[5], w_fp[119], COUPs[0], 1.0, &_fp[0] ); jamp_sv[18] -= amp_sv[0]; jamp_sv[20] += amp_sv[0]; jamp_sv[22] += amp_sv[0]; @@ -17831,7 +17831,7 @@ namespace mg5amcCpu jamp_sv[98] -= amp_sv[0]; jamp_sv[100] -= amp_sv[0]; jamp_sv[101] += amp_sv[0]; - VVV1_0( w_fp[61], w_fp[5], w_fp[120], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[5], w_fp[120], COUPs[0], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; jamp_sv[12] -= amp_sv[0]; @@ -17855,7 +17855,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 851 - VVVV1_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; jamp_sv[6] -= amp_sv[0]; @@ -17872,7 +17872,7 @@ namespace mg5amcCpu jamp_sv[115] -= amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; jamp_sv[6] -= amp_sv[0]; @@ -17889,7 +17889,7 @@ namespace mg5amcCpu jamp_sv[95] -= amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); jamp_sv[16] -= amp_sv[0]; jamp_sv[17] += amp_sv[0]; jamp_sv[22] += amp_sv[0]; @@ -17913,7 +17913,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 852 - VVV1_0( w_fp[8], w_fp[29], w_fp[90], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[29], w_fp[90], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 852 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv 
+= cxabs2( amp_sv[0] ); @@ -17941,7 +17941,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 853 - VVV1_0( w_fp[61], w_fp[29], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[29], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 853 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -17969,7 +17969,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 854 - VVV1_0( w_fp[61], w_fp[8], w_fp[96], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[8], w_fp[96], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 854 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -17997,7 +17997,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 855 - VVV1_0( w_fp[90], w_fp[45], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[90], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 855 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18017,7 +18017,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 856 - FFV1_0( w_fp[3], w_fp[44], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[44], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 856 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18033,7 +18033,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 857 - FFV1_0( w_fp[65], w_fp[102], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 857 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18047,7 +18047,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 858 - FFV1_0( w_fp[3], w_fp[102], w_fp[64], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[102], w_fp[64], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 858 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18063,7 +18063,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 859 - FFV1_0( w_fp[65], w_fp[44], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 859 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18077,7 +18077,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 860 - VVV1_0( w_fp[0], w_fp[64], w_fp[45], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[64], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 860 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18097,7 +18097,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 861 - FFV1_0( w_fp[3], w_fp[39], w_fp[105], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[105], COUPs[1], 1.0, &_fp[0] ); jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; @@ -18106,7 +18106,7 @@ namespace mg5amcCpu jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[95] += cxtype( 0, 1 ) 
* amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[95], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[95], COUPs[1], 1.0, &_fp[0] ); jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18115,7 +18115,7 @@ namespace mg5amcCpu jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[107], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[107], COUPs[1], 1.0, &_fp[0] ); jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18131,7 +18131,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 862 - FFV1_0( w_fp[41], w_fp[39], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[39], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 862 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18147,7 +18147,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 863 - FFV1_0( w_fp[41], w_fp[102], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[102], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 863 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18161,7 +18161,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 864 - FFV1_0( w_fp[62], w_fp[39], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 864 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18175,7 +18175,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 865 - VVV1_0( w_fp[90], w_fp[51], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[90], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 865 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18195,7 +18195,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 866 - FFV1_0( w_fp[3], w_fp[50], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[50], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 866 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18211,7 +18211,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 867 - FFV1_0( w_fp[65], w_fp[113], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 867 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18225,7 +18225,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 868 - FFV1_0( w_fp[3], w_fp[113], w_fp[63], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[113], w_fp[63], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 868 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18241,7 +18241,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 869 - FFV1_0( w_fp[65], w_fp[50], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[50], w_fp[0], COUPs[1], 1.0, 
&_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 869 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18255,7 +18255,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 870 - VVV1_0( w_fp[0], w_fp[63], w_fp[51], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[63], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 870 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18275,7 +18275,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 871 - FFV1_0( w_fp[3], w_fp[47], w_fp[103], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[103], COUPs[1], 1.0, &_fp[0] ); jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; @@ -18284,7 +18284,7 @@ namespace mg5amcCpu jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[22], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[22], COUPs[1], 1.0, &_fp[0] ); jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18293,7 +18293,7 @@ namespace mg5amcCpu jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18309,7 +18309,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 872 - FFV1_0( w_fp[38], w_fp[47], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[47], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 872 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18325,7 +18325,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 873 - FFV1_0( w_fp[38], w_fp[113], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[113], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 873 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18339,7 +18339,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 874 - FFV1_0( w_fp[104], w_fp[47], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 874 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18353,7 +18353,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 875 - VVV1_0( w_fp[90], w_fp[23], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[90], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 875 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18373,7 +18373,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 876 - FFV1_0( w_fp[48], w_fp[2], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[48], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 876 
) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18389,7 +18389,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 877 - FFV1_0( w_fp[104], w_fp[93], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 877 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18403,7 +18403,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 878 - FFV1_0( w_fp[104], w_fp[2], w_fp[64], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 878 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18419,7 +18419,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 879 - FFV1_0( w_fp[48], w_fp[93], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[48], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 879 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18433,7 +18433,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 880 - VVV1_0( w_fp[0], w_fp[64], w_fp[23], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[64], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 880 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18453,7 +18453,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 881 - FFV1_0( w_fp[38], w_fp[2], w_fp[105], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[105], COUPs[1], 1.0, &_fp[0] ); jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; @@ -18462,7 +18462,7 @@ namespace mg5amcCpu jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[95], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18471,7 +18471,7 @@ namespace mg5amcCpu jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[107], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[107], COUPs[1], 1.0, &_fp[0] ); jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18487,7 +18487,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 882 - VVV1_0( w_fp[90], w_fp[20], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[90], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 882 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18507,7 +18507,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 883 - FFV1_0( w_fp[40], w_fp[2], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[40], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 883 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += 
cxabs2( amp_sv[0] ); @@ -18523,7 +18523,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 884 - FFV1_0( w_fp[62], w_fp[93], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 884 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18537,7 +18537,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 885 - FFV1_0( w_fp[62], w_fp[2], w_fp[63], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 885 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18553,7 +18553,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 886 - FFV1_0( w_fp[40], w_fp[93], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[40], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 886 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18567,7 +18567,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 887 - VVV1_0( w_fp[0], w_fp[63], w_fp[20], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[63], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 887 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18587,7 +18587,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 888 - FFV1_0( w_fp[41], w_fp[2], w_fp[103], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[103], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; @@ -18596,7 +18596,7 @@ namespace mg5amcCpu jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[22], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18605,7 +18605,7 @@ namespace mg5amcCpu jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -18621,7 +18621,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 889 - FFV1_0( w_fp[3], w_fp[18], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[18], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 889 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18641,7 +18641,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 890 - FFV1_0( w_fp[12], w_fp[2], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 890 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18661,7 +18661,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) 
for diagram number 891 - FFV1_0( w_fp[3], w_fp[93], w_fp[96], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[93], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 891 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18681,7 +18681,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 892 - FFV1_0( w_fp[65], w_fp[2], w_fp[96], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 892 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18701,7 +18701,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 893 - FFV1_0( w_fp[12], w_fp[93], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 893 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18717,7 +18717,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 894 - FFV1_0( w_fp[65], w_fp[18], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 894 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18730,10 +18730,10 @@ namespace mg5amcCpu // *** DIAGRAM 895 OF 1240 *** // Wavefunction(s) for diagram number 895 - VVV1P0_1( w_fp[0], w_fp[66], COUPs[0], 0., 0., w_fp[65] ); + VVV1P0_1( w_fp[0], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[65] ); // Amplitude(s) for diagram number 895 - VVV1_0( w_fp[65], w_fp[13], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[65], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 895 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18761,7 +18761,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 896 - VVV1_0( w_fp[65], w_fp[11], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[65], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 896 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18789,7 +18789,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 897 - VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); jamp_sv[3] -= amp_sv[0]; jamp_sv[13] += amp_sv[0]; jamp_sv[37] += amp_sv[0]; @@ -18806,7 +18806,7 @@ namespace mg5amcCpu jamp_sv[100] -= amp_sv[0]; jamp_sv[106] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); jamp_sv[2] -= amp_sv[0]; jamp_sv[12] += amp_sv[0]; jamp_sv[36] += amp_sv[0]; @@ -18823,7 +18823,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); jamp_sv[2] -= amp_sv[0]; jamp_sv[3] += amp_sv[0]; jamp_sv[12] += amp_sv[0]; @@ -18847,7 +18847,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 898 - VVV1_0( w_fp[56], w_fp[69], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( 
w_fp[56], w_fp[69], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 898 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18875,7 +18875,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 899 - VVV1_0( w_fp[56], w_fp[70], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[56], w_fp[70], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 899 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18903,7 +18903,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 900 - VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); jamp_sv[2] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; jamp_sv[8] += amp_sv[0]; @@ -18920,7 +18920,7 @@ namespace mg5amcCpu jamp_sv[111] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; - VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); jamp_sv[3] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; jamp_sv[8] += amp_sv[0]; @@ -18937,7 +18937,7 @@ namespace mg5amcCpu jamp_sv[83] += amp_sv[0]; jamp_sv[107] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; - VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); jamp_sv[2] -= amp_sv[0]; jamp_sv[3] += amp_sv[0]; jamp_sv[10] += amp_sv[0]; @@ -18961,7 +18961,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 901 - VVV1_0( w_fp[0], w_fp[69], w_fp[11], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[69], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 901 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -18989,7 +18989,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 902 - VVV1_0( w_fp[0], w_fp[70], w_fp[13], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[70], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 902 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19014,12 +19014,12 @@ namespace mg5amcCpu // *** DIAGRAM 903 OF 1240 *** // Wavefunction(s) for diagram number 903 - VVVV1P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 0., 0., w_fp[93] ); - VVVV3P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 0., 0., w_fp[90] ); - VVVV4P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 0., 0., w_fp[21] ); + VVVV1P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[93] ); + VVVV3P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[90] ); + VVVV4P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] ); // Amplitude(s) for diagram number 903 - VVV1_0( w_fp[8], w_fp[6], w_fp[93], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[93], COUPs[0], 1.0, &_fp[0] ); jamp_sv[2] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; jamp_sv[8] += amp_sv[0]; @@ -19036,7 +19036,7 @@ namespace mg5amcCpu jamp_sv[111] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[90], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[90], COUPs[0], 1.0, &_fp[0] ); jamp_sv[6] -= amp_sv[0]; jamp_sv[8] += amp_sv[0]; jamp_sv[36] += amp_sv[0]; @@ -19053,7 +19053,7 @@ namespace mg5amcCpu jamp_sv[109] += amp_sv[0]; jamp_sv[116] 
+= amp_sv[0]; jamp_sv[117] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &_fp[0] ); jamp_sv[2] -= amp_sv[0]; jamp_sv[12] += amp_sv[0]; jamp_sv[36] += amp_sv[0]; @@ -19074,12 +19074,12 @@ namespace mg5amcCpu // *** DIAGRAM 904 OF 1240 *** // Wavefunction(s) for diagram number 904 - VVVV1P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 0., 0., w_fp[22] ); - VVVV3P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 0., 0., w_fp[103] ); - VVVV4P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 0., 0., w_fp[63] ); + VVVV1P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] ); + VVVV3P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[103] ); + VVVV4P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[63] ); // Amplitude(s) for diagram number 904 - VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &_fp[0] ); jamp_sv[3] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; jamp_sv[19] -= amp_sv[0]; @@ -19096,7 +19096,7 @@ namespace mg5amcCpu jamp_sv[82] += amp_sv[0]; jamp_sv[106] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[103], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[103], COUPs[0], 1.0, &_fp[0] ); jamp_sv[19] -= amp_sv[0]; jamp_sv[22] += amp_sv[0]; jamp_sv[37] += amp_sv[0]; @@ -19113,7 +19113,7 @@ namespace mg5amcCpu jamp_sv[82] += amp_sv[0]; jamp_sv[97] += amp_sv[0]; jamp_sv[100] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[63], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[63], COUPs[0], 1.0, &_fp[0] ); jamp_sv[3] -= amp_sv[0]; jamp_sv[13] += amp_sv[0]; jamp_sv[37] += amp_sv[0]; @@ -19134,12 +19134,12 @@ namespace mg5amcCpu // *** DIAGRAM 905 OF 1240 *** // Wavefunction(s) for diagram number 905 - VVVV1P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 0., 0., w_fp[107] ); - VVVV3P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 0., 0., w_fp[95] ); - VVVV4P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 0., 0., w_fp[105] ); + VVVV1P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] ); + VVVV3P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] ); + VVVV4P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] ); // Amplitude(s) for diagram number 905 - VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &_fp[0] ); jamp_sv[3] -= amp_sv[0]; jamp_sv[13] += amp_sv[0]; jamp_sv[19] += amp_sv[0]; @@ -19156,7 +19156,7 @@ namespace mg5amcCpu jamp_sv[82] -= amp_sv[0]; jamp_sv[106] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - VVV1_0( w_fp[66], w_fp[6], w_fp[95], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[66], w_fp[6], w_fp[95], COUPs[0], 1.0, &_fp[0] ); jamp_sv[6] -= amp_sv[0]; jamp_sv[8] += amp_sv[0]; jamp_sv[10] += amp_sv[0]; @@ -19173,7 +19173,7 @@ namespace mg5amcCpu jamp_sv[107] += amp_sv[0]; jamp_sv[116] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; - VVV1_0( w_fp[66], w_fp[6], w_fp[105], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[66], w_fp[6], w_fp[105], COUPs[0], 1.0, &_fp[0] ); jamp_sv[3] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; jamp_sv[8] += amp_sv[0]; @@ -19197,7 +19197,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 906 - VVV1_0( w_fp[66], w_fp[4], w_fp[118], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[66], w_fp[4], w_fp[118], COUPs[0], 1.0, &_fp[0] ); jamp_sv[2] -= amp_sv[0]; jamp_sv[6] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -19214,7 +19214,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[117] += 
amp_sv[0]; - VVV1_0( w_fp[66], w_fp[4], w_fp[119], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[66], w_fp[4], w_fp[119], COUPs[0], 1.0, &_fp[0] ); jamp_sv[19] -= amp_sv[0]; jamp_sv[20] += amp_sv[0]; jamp_sv[21] -= amp_sv[0]; @@ -19231,7 +19231,7 @@ namespace mg5amcCpu jamp_sv[98] -= amp_sv[0]; jamp_sv[99] += amp_sv[0]; jamp_sv[100] -= amp_sv[0]; - VVV1_0( w_fp[66], w_fp[4], w_fp[120], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[66], w_fp[4], w_fp[120], COUPs[0], 1.0, &_fp[0] ); jamp_sv[2] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; jamp_sv[8] += amp_sv[0]; @@ -19255,7 +19255,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 907 - VVVV1_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); jamp_sv[2] += amp_sv[0]; jamp_sv[3] -= amp_sv[0]; jamp_sv[12] -= amp_sv[0]; @@ -19272,7 +19272,7 @@ namespace mg5amcCpu jamp_sv[109] -= amp_sv[0]; jamp_sv[111] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); jamp_sv[2] += amp_sv[0]; jamp_sv[3] -= amp_sv[0]; jamp_sv[10] -= amp_sv[0]; @@ -19289,7 +19289,7 @@ namespace mg5amcCpu jamp_sv[83] -= amp_sv[0]; jamp_sv[111] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); jamp_sv[10] -= amp_sv[0]; jamp_sv[11] += amp_sv[0]; jamp_sv[20] += amp_sv[0]; @@ -19313,7 +19313,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 908 - VVV1_0( w_fp[8], w_fp[27], w_fp[65], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[27], w_fp[65], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 908 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19341,7 +19341,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 909 - VVV1_0( w_fp[66], w_fp[27], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[66], w_fp[27], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 909 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19369,7 +19369,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 910 - VVV1_0( w_fp[66], w_fp[8], w_fp[101], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[66], w_fp[8], w_fp[101], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 910 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19397,7 +19397,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 911 - VVV1_0( w_fp[65], w_fp[37], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[65], w_fp[37], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 911 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19417,7 +19417,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 912 - FFV1_0( w_fp[3], w_fp[36], w_fp[65], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[36], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 912 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19433,7 +19433,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 913 - FFV1_0( w_fp[71], 
w_fp[114], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[71], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 913 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19447,7 +19447,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 914 - FFV1_0( w_fp[3], w_fp[114], w_fp[70], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[114], w_fp[70], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 914 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19463,7 +19463,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 915 - FFV1_0( w_fp[71], w_fp[36], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[71], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 915 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19477,7 +19477,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 916 - VVV1_0( w_fp[0], w_fp[70], w_fp[37], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[70], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 916 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19497,7 +19497,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 917 - FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &_fp[0] ); jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; @@ -19506,7 +19506,7 @@ namespace mg5amcCpu jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[103], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[103], COUPs[1], 1.0, &_fp[0] ); jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19515,7 +19515,7 @@ namespace mg5amcCpu jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[63], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[63], COUPs[1], 1.0, &_fp[0] ); jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19531,7 +19531,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 918 - FFV1_0( w_fp[41], w_fp[33], w_fp[65], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[33], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 918 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19547,7 +19547,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 919 - FFV1_0( w_fp[41], w_fp[114], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[114], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 919 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19561,7 +19561,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 920 - FFV1_0( w_fp[62], w_fp[33], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[33], 
w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 920 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19575,7 +19575,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 921 - VVV1_0( w_fp[65], w_fp[51], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[65], w_fp[51], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 921 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19595,7 +19595,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 922 - FFV1_0( w_fp[3], w_fp[49], w_fp[65], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[49], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 922 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19611,7 +19611,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 923 - FFV1_0( w_fp[71], w_fp[113], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[71], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 923 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19625,7 +19625,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 924 - FFV1_0( w_fp[3], w_fp[113], w_fp[69], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[113], w_fp[69], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 924 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19641,7 +19641,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 925 - FFV1_0( w_fp[71], w_fp[49], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[71], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 925 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19655,7 +19655,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 926 - VVV1_0( w_fp[0], w_fp[69], w_fp[51], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[69], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 926 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19675,7 +19675,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 927 - FFV1_0( w_fp[3], w_fp[47], w_fp[93], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[93], COUPs[1], 1.0, &_fp[0] ); jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19684,7 +19684,7 @@ namespace mg5amcCpu jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[90], COUPs[1], 1.0, &_fp[0] ); jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19693,7 +19693,7 @@ namespace mg5amcCpu jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[97] += 
cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19709,7 +19709,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 928 - FFV1_0( w_fp[46], w_fp[47], w_fp[65], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[47], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 928 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19725,7 +19725,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 929 - FFV1_0( w_fp[46], w_fp[113], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[113], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 929 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19739,7 +19739,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 930 - FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 930 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19753,7 +19753,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 931 - VVV1_0( w_fp[65], w_fp[54], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[65], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 931 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19773,7 +19773,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 932 - FFV1_0( w_fp[53], w_fp[2], w_fp[65], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[53], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 932 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19789,7 +19789,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 933 - FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 933 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19803,7 +19803,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 934 - FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 934 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19819,7 +19819,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 935 - FFV1_0( w_fp[53], w_fp[94], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[53], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 935 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19833,7 +19833,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 936 - VVV1_0( w_fp[0], w_fp[70], w_fp[54], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[70], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 936 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19853,7 +19853,7 @@ namespace 
mg5amcCpu // (none) // Amplitude(s) for diagram number 937 - FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; @@ -19862,7 +19862,7 @@ namespace mg5amcCpu jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[103], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[103], COUPs[1], 1.0, &_fp[0] ); jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19871,7 +19871,7 @@ namespace mg5amcCpu jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[63], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -19887,7 +19887,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 938 - VVV1_0( w_fp[65], w_fp[20], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[65], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 938 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19907,7 +19907,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 939 - FFV1_0( w_fp[28], w_fp[2], w_fp[65], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[28], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 939 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19923,7 +19923,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 940 - FFV1_0( w_fp[62], w_fp[94], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 940 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19937,7 +19937,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 941 - FFV1_0( w_fp[62], w_fp[2], w_fp[69], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 941 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19953,7 +19953,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 942 - FFV1_0( w_fp[28], w_fp[94], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[28], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 942 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19967,7 +19967,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 943 - VVV1_0( w_fp[0], w_fp[69], w_fp[20], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[69], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 943 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -19987,7 +19987,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 944 - FFV1_0( w_fp[41], w_fp[2], 
w_fp[93], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[41], w_fp[2], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
      jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19996,7 +19996,7 @@ namespace mg5amcCpu
      jamp_sv[56] -= amp_sv[0];
      jamp_sv[62] += amp_sv[0];
      jamp_sv[80] -= amp_sv[0];
-     FFV1_0( w_fp[41], w_fp[2], w_fp[90], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[41], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
      jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -20005,7 +20005,7 @@ namespace mg5amcCpu
      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-     FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
      jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -20021,7 +20021,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 945
-     FFV1_0( w_fp[3], w_fp[15], w_fp[65], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[3], w_fp[15], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 945 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20041,7 +20041,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 946
-     FFV1_0( w_fp[14], w_fp[2], w_fp[65], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[14], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 946 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20061,7 +20061,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 947
-     FFV1_0( w_fp[3], w_fp[94], w_fp[101], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[3], w_fp[94], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 947 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20081,7 +20081,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 948
-     FFV1_0( w_fp[71], w_fp[2], w_fp[101], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[71], w_fp[2], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 948 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20101,7 +20101,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 949
-     FFV1_0( w_fp[14], w_fp[94], w_fp[0], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[14], w_fp[94], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 949 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20117,7 +20117,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 950
-     FFV1_0( w_fp[71], w_fp[15], w_fp[0], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[71], w_fp[15], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 950 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20130,10 +20130,10 @@ namespace mg5amcCpu
      // *** DIAGRAM 951 OF 1240 ***

      // Wavefunction(s) for diagram number 951
-     VVV1P0_1( w_fp[0], w_fp[72], COUPs[0], 0., 0., w_fp[71] );
+     VVV1P0_1( w_fp[0], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[71] );

      // Amplitude(s) for diagram number 951
-     VVV1_0( w_fp[71], w_fp[13], w_fp[5], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[71], w_fp[13], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 951 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20161,7 +20161,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 952
-     VVV1_0( w_fp[71], w_fp[10], w_fp[4], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[71], w_fp[10], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 952 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20189,7 +20189,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 953
-     VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], &amp_fp[0] );
+     VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[5] -= amp_sv[0];
      jamp_sv[19] += amp_sv[0];
      jamp_sv[43] += amp_sv[0];
@@ -20206,7 +20206,7 @@ namespace mg5amcCpu
      jamp_sv[82] -= amp_sv[0];
      jamp_sv[92] += amp_sv[0];
      jamp_sv[103] -= amp_sv[0];
-     VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], &amp_fp[0] );
+     VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[4] -= amp_sv[0];
      jamp_sv[18] += amp_sv[0];
      jamp_sv[42] += amp_sv[0];
@@ -20223,7 +20223,7 @@ namespace mg5amcCpu
      jamp_sv[89] -= amp_sv[0];
      jamp_sv[92] += amp_sv[0];
      jamp_sv[102] -= amp_sv[0];
-     VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], &amp_fp[0] );
+     VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[4] -= amp_sv[0];
      jamp_sv[5] += amp_sv[0];
      jamp_sv[18] += amp_sv[0];
@@ -20247,7 +20247,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 954
-     VVV1_0( w_fp[56], w_fp[74], w_fp[5], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[56], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 954 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20275,7 +20275,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 955
-     VVV1_0( w_fp[56], w_fp[75], w_fp[4], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[56], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 955 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20303,7 +20303,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 956
-     VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], &amp_fp[0] );
+     VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[4] += amp_sv[0];
      jamp_sv[7] -= amp_sv[0];
      jamp_sv[10] += amp_sv[0];
@@ -20320,7 +20320,7 @@ namespace mg5amcCpu
      jamp_sv[89] += amp_sv[0];
      jamp_sv[93] -= amp_sv[0];
      jamp_sv[105] += amp_sv[0];
-     VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], &amp_fp[0] );
+     VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[5] += amp_sv[0];
      jamp_sv[7] -= amp_sv[0];
      jamp_sv[8] += amp_sv[0];
@@ -20337,7 +20337,7 @@ namespace mg5amcCpu
      jamp_sv[83] += amp_sv[0];
      jamp_sv[93] -= amp_sv[0];
      jamp_sv[107] += amp_sv[0];
-     VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], &amp_fp[0] );
+     VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[4] -= amp_sv[0];
      jamp_sv[5] += amp_sv[0];
      jamp_sv[8] += amp_sv[0];
@@ -20361,7 +20361,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 957
-     VVV1_0( w_fp[0], w_fp[74], w_fp[10], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[0], w_fp[74], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 957 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20389,7 +20389,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 958
-     VVV1_0( w_fp[0], w_fp[75], w_fp[13], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[0], w_fp[75], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 958 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20414,12 +20414,12 @@ namespace mg5amcCpu
      // *** DIAGRAM 959 OF 1240 ***

      // Wavefunction(s) for diagram number 959
-     VVVV1P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 0., 0., w_fp[94] );
-     VVVV3P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 0., 0., w_fp[65] );
-     VVVV4P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 0., 0., w_fp[21] );
+     VVVV1P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[94] );
+     VVVV3P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[65] );
+     VVVV4P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );

      // Amplitude(s) for diagram number 959
-     VVV1_0( w_fp[8], w_fp[5], w_fp[94], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[8], w_fp[5], w_fp[94], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[4] += amp_sv[0];
      jamp_sv[7] -= amp_sv[0];
      jamp_sv[10] += amp_sv[0];
@@ -20436,7 +20436,7 @@ namespace mg5amcCpu
      jamp_sv[89] += amp_sv[0];
      jamp_sv[93] -= amp_sv[0];
      jamp_sv[104] += amp_sv[0];
-     VVV1_0( w_fp[8], w_fp[5], w_fp[65], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[8], w_fp[5], w_fp[65], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[7] -= amp_sv[0];
      jamp_sv[10] += amp_sv[0];
      jamp_sv[42] += amp_sv[0];
@@ -20453,7 +20453,7 @@ namespace mg5amcCpu
      jamp_sv[93] -= amp_sv[0];
      jamp_sv[102] -= amp_sv[0];
      jamp_sv[104] += amp_sv[0];
-     VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[4] -= amp_sv[0];
      jamp_sv[18] += amp_sv[0];
      jamp_sv[42] += amp_sv[0];
@@ -20474,12 +20474,12 @@ namespace mg5amcCpu
      // *** DIAGRAM 960 OF 1240 ***

      // Wavefunction(s) for diagram number 960
-     VVVV1P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 0., 0., w_fp[90] );
-     VVVV3P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 0., 0., w_fp[93] );
-     VVVV4P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 0., 0., w_fp[69] );
+     VVVV1P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[90] );
+     VVVV3P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[93] );
+     VVVV4P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[69] );

      // Amplitude(s) for diagram number 960
-     VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[5] += amp_sv[0];
      jamp_sv[13] -= amp_sv[0];
      jamp_sv[16] += amp_sv[0];
@@ -20496,7 +20496,7 @@ namespace mg5amcCpu
      jamp_sv[82] += amp_sv[0];
      jamp_sv[92] -= amp_sv[0];
      jamp_sv[106] += amp_sv[0];
-     VVV1_0( w_fp[8], w_fp[4], w_fp[93], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[8], w_fp[4], w_fp[93], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[13] -= amp_sv[0];
      jamp_sv[16] += amp_sv[0];
      jamp_sv[43] += amp_sv[0];
@@ -20513,7 +20513,7 @@ namespace mg5amcCpu
      jamp_sv[76] -= amp_sv[0];
      jamp_sv[103] -= amp_sv[0];
      jamp_sv[106] += amp_sv[0];
-     VVV1_0( w_fp[8], w_fp[4], w_fp[69], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[8], w_fp[4], w_fp[69], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[5] -= amp_sv[0];
      jamp_sv[19] += amp_sv[0];
      jamp_sv[43] += amp_sv[0];
@@ -20537,7 +20537,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 961
-     VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[5] -= amp_sv[0];
      jamp_sv[13] += amp_sv[0];
      jamp_sv[16] -= amp_sv[0];
@@ -20554,7 +20554,7 @@ namespace mg5amcCpu
      jamp_sv[82] -= amp_sv[0];
      jamp_sv[92] += amp_sv[0];
      jamp_sv[106] -= amp_sv[0];
-     VVV1_0( w_fp[72], w_fp[5], w_fp[95], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[72], w_fp[5], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[7] -= amp_sv[0];
      jamp_sv[8] += amp_sv[0];
      jamp_sv[9] -= amp_sv[0];
@@ -20571,7 +20571,7 @@ namespace mg5amcCpu
      jamp_sv[93] -= amp_sv[0];
      jamp_sv[106] -= amp_sv[0];
      jamp_sv[107] += amp_sv[0];
-     VVV1_0( w_fp[72], w_fp[5], w_fp[105], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[72], w_fp[5], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[5] += amp_sv[0];
      jamp_sv[7] -= amp_sv[0];
      jamp_sv[8] += amp_sv[0];
@@ -20595,7 +20595,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 962
-     VVV1_0( w_fp[72], w_fp[4], w_fp[115], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[72], w_fp[4], w_fp[115], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[4] -= amp_sv[0];
      jamp_sv[7] += amp_sv[0];
      jamp_sv[10] -= amp_sv[0];
@@ -20612,7 +20612,7 @@ namespace mg5amcCpu
      jamp_sv[89] -= amp_sv[0];
      jamp_sv[93] += amp_sv[0];
      jamp_sv[104] -= amp_sv[0];
-     VVV1_0( w_fp[72], w_fp[4], w_fp[116], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[72], w_fp[4], w_fp[116], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[13] -= amp_sv[0];
      jamp_sv[14] += amp_sv[0];
      jamp_sv[15] -= amp_sv[0];
@@ -20629,7 +20629,7 @@ namespace mg5amcCpu
      jamp_sv[76] -= amp_sv[0];
      jamp_sv[104] -= amp_sv[0];
      jamp_sv[105] += amp_sv[0];
-     VVV1_0( w_fp[72], w_fp[4], w_fp[117], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[72], w_fp[4], w_fp[117], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[4] += amp_sv[0];
      jamp_sv[7] -= amp_sv[0];
      jamp_sv[10] += amp_sv[0];
@@ -20653,7 +20653,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 963
-     VVVV1_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], &amp_fp[0] );
+     VVVV1_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[4] += amp_sv[0];
      jamp_sv[5] -= amp_sv[0];
      jamp_sv[18] -= amp_sv[0];
@@ -20670,7 +20670,7 @@ namespace mg5amcCpu
      jamp_sv[89] += amp_sv[0];
      jamp_sv[102] += amp_sv[0];
      jamp_sv[103] -= amp_sv[0];
-     VVVV3_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], &amp_fp[0] );
+     VVVV3_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[4] += amp_sv[0];
      jamp_sv[5] -= amp_sv[0];
      jamp_sv[8] -= amp_sv[0];
@@ -20687,7 +20687,7 @@ namespace mg5amcCpu
      jamp_sv[89] += amp_sv[0];
      jamp_sv[105] += amp_sv[0];
      jamp_sv[107] -= amp_sv[0];
-     VVVV4_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], &amp_fp[0] );
+     VVVV4_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[8] -= amp_sv[0];
      jamp_sv[9] += amp_sv[0];
      jamp_sv[14] += amp_sv[0];
@@ -20711,7 +20711,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 964
-     VVV1_0( w_fp[8], w_fp[24], w_fp[71], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[8], w_fp[24], w_fp[71], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 964 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20739,7 +20739,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 965
-     VVV1_0( w_fp[72], w_fp[24], w_fp[56], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[72], w_fp[24], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 965 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20767,7 +20767,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 966
-     VVV1_0( w_fp[72], w_fp[8], w_fp[98], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[72], w_fp[8], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 966 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20795,7 +20795,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 967
-     VVV1_0( w_fp[71], w_fp[37], w_fp[5], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[71], w_fp[37], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 967 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20815,7 +20815,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 968
-     FFV1_0( w_fp[3], w_fp[35], w_fp[71], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[3], w_fp[35], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 968 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20831,7 +20831,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 969
-     FFV1_0( w_fp[76], w_fp[114], w_fp[5], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[76], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 969 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20845,7 +20845,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 970
-     FFV1_0( w_fp[3], w_fp[114], w_fp[75], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[3], w_fp[114], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 970 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20861,7 +20861,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 971
-     FFV1_0( w_fp[76], w_fp[35], w_fp[0], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[76], w_fp[35], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 971 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20875,7 +20875,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 972
-     VVV1_0( w_fp[0], w_fp[75], w_fp[37], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[0], w_fp[75], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 972 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20895,7 +20895,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 973
-     FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
      jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -20904,7 +20904,7 @@ namespace mg5amcCpu
      jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-     FFV1_0( w_fp[3], w_fp[33], w_fp[93], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[3], w_fp[33], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -20913,7 +20913,7 @@ namespace mg5amcCpu
      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-     FFV1_0( w_fp[3], w_fp[33], w_fp[69], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[3], w_fp[33], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
      jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -20929,7 +20929,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 974
-     FFV1_0( w_fp[38], w_fp[33], w_fp[71], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[38], w_fp[33], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 974 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20945,7 +20945,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 975
-     FFV1_0( w_fp[38], w_fp[114], w_fp[72], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[38], w_fp[114], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 975 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20959,7 +20959,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 976
-     FFV1_0( w_fp[104], w_fp[33], w_fp[72], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[104], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 976 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20973,7 +20973,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 977
-     VVV1_0( w_fp[71], w_fp[45], w_fp[4], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[71], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 977 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -20993,7 +20993,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 978
-     FFV1_0( w_fp[3], w_fp[43], w_fp[71], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[3], w_fp[43], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 978 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21009,7 +21009,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 979
-     FFV1_0( w_fp[76], w_fp[102], w_fp[4], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[76], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 979 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21023,7 +21023,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 980
-     FFV1_0( w_fp[3], w_fp[102], w_fp[74], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[3], w_fp[102], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 980 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21039,7 +21039,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 981
-     FFV1_0( w_fp[76], w_fp[43], w_fp[0], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[76], w_fp[43], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 981 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21053,7 +21053,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 982
-     VVV1_0( w_fp[0], w_fp[74], w_fp[45], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[0], w_fp[74], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 982 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21073,7 +21073,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 983
-     FFV1_0( w_fp[3], w_fp[39], w_fp[94], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[3], w_fp[39], w_fp[94], COUPs[1], 1.0, &amp_fp[0] );
      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -21082,7 +21082,7 @@ namespace mg5amcCpu
      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-     FFV1_0( w_fp[3], w_fp[39], w_fp[65], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[3], w_fp[39], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -21091,7 +21091,7 @@ namespace mg5amcCpu
      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-     FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
      jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -21107,7 +21107,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 984
-     FFV1_0( w_fp[46], w_fp[39], w_fp[71], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[46], w_fp[39], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 984 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21123,7 +21123,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 985
-     FFV1_0( w_fp[46], w_fp[102], w_fp[72], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[46], w_fp[102], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 985 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21137,7 +21137,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 986
-     FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 986 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21151,7 +21151,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 987
-     VVV1_0( w_fp[71], w_fp[54], w_fp[5], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[71], w_fp[54], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 987 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21171,7 +21171,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 988
-     FFV1_0( w_fp[7], w_fp[2], w_fp[71], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[7], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 988 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21187,7 +21187,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 989
-     FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 989 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21201,7 +21201,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 990
-     FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 990 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21217,7 +21217,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 991
-     FFV1_0( w_fp[7], w_fp[97], w_fp[0], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[7], w_fp[97], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 991 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21231,7 +21231,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 992
-     VVV1_0( w_fp[0], w_fp[75], w_fp[54], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[0], w_fp[75], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 992 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21251,7 +21251,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 993
-     FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
      jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -21260,7 +21260,7 @@ namespace mg5amcCpu
      jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-     FFV1_0( w_fp[46], w_fp[2], w_fp[93], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[46], w_fp[2], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -21269,7 +21269,7 @@ namespace mg5amcCpu
      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-     FFV1_0( w_fp[46], w_fp[2], w_fp[69], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[46], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
      jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -21285,7 +21285,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 994
-     VVV1_0( w_fp[71], w_fp[23], w_fp[4], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[71], w_fp[23], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 994 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21305,7 +21305,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 995
-     FFV1_0( w_fp[25], w_fp[2], w_fp[71], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[25], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 995 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21321,7 +21321,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 996
-     FFV1_0( w_fp[104], w_fp[97], w_fp[4], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[104], w_fp[97], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 996 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21335,7 +21335,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 997
-     FFV1_0( w_fp[104], w_fp[2], w_fp[74], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[104], w_fp[2], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 997 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21351,7 +21351,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 998
-     FFV1_0( w_fp[25], w_fp[97], w_fp[0], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[25], w_fp[97], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 998 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21365,7 +21365,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 999
-     VVV1_0( w_fp[0], w_fp[74], w_fp[23], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[0], w_fp[74], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 999 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21385,7 +21385,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1000
-     FFV1_0( w_fp[38], w_fp[2], w_fp[94], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[38], w_fp[2], w_fp[94], COUPs[1], 1.0, &amp_fp[0] );
      jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -21394,7 +21394,7 @@ namespace mg5amcCpu
      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-     FFV1_0( w_fp[38], w_fp[2], w_fp[65], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[38], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
      jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -21403,7 +21403,7 @@ namespace mg5amcCpu
      jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-     FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
      jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -21419,7 +21419,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1001
-     FFV1_0( w_fp[3], w_fp[17], w_fp[71], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[3], w_fp[17], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1001 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21439,7 +21439,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1002
-     FFV1_0( w_fp[26], w_fp[2], w_fp[71], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[26], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1002 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21459,7 +21459,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1003
-     FFV1_0( w_fp[3], w_fp[97], w_fp[98], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[3], w_fp[97], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1003 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21479,7 +21479,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1004
-     FFV1_0( w_fp[76], w_fp[2], w_fp[98], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[76], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1004 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21499,7 +21499,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1005
-     FFV1_0( w_fp[26], w_fp[97], w_fp[0], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[26], w_fp[97], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1005 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21515,7 +21515,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1006
-     FFV1_0( w_fp[76], w_fp[17], w_fp[0], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[76], w_fp[17], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1006 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21531,7 +21531,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1007
-     VVV1_0( w_fp[56], w_fp[59], w_fp[6], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[56], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1007 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21559,7 +21559,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1008
-     VVV1_0( w_fp[56], w_fp[1], w_fp[42], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[56], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1008 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21587,7 +21587,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1009
-     VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], &amp_fp[0] );
+     VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[0] += amp_sv[0];
      jamp_sv[2] -= amp_sv[0];
      jamp_sv[8] -= amp_sv[0];
@@ -21604,7 +21604,7 @@ namespace mg5amcCpu
      jamp_sv[107] -= amp_sv[0];
      jamp_sv[113] -= amp_sv[0];
      jamp_sv[119] += amp_sv[0];
-     VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], &amp_fp[0] );
+     VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[4] += amp_sv[0];
      jamp_sv[5] -= amp_sv[0];
      jamp_sv[8] -= amp_sv[0];
@@ -21621,7 +21621,7 @@ namespace mg5amcCpu
      jamp_sv[89] += amp_sv[0];
      jamp_sv[105] += amp_sv[0];
      jamp_sv[107] -= amp_sv[0];
-     VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], &amp_fp[0] );
+     VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[0] -= amp_sv[0];
      jamp_sv[2] += amp_sv[0];
      jamp_sv[4] += amp_sv[0];
@@ -21645,7 +21645,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1010
-     VVV1_0( w_fp[98], w_fp[108], w_fp[6], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[98], w_fp[108], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1010 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21673,7 +21673,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1011
-     VVV1_0( w_fp[98], w_fp[1], w_fp[11], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[98], w_fp[1], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1011 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21701,7 +21701,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1012
-     VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], &amp_fp[0] );
+     VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[9] += amp_sv[0];
      jamp_sv[15] -= amp_sv[0];
      jamp_sv[24] -= amp_sv[0];
@@ -21718,7 +21718,7 @@ namespace mg5amcCpu
      jamp_sv[101] += amp_sv[0];
      jamp_sv[112] += amp_sv[0];
      jamp_sv[118] -= amp_sv[0];
-     VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], &amp_fp[0] );
+     VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[8] -= amp_sv[0];
      jamp_sv[9] += amp_sv[0];
      jamp_sv[14] += amp_sv[0];
@@ -21735,7 +21735,7 @@ namespace mg5amcCpu
      jamp_sv[103] += amp_sv[0];
      jamp_sv[105] += amp_sv[0];
      jamp_sv[107] -= amp_sv[0];
-     VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], &amp_fp[0] );
+     VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[8] -= amp_sv[0];
      jamp_sv[14] += amp_sv[0];
      jamp_sv[24] += amp_sv[0];
@@ -21759,7 +21759,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1013
-     VVV1_0( w_fp[0], w_fp[108], w_fp[42], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[0], w_fp[108], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1013 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21787,7 +21787,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1014
-     VVV1_0( w_fp[0], w_fp[59], w_fp[11], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[0], w_fp[59], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1014 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -21812,12 +21812,12 @@ namespace mg5amcCpu
      // *** DIAGRAM 1015 OF 1240 ***

      // Wavefunction(s) for diagram number 1015
-     VVVV1P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 0., 0., w_fp[11] );
-     VVVV3P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 0., 0., w_fp[42] );
-     VVVV4P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 0., 0., w_fp[76] );
+     VVVV1P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[11] );
+     VVVV3P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[42] );
+     VVVV4P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[76] );

      // Amplitude(s) for diagram number 1015
-     VVV1_0( w_fp[24], w_fp[6], w_fp[11], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[24], w_fp[6], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[9] += amp_sv[0];
      jamp_sv[15] -= amp_sv[0];
      jamp_sv[21] -= amp_sv[0];
@@ -21834,7 +21834,7 @@ namespace mg5amcCpu
      jamp_sv[88] += amp_sv[0];
      jamp_sv[112] += amp_sv[0];
      jamp_sv[118] -= amp_sv[0];
-     VVV1_0( w_fp[24], w_fp[6], w_fp[42], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[24], w_fp[6], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[0] -= amp_sv[0];
      jamp_sv[2] += amp_sv[0];
      jamp_sv[4] += amp_sv[0];
@@ -21851,7 +21851,7 @@ namespace mg5amcCpu
      jamp_sv[89] += amp_sv[0];
      jamp_sv[113] += amp_sv[0];
      jamp_sv[119] -= amp_sv[0];
-     VVV1_0( w_fp[24], w_fp[6], w_fp[76], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[24], w_fp[6], w_fp[76], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[0] -= amp_sv[0];
      jamp_sv[2] += amp_sv[0];
      jamp_sv[4] += amp_sv[0];
@@ -21872,12 +21872,12 @@ namespace mg5amcCpu
      // *** DIAGRAM 1016 OF 1240 ***

      // Wavefunction(s) for diagram number 1016
-     VVVV1P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 0., 0., w_fp[97] );
-     VVVV3P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 0., 0., w_fp[71] );
-     VVVV4P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 0., 0., w_fp[21] );
+     VVVV1P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[97] );
+     VVVV3P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[71] );
+     VVVV4P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[21] );

      // Amplitude(s) for diagram number 1016
-     VVV1_0( w_fp[8], w_fp[6], w_fp[97], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[8], w_fp[6], w_fp[97], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[0] += amp_sv[0];
      jamp_sv[2] -= amp_sv[0];
      jamp_sv[8] -= amp_sv[0];
@@ -21894,7 +21894,7 @@ namespace mg5amcCpu
      jamp_sv[107] -= amp_sv[0];
      jamp_sv[113] -= amp_sv[0];
      jamp_sv[119] += amp_sv[0];
-     VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[8] -= amp_sv[0];
      jamp_sv[14] += amp_sv[0];
      jamp_sv[24] += amp_sv[0];
@@ -21911,7 +21911,7 @@ namespace mg5amcCpu
      jamp_sv[107] -= amp_sv[0];
      jamp_sv[112] -= amp_sv[0];
      jamp_sv[118] += amp_sv[0];
-     VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[0] -= amp_sv[0];
      jamp_sv[2] += amp_sv[0];
      jamp_sv[24] += amp_sv[0];
@@ -21935,7 +21935,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1017
-     VVV1_0( w_fp[1], w_fp[24], w_fp[118], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[24], w_fp[118], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[0] -= amp_sv[0];
      jamp_sv[2] += amp_sv[0];
      jamp_sv[8] += amp_sv[0];
@@ -21952,7 +21952,7 @@ namespace mg5amcCpu
      jamp_sv[107] += amp_sv[0];
      jamp_sv[113] += amp_sv[0];
      jamp_sv[119] -= amp_sv[0];
-     VVV1_0( w_fp[1], w_fp[24], w_fp[119], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[24], w_fp[119], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[18] -= amp_sv[0];
      jamp_sv[19] += amp_sv[0];
      jamp_sv[21] += amp_sv[0];
@@ -21969,7 +21969,7 @@ namespace mg5amcCpu
      jamp_sv[97] -= amp_sv[0];
      jamp_sv[99] -= amp_sv[0];
      jamp_sv[101] += amp_sv[0];
-     VVV1_0( w_fp[1], w_fp[24], w_fp[120], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[24], w_fp[120], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[0] += amp_sv[0];
      jamp_sv[2] -= amp_sv[0];
      jamp_sv[8] -= amp_sv[0];
@@ -21993,7 +21993,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1018
-     VVV1_0( w_fp[1], w_fp[8], w_fp[85], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[8], w_fp[85], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[9] -= amp_sv[0];
      jamp_sv[15] += amp_sv[0];
      jamp_sv[21] += amp_sv[0];
@@ -22010,7 +22010,7 @@ namespace mg5amcCpu
      jamp_sv[88] -= amp_sv[0];
      jamp_sv[112] -= amp_sv[0];
      jamp_sv[118] += amp_sv[0];
-     VVV1_0( w_fp[1], w_fp[8], w_fp[112], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[8], w_fp[112], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[21] += amp_sv[0];
      jamp_sv[23] -= amp_sv[0];
      jamp_sv[28] -= amp_sv[0];
@@ -22027,7 +22027,7 @@ namespace mg5amcCpu
      jamp_sv[88] -= amp_sv[0];
      jamp_sv[99] -= amp_sv[0];
      jamp_sv[101] += amp_sv[0];
-     VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[9] += amp_sv[0];
      jamp_sv[15] -= amp_sv[0];
      jamp_sv[24] -= amp_sv[0];
@@ -22051,7 +22051,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1019
-     VVV1_0( w_fp[56], w_fp[68], w_fp[5], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[56], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1019 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -22079,7 +22079,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1020
-     VVV1_0( w_fp[56], w_fp[1], w_fp[16], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[56], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1020 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -22107,7 +22107,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1021
-     VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], &amp_fp[0] );
+     VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[1] += amp_sv[0];
      jamp_sv[4] -= amp_sv[0];
      jamp_sv[10] -= amp_sv[0];
@@ -22124,7 +22124,7 @@ namespace mg5amcCpu
      jamp_sv[89] -= amp_sv[0];
      jamp_sv[95] += amp_sv[0];
      jamp_sv[111] -= amp_sv[0];
-     VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], &amp_fp[0] );
+     VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[2] += amp_sv[0];
      jamp_sv[3] -= amp_sv[0];
      jamp_sv[10] -= amp_sv[0];
@@ -22141,7 +22141,7 @@ namespace mg5amcCpu
      jamp_sv[83] -= amp_sv[0];
      jamp_sv[111] -= amp_sv[0];
      jamp_sv[113] += amp_sv[0];
-     VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], &amp_fp[0] );
+     VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[1] -= amp_sv[0];
      jamp_sv[2] += amp_sv[0];
      jamp_sv[3] -= amp_sv[0];
@@ -22165,7 +22165,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1022
-     VVV1_0( w_fp[101], w_fp[108], w_fp[5], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[101], w_fp[108], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1022 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -22193,7 +22193,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1023
-     VVV1_0( w_fp[101], w_fp[1], w_fp[10], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[101], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1023 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -22221,7 +22221,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1024
-     VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], &amp_fp[0] );
+     VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[11] += amp_sv[0];
      jamp_sv[21] -= amp_sv[0];
      jamp_sv[25] -= amp_sv[0];
@@ -22238,7 +22238,7 @@ namespace mg5amcCpu
      jamp_sv[88] += amp_sv[0];
      jamp_sv[94] -= amp_sv[0];
      jamp_sv[109] += amp_sv[0];
-     VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], &amp_fp[0] );
+     VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[10] -= amp_sv[0];
      jamp_sv[11] += amp_sv[0];
      jamp_sv[20] += amp_sv[0];
@@ -22255,7 +22255,7 @@ namespace mg5amcCpu
      jamp_sv[83] -= amp_sv[0];
      jamp_sv[108] -= amp_sv[0];
      jamp_sv[109] += amp_sv[0];
-     VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], &amp_fp[0] );
+     VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[10] -= amp_sv[0];
      jamp_sv[20] += amp_sv[0];
      jamp_sv[25] += amp_sv[0];
@@ -22279,7 +22279,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1025
-     VVV1_0( w_fp[0], w_fp[108], w_fp[16], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[0], w_fp[108], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1025 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -22307,7 +22307,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1026
-     VVV1_0( w_fp[0], w_fp[68], w_fp[10], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[0], w_fp[68], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1026 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -22335,7 +22335,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1027
-     VVV1_0( w_fp[27], w_fp[5], w_fp[11], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[27], w_fp[5], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[11] += amp_sv[0];
      jamp_sv[15] -= amp_sv[0];
      jamp_sv[17] += amp_sv[0];
@@ -22352,7 +22352,7 @@ namespace mg5amcCpu
      jamp_sv[88] += amp_sv[0];
      jamp_sv[94] -= amp_sv[0];
      jamp_sv[112] += amp_sv[0];
-     VVV1_0( w_fp[27], w_fp[5], w_fp[42], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[27], w_fp[5], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[1] -= amp_sv[0];
      jamp_sv[2] += amp_sv[0];
      jamp_sv[3] -= amp_sv[0];
@@ -22369,7 +22369,7 @@ namespace mg5amcCpu
      jamp_sv[89] += amp_sv[0];
      jamp_sv[95] -= amp_sv[0];
      jamp_sv[113] += amp_sv[0];
-     VVV1_0( w_fp[27], w_fp[5], w_fp[76], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[27], w_fp[5], w_fp[76], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[1] -= amp_sv[0];
      jamp_sv[2] += amp_sv[0];
      jamp_sv[3] -= amp_sv[0];
@@ -22390,12 +22390,12 @@ namespace mg5amcCpu
      // *** DIAGRAM 1028 OF 1240 ***

      // Wavefunction(s) for diagram number 1028
-     VVVV1P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 0., 0., w_fp[10] );
-     VVVV3P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 0., 0., w_fp[16] );
-     VVVV4P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 0., 0., w_fp[111] );
+     VVVV1P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[10] );
+     VVVV3P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[16] );
+     VVVV4P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[111] );

      // Amplitude(s) for diagram number 1028
-     VVV1_0( w_fp[8], w_fp[5], w_fp[10], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[8], w_fp[5], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[1] += amp_sv[0];
      jamp_sv[4] -= amp_sv[0];
      jamp_sv[10] -= amp_sv[0];
@@ -22412,7 +22412,7 @@ namespace mg5amcCpu
      jamp_sv[89] -= amp_sv[0];
      jamp_sv[95] += amp_sv[0];
      jamp_sv[110] -= amp_sv[0];
-     VVV1_0( w_fp[8], w_fp[5], w_fp[16], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[8], w_fp[5], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[10] -= amp_sv[0];
      jamp_sv[20] += amp_sv[0];
      jamp_sv[25] += amp_sv[0];
@@ -22429,7 +22429,7 @@ namespace mg5amcCpu
      jamp_sv[88] -= amp_sv[0];
      jamp_sv[94] += amp_sv[0];
      jamp_sv[108] -= amp_sv[0];
-     VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[1] -= amp_sv[0];
      jamp_sv[4] += amp_sv[0];
      jamp_sv[25] += amp_sv[0];
@@ -22453,7 +22453,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1029
-     VVV1_0( w_fp[1], w_fp[27], w_fp[115], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[27], w_fp[115], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[1] -= amp_sv[0];
      jamp_sv[4] += amp_sv[0];
      jamp_sv[10] += amp_sv[0];
@@ -22470,7 +22470,7 @@ namespace mg5amcCpu
      jamp_sv[89] += amp_sv[0];
      jamp_sv[95] -= amp_sv[0];
      jamp_sv[110] += amp_sv[0];
-     VVV1_0( w_fp[1], w_fp[27], w_fp[116], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[27], w_fp[116], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[12] -= amp_sv[0];
      jamp_sv[13] += amp_sv[0];
      jamp_sv[15] += amp_sv[0];
@@ -22487,7 +22487,7 @@ namespace mg5amcCpu
      jamp_sv[77] += amp_sv[0];
      jamp_sv[110] += amp_sv[0];
      jamp_sv[111] -= amp_sv[0];
-     VVV1_0( w_fp[1], w_fp[27], w_fp[117], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[27], w_fp[117], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[1] += amp_sv[0];
      jamp_sv[4] -= amp_sv[0];
      jamp_sv[10] -= amp_sv[0];
@@ -22511,7 +22511,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1030
-     VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[11] -= amp_sv[0];
      jamp_sv[15] += amp_sv[0];
      jamp_sv[17] -= amp_sv[0];
@@ -22528,7 +22528,7 @@ namespace mg5amcCpu
      jamp_sv[88] -= amp_sv[0];
      jamp_sv[94] += amp_sv[0];
      jamp_sv[112] -= amp_sv[0];
-     VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[15] += amp_sv[0];
      jamp_sv[17] -= amp_sv[0];
      jamp_sv[26] -= amp_sv[0];
@@ -22545,7 +22545,7 @@ namespace mg5amcCpu
      jamp_sv[77] += amp_sv[0];
      jamp_sv[109] += amp_sv[0];
      jamp_sv[112] -= amp_sv[0];
-     VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[11] += amp_sv[0];
      jamp_sv[21] -= amp_sv[0];
      jamp_sv[25] -= amp_sv[0];
@@ -22569,7 +22569,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1031
-     VVV1_0( w_fp[56], w_fp[67], w_fp[4], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[56], w_fp[67], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1031 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -22597,7 +22597,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1032
-     VVV1_0( w_fp[56], w_fp[1], w_fp[19], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[56], w_fp[1], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1032 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -22625,7 +22625,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1033
-     VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], &amp_fp[0] );
+     VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[0] += amp_sv[0];
      jamp_sv[1] -= amp_sv[0];
      jamp_sv[6] -= amp_sv[0];
@@ -22642,7 +22642,7 @@ namespace mg5amcCpu
      jamp_sv[95] -= amp_sv[0];
      jamp_sv[117] -= amp_sv[0];
      jamp_sv[119] += amp_sv[0];
-     VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], &amp_fp[0] );
+     VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[3] += amp_sv[0];
      jamp_sv[5] -= amp_sv[0];
      jamp_sv[6] -= amp_sv[0];
@@ -22659,7 +22659,7 @@ namespace mg5amcCpu
      jamp_sv[71] += amp_sv[0];
      jamp_sv[93] += amp_sv[0];
      jamp_sv[117] -= amp_sv[0];
-     VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], &amp_fp[0] );
+     VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[0] -= amp_sv[0];
      jamp_sv[1] += amp_sv[0];
      jamp_sv[3] += amp_sv[0];
@@ -22683,7 +22683,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1034
-     VVV1_0( w_fp[96], w_fp[108], w_fp[4], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[96], w_fp[108], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1034 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -22711,7 +22711,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1035
-     VVV1_0( w_fp[96], w_fp[1], w_fp[13], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[96], w_fp[1], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1035 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -22739,7 +22739,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1036
-     VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], &amp_fp[0] );
+     VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[17] += amp_sv[0];
      jamp_sv[23] -= amp_sv[0];
      jamp_sv[27] -= amp_sv[0];
@@ -22756,7 +22756,7 @@ namespace mg5amcCpu
      jamp_sv[70] -= amp_sv[0];
      jamp_sv[91] -= amp_sv[0];
      jamp_sv[115] += amp_sv[0];
-     VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], &amp_fp[0] );
+     VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[16] -= amp_sv[0];
      jamp_sv[17] += amp_sv[0];
      jamp_sv[22] += amp_sv[0];
@@ -22773,7 +22773,7 @@ namespace mg5amcCpu
      jamp_sv[91] -= amp_sv[0];
      jamp_sv[114] -= amp_sv[0];
      jamp_sv[115] += amp_sv[0];
-     VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], &amp_fp[0] );
+     VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[16] -= amp_sv[0];
      jamp_sv[22] += amp_sv[0];
      jamp_sv[27] += amp_sv[0];
@@ -22797,7 +22797,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1037
-     VVV1_0( w_fp[0], w_fp[108], w_fp[19], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[0], w_fp[108], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1037 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -22825,7 +22825,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1038
-     VVV1_0( w_fp[0], w_fp[67], w_fp[13], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[0], w_fp[67], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1038 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -22853,7 +22853,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1039
-     VVV1_0( w_fp[4], w_fp[29], w_fp[11], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[4], w_fp[29], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[9] += amp_sv[0];
      jamp_sv[11] -= amp_sv[0];
      jamp_sv[17] -= amp_sv[0];
@@ -22870,7 +22870,7 @@ namespace mg5amcCpu
      jamp_sv[70] += amp_sv[0];
      jamp_sv[94] += amp_sv[0];
      jamp_sv[118] -= amp_sv[0];
-     VVV1_0( w_fp[4], w_fp[29], w_fp[42], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[4], w_fp[29], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[0] -= amp_sv[0];
      jamp_sv[1] += amp_sv[0];
      jamp_sv[3] += amp_sv[0];
@@ -22887,7 +22887,7 @@ namespace mg5amcCpu
      jamp_sv[71] += amp_sv[0];
      jamp_sv[95] += amp_sv[0];
      jamp_sv[119] -= amp_sv[0];
-     VVV1_0( w_fp[4], w_fp[29], w_fp[76], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[4], w_fp[29], w_fp[76], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[0] -= amp_sv[0];
      jamp_sv[1] += amp_sv[0];
      jamp_sv[3] += amp_sv[0];
@@ -22908,12 +22908,12 @@ namespace mg5amcCpu
      // *** DIAGRAM 1040 OF 1240 ***

      // Wavefunction(s) for diagram number 1040
-     VVVV1P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 0., 0., w_fp[76] );
-     VVVV3P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 0., 0., w_fp[42] );
-     VVVV4P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 0., 0., w_fp[11] );
+     VVVV1P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[76] );
+     VVVV3P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[42] );
+     VVVV4P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[11] );

      // Amplitude(s) for diagram number 1040
-     VVV1_0( w_fp[8], w_fp[4], w_fp[76], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[8], w_fp[4], w_fp[76], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[3] += amp_sv[0];
      jamp_sv[5] -= amp_sv[0];
      jamp_sv[16] -= amp_sv[0];
@@ -22930,7 +22930,7 @@ namespace mg5amcCpu
      jamp_sv[71] += amp_sv[0];
      jamp_sv[92] += amp_sv[0];
      jamp_sv[116] -= amp_sv[0];
-     VVV1_0( w_fp[8], w_fp[4], w_fp[42], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[8], w_fp[4], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[16] -= amp_sv[0];
      jamp_sv[22] += amp_sv[0];
      jamp_sv[27] += amp_sv[0];
@@ -22947,7 +22947,7 @@ namespace mg5amcCpu
      jamp_sv[70] += amp_sv[0];
      jamp_sv[90] += amp_sv[0];
      jamp_sv[114] -= amp_sv[0];
-     VVV1_0( w_fp[8], w_fp[4], w_fp[11], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[8], w_fp[4], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[3] -= amp_sv[0];
      jamp_sv[5] += amp_sv[0];
      jamp_sv[27] += amp_sv[0];
@@ -22971,7 +22971,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1041
-     VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[3] -= amp_sv[0];
      jamp_sv[5] += amp_sv[0];
      jamp_sv[16] += amp_sv[0];
@@ -22988,7 +22988,7 @@ namespace mg5amcCpu
      jamp_sv[71] -= amp_sv[0];
      jamp_sv[92] -= amp_sv[0];
      jamp_sv[116] += amp_sv[0];
-     VVV1_0( w_fp[1], w_fp[29], w_fp[95], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[29], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[6] -= amp_sv[0];
      jamp_sv[7] += amp_sv[0];
      jamp_sv[9] += amp_sv[0];
@@ -23005,7 +23005,7 @@ namespace mg5amcCpu
      jamp_sv[93] += amp_sv[0];
      jamp_sv[116] += amp_sv[0];
      jamp_sv[117] -= amp_sv[0];
-     VVV1_0( w_fp[1], w_fp[29], w_fp[105], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[29], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[3] += amp_sv[0];
      jamp_sv[5] -= amp_sv[0];
      jamp_sv[6] -= amp_sv[0];
@@ -23029,7 +23029,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1042
-     VVV1_0( w_fp[1], w_fp[8], w_fp[87], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[8], w_fp[87], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[9] -= amp_sv[0];
      jamp_sv[11] += amp_sv[0];
      jamp_sv[17] += amp_sv[0];
@@ -23046,7 +23046,7 @@ namespace mg5amcCpu
      jamp_sv[70] -= amp_sv[0];
      jamp_sv[94] -= amp_sv[0];
      jamp_sv[118] += amp_sv[0];
-     VVV1_0( w_fp[1], w_fp[8], w_fp[34], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[8], w_fp[34], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[17] += amp_sv[0];
      jamp_sv[23] -= amp_sv[0];
      jamp_sv[27] -= amp_sv[0];
@@ -23063,7 +23063,7 @@ namespace mg5amcCpu
      jamp_sv[70] -= amp_sv[0];
      jamp_sv[91] -= amp_sv[0];
      jamp_sv[115] += amp_sv[0];
-     VVV1_0( w_fp[1], w_fp[8], w_fp[86], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[8], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[9] += amp_sv[0];
      jamp_sv[11] -= amp_sv[0];
      jamp_sv[24] -= amp_sv[0];
@@ -23087,7 +23087,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1043
-     VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], &amp_fp[0] );
+     VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[0] += amp_sv[0];
      jamp_sv[1] -= amp_sv[0];
      jamp_sv[3] -= amp_sv[0];
@@ -23104,7 +23104,7 @@ namespace mg5amcCpu
      jamp_sv[95] -= amp_sv[0];
      jamp_sv[118] -= amp_sv[0];
      jamp_sv[119] += amp_sv[0];
-     VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], &amp_fp[0] );
+     VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[0] += amp_sv[0];
      jamp_sv[1] -= amp_sv[0];
      jamp_sv[3] -= amp_sv[0];
@@ -23121,7 +23121,7 @@ namespace mg5amcCpu
      jamp_sv[71] -= amp_sv[0];
      jamp_sv[95] -= amp_sv[0];
      jamp_sv[119] += amp_sv[0];
-     VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], &amp_fp[0] );
+     VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[9] -= amp_sv[0];
      jamp_sv[11] += amp_sv[0];
      jamp_sv[17] += amp_sv[0];
@@ -23138,7 +23138,7 @@ namespace mg5amcCpu
      jamp_sv[70] -= amp_sv[0];
      jamp_sv[94] -= amp_sv[0];
      jamp_sv[118] += amp_sv[0];
-     VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], &amp_fp[0] );
+     VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[1] -= amp_sv[0];
      jamp_sv[2] += amp_sv[0];
      jamp_sv[3] -= amp_sv[0];
@@ -23155,7 +23155,7 @@ namespace mg5amcCpu
      jamp_sv[95] -= amp_sv[0];
      jamp_sv[112] -= amp_sv[0];
      jamp_sv[113] += amp_sv[0];
-     VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], &amp_fp[0] );
+     VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[1] -= amp_sv[0];
      jamp_sv[2] += amp_sv[0];
      jamp_sv[3] -= amp_sv[0];
@@ -23172,7 +23172,7 @@ namespace mg5amcCpu
      jamp_sv[89] += amp_sv[0];
      jamp_sv[95] -= amp_sv[0];
      jamp_sv[113] += amp_sv[0];
-     VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], &amp_fp[0] );
+     VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[11] += amp_sv[0];
      jamp_sv[15] -= amp_sv[0];
      jamp_sv[17] += amp_sv[0];
@@ -23189,7 +23189,7 @@ namespace mg5amcCpu
      jamp_sv[88] += amp_sv[0];
      jamp_sv[94] -= amp_sv[0];
      jamp_sv[112] += amp_sv[0];
-     VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], &amp_fp[0] );
+     VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[0] -= amp_sv[0];
      jamp_sv[2] += amp_sv[0];
      jamp_sv[4] += amp_sv[0];
@@ -23206,7 +23206,7 @@ namespace mg5amcCpu
      jamp_sv[113] += amp_sv[0];
      jamp_sv[118] += amp_sv[0];
      jamp_sv[119] -= amp_sv[0];
-     VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], &amp_fp[0] );
+     VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[0] -= amp_sv[0];
      jamp_sv[2] += amp_sv[0];
      jamp_sv[4] += amp_sv[0];
@@ -23223,7 +23223,7 @@ namespace mg5amcCpu
      jamp_sv[89] += amp_sv[0];
      jamp_sv[113] += amp_sv[0];
      jamp_sv[119] -= amp_sv[0];
-     VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], &amp_fp[0] );
+     VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &amp_fp[0] );
      jamp_sv[9] += amp_sv[0];
      jamp_sv[15] -= amp_sv[0];
      jamp_sv[21] -= amp_sv[0];
@@ -23247,7 +23247,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1044
-     VVV1_0( w_fp[1], w_fp[30], w_fp[56], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[30], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[0] += amp_sv[0];
      jamp_sv[1] -= amp_sv[0];
      jamp_sv[3] -= amp_sv[0];
@@ -23264,7 +23264,7 @@ namespace mg5amcCpu
      jamp_sv[71] -= amp_sv[0];
      jamp_sv[95] -= amp_sv[0];
      jamp_sv[119] += amp_sv[0];
-     VVV1_0( w_fp[1], w_fp[31], w_fp[56], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[31], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[1] -= amp_sv[0];
      jamp_sv[2] += amp_sv[0];
      jamp_sv[3] -= amp_sv[0];
@@ -23281,7 +23281,7 @@ namespace mg5amcCpu
      jamp_sv[89] += amp_sv[0];
      jamp_sv[95] -= amp_sv[0];
      jamp_sv[113] += amp_sv[0];
-     VVV1_0( w_fp[1], w_fp[32], w_fp[56], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[32], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[0] -= amp_sv[0];
      jamp_sv[2] += amp_sv[0];
      jamp_sv[4] += amp_sv[0];
@@ -23305,7 +23305,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1045
-     VVV1_0( w_fp[1], w_fp[8], w_fp[92], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[8], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[9] -= amp_sv[0];
      jamp_sv[11] += amp_sv[0];
      jamp_sv[17] += amp_sv[0];
@@ -23322,7 +23322,7 @@ namespace mg5amcCpu
      jamp_sv[70] -= amp_sv[0];
      jamp_sv[94] -= amp_sv[0];
      jamp_sv[118] += amp_sv[0];
-     VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[11] += amp_sv[0];
      jamp_sv[15] -= amp_sv[0];
      jamp_sv[17] += amp_sv[0];
@@ -23339,7 +23339,7 @@ namespace mg5amcCpu
      jamp_sv[88] += amp_sv[0];
      jamp_sv[94] -= amp_sv[0];
      jamp_sv[112] += amp_sv[0];
-     VVV1_0( w_fp[1], w_fp[8], w_fp[106], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[1], w_fp[8], w_fp[106], COUPs[0], 1.0, &amp_fp[0] );
      jamp_sv[9] += amp_sv[0];
      jamp_sv[15] -= amp_sv[0];
      jamp_sv[21] -= amp_sv[0];
@@ -23363,7 +23363,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1046
-     FFV1_0( w_fp[58], w_fp[114], w_fp[6], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[58], w_fp[114], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1046 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23376,7 +23376,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1047
-     FFV1_0( w_fp[48], w_fp[114], w_fp[1], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[48], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1047 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23389,7 +23389,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1048
-     FFV1_0( w_fp[104], w_fp[100], w_fp[6], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[104], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1048 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23402,7 +23402,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1049
-     FFV1_0( w_fp[104], w_fp[36], w_fp[1], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[104], w_fp[36], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1049 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23415,7 +23415,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1050
-     FFV1_0( w_fp[48], w_fp[100], w_fp[0], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[48], w_fp[100], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1050 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23428,7 +23428,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1051
-     FFV1_0( w_fp[58], w_fp[36], w_fp[0], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[58], w_fp[36], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1051 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23441,7 +23441,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1052
-     FFV1_0( w_fp[60], w_fp[114], w_fp[5], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[60], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1052 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23454,7 +23454,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1053
-     FFV1_0( w_fp[40], w_fp[114], w_fp[1], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[40], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1053 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23467,7 +23467,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1054
-     FFV1_0( w_fp[62], w_fp[100], w_fp[5], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[62], w_fp[100], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1054 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23480,7 +23480,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1055
-     FFV1_0( w_fp[62], w_fp[35], w_fp[1], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[62], w_fp[35], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1055 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23493,7 +23493,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1056
-     FFV1_0( w_fp[40], w_fp[100], w_fp[0], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[40], w_fp[100], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1056 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23506,7 +23506,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1057
-     FFV1_0( w_fp[60], w_fp[35], w_fp[0], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[60], w_fp[35], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1057 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23519,7 +23519,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1058
-     FFV1_0( w_fp[3], w_fp[114], w_fp[67], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[3], w_fp[114], w_fp[67], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1058 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23535,7 +23535,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1059
-     FFV1_0( w_fp[12], w_fp[114], w_fp[1], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[12], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1059 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23549,7 +23549,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1060
-     FFV1_0( w_fp[3], w_fp[100], w_fp[96], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[3], w_fp[100], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1060 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23565,7 +23565,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1061
-     VVV1_0( w_fp[96], w_fp[1], w_fp[37], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[96], w_fp[1], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1061 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23585,7 +23585,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1062
-     FFV1_0( w_fp[12], w_fp[100], w_fp[0], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[12], w_fp[100], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1062 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23599,7 +23599,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1063
-     VVV1_0( w_fp[0], w_fp[67], w_fp[37], COUPs[0], &amp_fp[0] );
+     VVV1_0( w_fp[0], w_fp[67], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1063 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23619,7 +23619,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1064
-     FFV1_0( w_fp[3], w_fp[33], w_fp[76], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[3], w_fp[33], w_fp[76], COUPs[1], 1.0, &amp_fp[0] );
      jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
@@ -23628,7 +23628,7 @@ namespace mg5amcCpu
      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-     FFV1_0( w_fp[3], w_fp[33], w_fp[42], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[3], w_fp[33], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
      jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -23637,7 +23637,7 @@ namespace mg5amcCpu
      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-     FFV1_0( w_fp[3], w_fp[33], w_fp[11], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[3], w_fp[33], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
      jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
      jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -23653,7 +23653,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1065
-     FFV1_0( w_fp[78], w_fp[102], w_fp[6], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[78], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1065 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23666,7 +23666,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1066
-     FFV1_0( w_fp[53], w_fp[102], w_fp[1], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[53], w_fp[102], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1066 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23679,7 +23679,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1067
-     FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1067 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23692,7 +23692,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1068
-     FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1068 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23705,7 +23705,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1069
-     FFV1_0( w_fp[53], w_fp[89], w_fp[0], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[53], w_fp[89], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1069 ) numerators_sv += cxabs2( amp_sv[0] );
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
@@ -23718,7 +23718,7 @@ namespace mg5amcCpu
      // (none)

      // Amplitude(s) for diagram number 1070
-     FFV1_0( w_fp[78], w_fp[44], w_fp[0], COUPs[1], &amp_fp[0] );
+     FFV1_0( w_fp[78], w_fp[44], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1070 ) numerators_sv +=
cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -23731,7 +23731,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1071 - FFV1_0( w_fp[60], w_fp[102], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1071 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -23744,7 +23744,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1072 - FFV1_0( w_fp[28], w_fp[102], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[28], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1072 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -23757,7 +23757,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1073 - FFV1_0( w_fp[62], w_fp[89], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1073 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -23770,7 +23770,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1074 - FFV1_0( w_fp[62], w_fp[43], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1074 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -23783,7 +23783,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1075 - FFV1_0( w_fp[28], w_fp[89], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[28], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1075 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -23796,7 +23796,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1076 - FFV1_0( w_fp[60], w_fp[43], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1076 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -23809,7 +23809,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1077 - FFV1_0( w_fp[3], w_fp[102], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[102], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1077 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -23825,7 +23825,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1078 - FFV1_0( w_fp[14], w_fp[102], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[14], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1078 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -23839,7 +23839,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1079 - FFV1_0( w_fp[3], w_fp[89], w_fp[101], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[89], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1079 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -23855,7 +23855,7 @@ namespace mg5amcCpu // 
(none) // Amplitude(s) for diagram number 1080 - VVV1_0( w_fp[101], w_fp[1], w_fp[45], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[101], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1080 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -23875,7 +23875,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1081 - FFV1_0( w_fp[14], w_fp[89], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[14], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1081 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -23889,7 +23889,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1082 - VVV1_0( w_fp[0], w_fp[68], w_fp[45], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[68], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1082 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -23909,7 +23909,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1083 - FFV1_0( w_fp[3], w_fp[39], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[10], COUPs[1], 1.0, &_fp[0] ); jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; @@ -23918,7 +23918,7 @@ namespace mg5amcCpu jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[16], COUPs[1], 1.0, &_fp[0] ); jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -23927,7 +23927,7 @@ namespace mg5amcCpu jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &_fp[0] ); jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -23943,7 +23943,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1084 - FFV1_0( w_fp[78], w_fp[113], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[78], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1084 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -23956,7 +23956,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1085 - FFV1_0( w_fp[7], w_fp[113], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[7], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1085 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -23969,7 +23969,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1086 - FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1086 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -23982,7 +23982,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1087 - FFV1_0( w_fp[99], 
w_fp[50], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1087 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -23995,7 +23995,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1088 - FFV1_0( w_fp[7], w_fp[91], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[7], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1088 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24008,7 +24008,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1089 - FFV1_0( w_fp[78], w_fp[50], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[78], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1089 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24021,7 +24021,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1090 - FFV1_0( w_fp[58], w_fp[113], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[58], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1090 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24034,7 +24034,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1091 - FFV1_0( w_fp[25], w_fp[113], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[25], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1091 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24047,7 +24047,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1092 - FFV1_0( w_fp[104], w_fp[91], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[91], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1092 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24060,7 +24060,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1093 - FFV1_0( w_fp[104], w_fp[49], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[49], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1093 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24073,7 +24073,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1094 - FFV1_0( w_fp[25], w_fp[91], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[25], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1094 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24086,7 +24086,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1095 - FFV1_0( w_fp[58], w_fp[49], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[58], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1095 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24099,7 +24099,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1096 - FFV1_0( w_fp[3], w_fp[113], w_fp[59], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[113], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1096 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24115,7 +24115,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1097 - FFV1_0( w_fp[26], w_fp[113], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[26], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1097 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24129,7 +24129,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1098 - FFV1_0( w_fp[3], w_fp[91], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[91], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1098 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24145,7 +24145,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1099 - VVV1_0( w_fp[98], w_fp[1], w_fp[51], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[98], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1099 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24165,7 +24165,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1100 - FFV1_0( w_fp[26], w_fp[91], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[26], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1100 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24179,7 +24179,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1101 - VVV1_0( w_fp[0], w_fp[59], w_fp[51], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[59], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1101 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24199,7 +24199,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1102 - FFV1_0( w_fp[3], w_fp[47], w_fp[97], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[97], COUPs[1], 1.0, &_fp[0] ); jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24208,7 +24208,7 @@ namespace mg5amcCpu jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] ); jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24217,7 +24217,7 @@ namespace mg5amcCpu jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24233,7 +24233,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1103 - FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1103 ) numerators_sv 
+= cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24249,7 +24249,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1104 - FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1104 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24263,7 +24263,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1105 - FFV1_0( w_fp[78], w_fp[2], w_fp[96], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[78], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1105 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24279,7 +24279,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1106 - VVV1_0( w_fp[96], w_fp[1], w_fp[54], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[96], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1106 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24299,7 +24299,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1107 - FFV1_0( w_fp[78], w_fp[18], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[78], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1107 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24313,7 +24313,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1108 - VVV1_0( w_fp[0], w_fp[67], w_fp[54], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[67], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1108 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24333,7 +24333,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1109 - FFV1_0( w_fp[46], w_fp[2], w_fp[76], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[76], COUPs[1], 1.0, &_fp[0] ); jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24342,7 +24342,7 @@ namespace mg5amcCpu jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[42], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24351,7 +24351,7 @@ namespace mg5amcCpu jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[11], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24367,7 +24367,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1110 - FFV1_0( w_fp[104], w_fp[2], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1110 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( 
amp_sv[0] ); @@ -24383,7 +24383,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1111 - FFV1_0( w_fp[104], w_fp[15], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1111 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24397,7 +24397,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1112 - FFV1_0( w_fp[58], w_fp[2], w_fp[101], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[58], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1112 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24413,7 +24413,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1113 - VVV1_0( w_fp[101], w_fp[1], w_fp[23], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[101], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1113 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24433,7 +24433,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1114 - FFV1_0( w_fp[58], w_fp[15], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[58], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1114 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24447,7 +24447,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1115 - VVV1_0( w_fp[0], w_fp[68], w_fp[23], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[68], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1115 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24467,7 +24467,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1116 - FFV1_0( w_fp[38], w_fp[2], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24476,7 +24476,7 @@ namespace mg5amcCpu jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24485,7 +24485,7 @@ namespace mg5amcCpu jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24501,7 +24501,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1117 - FFV1_0( w_fp[62], w_fp[2], w_fp[59], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1117 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24517,7 +24517,7 @@ namespace mg5amcCpu // (none) // 
Amplitude(s) for diagram number 1118 - FFV1_0( w_fp[62], w_fp[17], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1118 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24531,7 +24531,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1119 - FFV1_0( w_fp[60], w_fp[2], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1119 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24547,7 +24547,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1120 - VVV1_0( w_fp[98], w_fp[1], w_fp[20], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[98], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1120 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24567,7 +24567,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1121 - FFV1_0( w_fp[60], w_fp[17], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1121 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24581,7 +24581,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1122 - VVV1_0( w_fp[0], w_fp[59], w_fp[20], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[59], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId == 1122 ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); @@ -24601,7 +24601,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1123 - FFV1_0( w_fp[41], w_fp[2], w_fp[97], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[97], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24610,7 +24610,7 @@ namespace mg5amcCpu jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24619,7 +24619,7 @@ namespace mg5amcCpu jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24632,12 +24632,12 @@ namespace mg5amcCpu // *** DIAGRAM 1124 OF 1240 *** // Wavefunction(s) for diagram number 1124 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[21] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[71] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[97] ); + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[71] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], 
COUPs[2], 1.0, 0., 0., w_fp[97] ); // Amplitude(s) for diagram number 1124 - VVVV1_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; jamp_sv[6] -= amp_sv[0]; @@ -24654,7 +24654,7 @@ namespace mg5amcCpu jamp_sv[115] -= amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; jamp_sv[30] -= amp_sv[0]; @@ -24671,7 +24671,7 @@ namespace mg5amcCpu jamp_sv[115] -= amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[1] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; jamp_sv[31] -= amp_sv[0]; @@ -24688,7 +24688,7 @@ namespace mg5amcCpu jamp_sv[98] += amp_sv[0]; jamp_sv[104] += amp_sv[0]; jamp_sv[110] -= amp_sv[0]; - VVVV1_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[6] -= amp_sv[0]; jamp_sv[7] += amp_sv[0]; jamp_sv[24] += amp_sv[0]; @@ -24705,7 +24705,7 @@ namespace mg5amcCpu jamp_sv[116] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - VVVV3_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[6] -= amp_sv[0]; jamp_sv[24] += amp_sv[0]; jamp_sv[30] -= amp_sv[0]; @@ -24722,7 +24722,7 @@ namespace mg5amcCpu jamp_sv[116] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - VVVV4_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[7] -= amp_sv[0]; jamp_sv[25] += amp_sv[0]; jamp_sv[31] -= amp_sv[0]; @@ -24739,7 +24739,7 @@ namespace mg5amcCpu jamp_sv[102] -= amp_sv[0]; jamp_sv[104] += amp_sv[0]; jamp_sv[108] -= amp_sv[0]; - VVVV1_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] -= amp_sv[0]; jamp_sv[1] += amp_sv[0]; jamp_sv[24] += amp_sv[0]; @@ -24756,7 +24756,7 @@ namespace mg5amcCpu jamp_sv[116] += amp_sv[0]; jamp_sv[118] += amp_sv[0]; jamp_sv[119] -= amp_sv[0]; - VVVV3_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] -= amp_sv[0]; jamp_sv[24] += amp_sv[0]; jamp_sv[48] += amp_sv[0]; @@ -24773,7 +24773,7 @@ namespace mg5amcCpu jamp_sv[116] += amp_sv[0]; jamp_sv[118] += amp_sv[0]; jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[1] -= amp_sv[0]; jamp_sv[25] += amp_sv[0]; jamp_sv[49] += amp_sv[0]; @@ -24794,12 +24794,12 @@ namespace mg5amcCpu // *** DIAGRAM 1125 OF 1240 *** // Wavefunction(s) for diagram number 1125 - VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 0., 0., w_fp[59] ); - VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 0., 0., w_fp[20] ); - VVV1P0_1( w_fp[97], w_fp[5], COUPs[0], 0., 0., w_fp[60] ); + VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[59] ); + VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] ); + VVV1P0_1( w_fp[97], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[60] ); // 
Amplitude(s) for diagram number 1125 - VVV1_0( w_fp[8], w_fp[6], w_fp[59], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[59], COUPs[0], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; jamp_sv[30] -= amp_sv[0]; @@ -24816,7 +24816,7 @@ namespace mg5amcCpu jamp_sv[115] -= amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[20], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[20], COUPs[0], 1.0, &_fp[0] ); jamp_sv[6] -= amp_sv[0]; jamp_sv[24] += amp_sv[0]; jamp_sv[30] -= amp_sv[0]; @@ -24833,7 +24833,7 @@ namespace mg5amcCpu jamp_sv[116] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &_fp[0] ); jamp_sv[0] -= amp_sv[0]; jamp_sv[24] += amp_sv[0]; jamp_sv[48] += amp_sv[0]; @@ -24854,12 +24854,12 @@ namespace mg5amcCpu // *** DIAGRAM 1126 OF 1240 *** // Wavefunction(s) for diagram number 1126 - VVV1P0_1( w_fp[21], w_fp[6], COUPs[0], 0., 0., w_fp[17] ); - VVV1P0_1( w_fp[71], w_fp[6], COUPs[0], 0., 0., w_fp[98] ); - VVV1P0_1( w_fp[97], w_fp[6], COUPs[0], 0., 0., w_fp[111] ); + VVV1P0_1( w_fp[21], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[17] ); + VVV1P0_1( w_fp[71], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[98] ); + VVV1P0_1( w_fp[97], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[111] ); // Amplitude(s) for diagram number 1126 - VVV1_0( w_fp[8], w_fp[5], w_fp[17], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[17], COUPs[0], 1.0, &_fp[0] ); jamp_sv[1] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; jamp_sv[31] -= amp_sv[0]; @@ -24876,7 +24876,7 @@ namespace mg5amcCpu jamp_sv[98] += amp_sv[0]; jamp_sv[104] += amp_sv[0]; jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] ); jamp_sv[7] -= amp_sv[0]; jamp_sv[25] += amp_sv[0]; jamp_sv[31] -= amp_sv[0]; @@ -24893,7 +24893,7 @@ namespace mg5amcCpu jamp_sv[102] -= amp_sv[0]; jamp_sv[104] += amp_sv[0]; jamp_sv[108] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &_fp[0] ); jamp_sv[1] -= amp_sv[0]; jamp_sv[25] += amp_sv[0]; jamp_sv[49] += amp_sv[0]; @@ -24917,7 +24917,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1127 - VVV1_0( w_fp[21], w_fp[8], w_fp[29], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[21], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; jamp_sv[6] -= amp_sv[0]; @@ -24934,7 +24934,7 @@ namespace mg5amcCpu jamp_sv[115] -= amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[71], w_fp[8], w_fp[29], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[71], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] ); jamp_sv[6] -= amp_sv[0]; jamp_sv[7] += amp_sv[0]; jamp_sv[24] += amp_sv[0]; @@ -24951,7 +24951,7 @@ namespace mg5amcCpu jamp_sv[116] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[97], w_fp[8], w_fp[29], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[97], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] ); jamp_sv[0] -= amp_sv[0]; jamp_sv[1] += amp_sv[0]; jamp_sv[24] += amp_sv[0]; @@ -24972,22 +24972,22 @@ namespace mg5amcCpu // *** DIAGRAM 1128 OF 1240 *** // Wavefunction(s) for diagram number 1128 - FFV1_2( w_fp[3], w_fp[21], COUPs[1], cIPD[0], cIPD[1], w_fp[16] ); - FFV1_2( w_fp[3], w_fp[71], COUPs[1], cIPD[0], cIPD[1], w_fp[10] ); - FFV1_2( w_fp[3], w_fp[97], COUPs[1], cIPD[0], cIPD[1], w_fp[68] ); 
+ FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); + FFV1_2( w_fp[3], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); // Amplitude(s) for diagram number 1128 - FFV1_0( w_fp[16], w_fp[39], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); jamp_sv[90] += amp_sv[0]; jamp_sv[91] -= amp_sv[0]; jamp_sv[93] -= amp_sv[0]; jamp_sv[95] += amp_sv[0]; - FFV1_0( w_fp[10], w_fp[39], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[10], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); jamp_sv[91] -= amp_sv[0]; jamp_sv[92] += amp_sv[0]; jamp_sv[93] -= amp_sv[0]; jamp_sv[94] += amp_sv[0]; - FFV1_0( w_fp[68], w_fp[39], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[68], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); jamp_sv[90] -= amp_sv[0]; jamp_sv[92] += amp_sv[0]; jamp_sv[94] += amp_sv[0]; @@ -24999,7 +24999,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1129 - FFV1_0( w_fp[3], w_fp[39], w_fp[17], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[17], COUPs[1], 1.0, &_fp[0] ); jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25008,7 +25008,7 @@ namespace mg5amcCpu jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &_fp[0] ); jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25017,7 +25017,7 @@ namespace mg5amcCpu jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &_fp[0] ); jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -25033,17 +25033,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1130 - FFV1_0( w_fp[41], w_fp[39], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[72] += amp_sv[0]; jamp_sv[74] -= amp_sv[0]; jamp_sv[80] -= amp_sv[0]; jamp_sv[86] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[39], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); jamp_sv[74] -= amp_sv[0]; jamp_sv[78] += amp_sv[0]; jamp_sv[80] -= amp_sv[0]; jamp_sv[84] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[39], w_fp[97], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[39], w_fp[97], COUPs[1], 1.0, &_fp[0] ); jamp_sv[72] -= amp_sv[0]; jamp_sv[78] += amp_sv[0]; jamp_sv[84] += amp_sv[0]; @@ -25055,17 +25055,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1131 - FFV1_0( w_fp[16], w_fp[47], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); jamp_sv[114] += amp_sv[0]; jamp_sv[115] -= amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - FFV1_0( w_fp[10], w_fp[47], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[10], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); jamp_sv[115] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - FFV1_0( w_fp[68], w_fp[47], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[68], w_fp[47], w_fp[5], 
COUPs[1], 1.0, &_fp[0] ); jamp_sv[114] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; jamp_sv[118] += amp_sv[0]; @@ -25077,7 +25077,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1132 - FFV1_0( w_fp[3], w_fp[47], w_fp[59], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[59], COUPs[1], 1.0, &_fp[0] ); jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25086,7 +25086,7 @@ namespace mg5amcCpu jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[20], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[20], COUPs[1], 1.0, &_fp[0] ); jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25095,7 +25095,7 @@ namespace mg5amcCpu jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] ); jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -25111,17 +25111,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1133 - FFV1_0( w_fp[38], w_fp[47], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[96] += amp_sv[0]; jamp_sv[98] -= amp_sv[0]; jamp_sv[104] -= amp_sv[0]; jamp_sv[110] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[47], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] ); jamp_sv[98] -= amp_sv[0]; jamp_sv[102] += amp_sv[0]; jamp_sv[104] -= amp_sv[0]; jamp_sv[108] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[47], w_fp[97], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[47], w_fp[97], COUPs[1], 1.0, &_fp[0] ); jamp_sv[96] -= amp_sv[0]; jamp_sv[102] += amp_sv[0]; jamp_sv[108] += amp_sv[0]; @@ -25130,22 +25130,22 @@ namespace mg5amcCpu // *** DIAGRAM 1134 OF 1240 *** // Wavefunction(s) for diagram number 1134 - FFV1_1( w_fp[2], w_fp[21], COUPs[1], cIPD[0], cIPD[1], w_fp[23] ); - FFV1_1( w_fp[2], w_fp[71], COUPs[1], cIPD[0], cIPD[1], w_fp[21] ); - FFV1_1( w_fp[2], w_fp[97], COUPs[1], cIPD[0], cIPD[1], w_fp[71] ); + FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); + FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + FFV1_1( w_fp[2], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); // Amplitude(s) for diagram number 1134 - FFV1_0( w_fp[38], w_fp[23], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[23], w_fp[6], COUPs[1], 1.0, &_fp[0] ); jamp_sv[1] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; jamp_sv[31] -= amp_sv[0]; jamp_sv[55] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[21], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[21], w_fp[6], COUPs[1], 1.0, &_fp[0] ); jamp_sv[7] -= amp_sv[0]; jamp_sv[25] += amp_sv[0]; jamp_sv[31] -= amp_sv[0]; jamp_sv[49] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[71], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[71], w_fp[6], COUPs[1], 1.0, &_fp[0] ); jamp_sv[1] -= amp_sv[0]; jamp_sv[25] += amp_sv[0]; jamp_sv[49] += amp_sv[0]; @@ -25157,7 +25157,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1135 - FFV1_0( w_fp[38], w_fp[2], w_fp[17], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] 
); jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25166,7 +25166,7 @@ namespace mg5amcCpu jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25175,7 +25175,7 @@ namespace mg5amcCpu jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -25191,17 +25191,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1136 - FFV1_0( w_fp[41], w_fp[23], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; jamp_sv[30] -= amp_sv[0]; jamp_sv[54] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[21], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[21], w_fp[5], COUPs[1], 1.0, &_fp[0] ); jamp_sv[6] -= amp_sv[0]; jamp_sv[24] += amp_sv[0]; jamp_sv[30] -= amp_sv[0]; jamp_sv[48] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[71], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[71], w_fp[5], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] -= amp_sv[0]; jamp_sv[24] += amp_sv[0]; jamp_sv[48] += amp_sv[0]; @@ -25213,7 +25213,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1137 - FFV1_0( w_fp[41], w_fp[2], w_fp[59], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25222,7 +25222,7 @@ namespace mg5amcCpu jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[20], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] ); jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25231,7 +25231,7 @@ namespace mg5amcCpu jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -25247,7 +25247,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1138 - FFV1_0( w_fp[3], w_fp[23], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[23], w_fp[29], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25256,7 +25256,7 @@ namespace mg5amcCpu jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[21], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[21], 
w_fp[29], COUPs[1], 1.0, &_fp[0] ); jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -25265,7 +25265,7 @@ namespace mg5amcCpu jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[71], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[71], w_fp[29], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -25281,7 +25281,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1139 - FFV1_0( w_fp[16], w_fp[2], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25290,7 +25290,7 @@ namespace mg5amcCpu jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[10], w_fp[2], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[10], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25299,7 +25299,7 @@ namespace mg5amcCpu jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[68], w_fp[2], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[68], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -25312,12 +25312,12 @@ namespace mg5amcCpu // *** DIAGRAM 1140 OF 1240 *** // Wavefunction(s) for diagram number 1140 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[68] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[29] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[10] ); + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[68] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[29] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[10] ); // Amplitude(s) for diagram number 1140 - VVVV1_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[2] += amp_sv[0]; jamp_sv[3] -= amp_sv[0]; jamp_sv[12] -= amp_sv[0]; @@ -25334,7 +25334,7 @@ namespace mg5amcCpu jamp_sv[109] -= amp_sv[0]; jamp_sv[111] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; - VVVV3_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[2] += amp_sv[0]; jamp_sv[12] -= amp_sv[0]; jamp_sv[36] -= amp_sv[0]; @@ -25351,7 +25351,7 @@ namespace mg5amcCpu jamp_sv[111] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; - VVVV4_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[3] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -25368,7 +25368,7 @@ namespace mg5amcCpu jamp_sv[100] += amp_sv[0]; jamp_sv[106] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; - VVVV1_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 
1.0, &_fp[0] ); jamp_sv[12] -= amp_sv[0]; jamp_sv[13] += amp_sv[0]; jamp_sv[26] += amp_sv[0]; @@ -25385,7 +25385,7 @@ namespace mg5amcCpu jamp_sv[110] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; - VVVV3_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[12] -= amp_sv[0]; jamp_sv[26] += amp_sv[0]; jamp_sv[36] -= amp_sv[0]; @@ -25402,7 +25402,7 @@ namespace mg5amcCpu jamp_sv[111] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[114] -= amp_sv[0]; - VVVV4_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[13] -= amp_sv[0]; jamp_sv[27] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -25419,7 +25419,7 @@ namespace mg5amcCpu jamp_sv[103] -= amp_sv[0]; jamp_sv[106] += amp_sv[0]; jamp_sv[114] -= amp_sv[0]; - VVVV1_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[2] -= amp_sv[0]; jamp_sv[3] += amp_sv[0]; jamp_sv[26] += amp_sv[0]; @@ -25436,7 +25436,7 @@ namespace mg5amcCpu jamp_sv[110] += amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; - VVVV3_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[2] -= amp_sv[0]; jamp_sv[26] += amp_sv[0]; jamp_sv[48] += amp_sv[0]; @@ -25453,7 +25453,7 @@ namespace mg5amcCpu jamp_sv[113] -= amp_sv[0]; jamp_sv[114] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - VVVV4_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[3] -= amp_sv[0]; jamp_sv[27] += amp_sv[0]; jamp_sv[48] += amp_sv[0]; @@ -25474,12 +25474,12 @@ namespace mg5amcCpu // *** DIAGRAM 1141 OF 1240 *** // Wavefunction(s) for diagram number 1141 - VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 0., 0., w_fp[16] ); - VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 0., 0., w_fp[71] ); - VVV1P0_1( w_fp[10], w_fp[4], COUPs[0], 0., 0., w_fp[21] ); + VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[16] ); + VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[71] ); + VVV1P0_1( w_fp[10], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[21] ); // Amplitude(s) for diagram number 1141 - VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &_fp[0] ); jamp_sv[2] += amp_sv[0]; jamp_sv[12] -= amp_sv[0]; jamp_sv[36] -= amp_sv[0]; @@ -25496,7 +25496,7 @@ namespace mg5amcCpu jamp_sv[111] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &_fp[0] ); jamp_sv[12] -= amp_sv[0]; jamp_sv[26] += amp_sv[0]; jamp_sv[36] -= amp_sv[0]; @@ -25513,7 +25513,7 @@ namespace mg5amcCpu jamp_sv[111] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[114] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &_fp[0] ); jamp_sv[2] -= amp_sv[0]; jamp_sv[26] += amp_sv[0]; jamp_sv[48] += amp_sv[0]; @@ -25534,12 +25534,12 @@ namespace mg5amcCpu // *** DIAGRAM 1142 OF 1240 *** // Wavefunction(s) for diagram number 1142 - VVV1P0_1( w_fp[68], w_fp[6], COUPs[0], 0., 0., w_fp[23] ); - VVV1P0_1( w_fp[29], w_fp[6], COUPs[0], 0., 0., w_fp[60] ); - VVV1P0_1( w_fp[10], w_fp[6], COUPs[0], 0., 0., w_fp[20] ); + VVV1P0_1( w_fp[68], w_fp[6], 
COUPs[0], 1.0, 0., 0., w_fp[23] ); + VVV1P0_1( w_fp[29], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[60] ); + VVV1P0_1( w_fp[10], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[20] ); // Amplitude(s) for diagram number 1142 - VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &_fp[0] ); jamp_sv[3] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -25556,7 +25556,7 @@ namespace mg5amcCpu jamp_sv[100] += amp_sv[0]; jamp_sv[106] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[60], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[60], COUPs[0], 1.0, &_fp[0] ); jamp_sv[13] -= amp_sv[0]; jamp_sv[27] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -25573,7 +25573,7 @@ namespace mg5amcCpu jamp_sv[103] -= amp_sv[0]; jamp_sv[106] += amp_sv[0]; jamp_sv[114] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[20], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[20], COUPs[0], 1.0, &_fp[0] ); jamp_sv[3] -= amp_sv[0]; jamp_sv[27] += amp_sv[0]; jamp_sv[48] += amp_sv[0]; @@ -25597,7 +25597,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1143 - VVV1_0( w_fp[68], w_fp[8], w_fp[27], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[68], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); jamp_sv[2] += amp_sv[0]; jamp_sv[3] -= amp_sv[0]; jamp_sv[12] -= amp_sv[0]; @@ -25614,7 +25614,7 @@ namespace mg5amcCpu jamp_sv[109] -= amp_sv[0]; jamp_sv[111] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; - VVV1_0( w_fp[29], w_fp[8], w_fp[27], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[29], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); jamp_sv[12] -= amp_sv[0]; jamp_sv[13] += amp_sv[0]; jamp_sv[26] += amp_sv[0]; @@ -25631,7 +25631,7 @@ namespace mg5amcCpu jamp_sv[110] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; - VVV1_0( w_fp[10], w_fp[8], w_fp[27], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[10], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); jamp_sv[2] -= amp_sv[0]; jamp_sv[3] += amp_sv[0]; jamp_sv[26] += amp_sv[0]; @@ -25652,22 +25652,22 @@ namespace mg5amcCpu // *** DIAGRAM 1144 OF 1240 *** // Wavefunction(s) for diagram number 1144 - FFV1_2( w_fp[3], w_fp[68], COUPs[1], cIPD[0], cIPD[1], w_fp[59] ); - FFV1_2( w_fp[3], w_fp[29], COUPs[1], cIPD[0], cIPD[1], w_fp[111] ); - FFV1_2( w_fp[3], w_fp[10], COUPs[1], cIPD[0], cIPD[1], w_fp[98] ); + FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] ); + FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[111] ); + FFV1_2( w_fp[3], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); // Amplitude(s) for diagram number 1144 - FFV1_0( w_fp[59], w_fp[33], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[59], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); jamp_sv[66] += amp_sv[0]; jamp_sv[67] -= amp_sv[0]; jamp_sv[69] -= amp_sv[0]; jamp_sv[71] += amp_sv[0]; - FFV1_0( w_fp[111], w_fp[33], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[111], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); jamp_sv[67] -= amp_sv[0]; jamp_sv[68] += amp_sv[0]; jamp_sv[69] -= amp_sv[0]; jamp_sv[70] += amp_sv[0]; - FFV1_0( w_fp[98], w_fp[33], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[98], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); jamp_sv[66] -= amp_sv[0]; jamp_sv[68] += amp_sv[0]; jamp_sv[70] += amp_sv[0]; @@ -25679,7 +25679,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1145 - FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[50] += cxtype( 0, 1 ) 
* amp_sv[0]; jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25688,7 +25688,7 @@ namespace mg5amcCpu jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[60], COUPs[1], 1.0, &_fp[0] ); jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25697,7 +25697,7 @@ namespace mg5amcCpu jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[20], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[20], COUPs[1], 1.0, &_fp[0] ); jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -25713,17 +25713,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1146 - FFV1_0( w_fp[41], w_fp[33], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); jamp_sv[48] += amp_sv[0]; jamp_sv[50] -= amp_sv[0]; jamp_sv[56] -= amp_sv[0]; jamp_sv[62] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[33], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); jamp_sv[50] -= amp_sv[0]; jamp_sv[54] += amp_sv[0]; jamp_sv[56] -= amp_sv[0]; jamp_sv[60] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[33], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[33], w_fp[10], COUPs[1], 1.0, &_fp[0] ); jamp_sv[48] -= amp_sv[0]; jamp_sv[54] += amp_sv[0]; jamp_sv[60] += amp_sv[0]; @@ -25735,17 +25735,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1147 - FFV1_0( w_fp[59], w_fp[47], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[59], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); jamp_sv[108] += amp_sv[0]; jamp_sv[109] -= amp_sv[0]; jamp_sv[111] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; - FFV1_0( w_fp[111], w_fp[47], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[111], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); jamp_sv[109] -= amp_sv[0]; jamp_sv[110] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; - FFV1_0( w_fp[98], w_fp[47], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[98], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); jamp_sv[108] -= amp_sv[0]; jamp_sv[110] += amp_sv[0]; jamp_sv[112] += amp_sv[0]; @@ -25757,7 +25757,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1148 - FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &_fp[0] ); jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25766,7 +25766,7 @@ namespace mg5amcCpu jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] ); jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25775,7 +25775,7 @@ namespace mg5amcCpu jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[97] += cxtype( 0, 1 
) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -25791,17 +25791,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1149 - FFV1_0( w_fp[46], w_fp[47], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[47], w_fp[68], COUPs[1], 1.0, &_fp[0] ); jamp_sv[97] += amp_sv[0]; jamp_sv[100] -= amp_sv[0]; jamp_sv[106] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[47], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[47], w_fp[29], COUPs[1], 1.0, &_fp[0] ); jamp_sv[100] -= amp_sv[0]; jamp_sv[103] += amp_sv[0]; jamp_sv[106] -= amp_sv[0]; jamp_sv[114] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[47], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[47], w_fp[10], COUPs[1], 1.0, &_fp[0] ); jamp_sv[97] -= amp_sv[0]; jamp_sv[103] += amp_sv[0]; jamp_sv[114] += amp_sv[0]; @@ -25810,22 +25810,22 @@ namespace mg5amcCpu // *** DIAGRAM 1150 OF 1240 *** // Wavefunction(s) for diagram number 1150 - FFV1_1( w_fp[2], w_fp[68], COUPs[1], cIPD[0], cIPD[1], w_fp[17] ); - FFV1_1( w_fp[2], w_fp[29], COUPs[1], cIPD[0], cIPD[1], w_fp[68] ); - FFV1_1( w_fp[2], w_fp[10], COUPs[1], cIPD[0], cIPD[1], w_fp[29] ); + FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); + FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); + FFV1_1( w_fp[2], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] ); // Amplitude(s) for diagram number 1150 - FFV1_0( w_fp[46], w_fp[17], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); jamp_sv[3] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; jamp_sv[37] -= amp_sv[0]; jamp_sv[79] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[68], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[68], w_fp[6], COUPs[1], 1.0, &_fp[0] ); jamp_sv[13] -= amp_sv[0]; jamp_sv[27] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; jamp_sv[73] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[29], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[29], w_fp[6], COUPs[1], 1.0, &_fp[0] ); jamp_sv[3] -= amp_sv[0]; jamp_sv[27] += amp_sv[0]; jamp_sv[73] += amp_sv[0]; @@ -25837,7 +25837,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1151 - FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25846,7 +25846,7 @@ namespace mg5amcCpu jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25855,7 +25855,7 @@ namespace mg5amcCpu jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[20], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] ); jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -25871,17 +25871,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1152 - FFV1_0( w_fp[41], w_fp[17], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[17], w_fp[4], COUPs[1], 1.0, &_fp[0] ); 
jamp_sv[2] += amp_sv[0]; jamp_sv[12] -= amp_sv[0]; jamp_sv[36] -= amp_sv[0]; jamp_sv[78] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[68], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[68], w_fp[4], COUPs[1], 1.0, &_fp[0] ); jamp_sv[12] -= amp_sv[0]; jamp_sv[26] += amp_sv[0]; jamp_sv[36] -= amp_sv[0]; jamp_sv[72] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[29], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[29], w_fp[4], COUPs[1], 1.0, &_fp[0] ); jamp_sv[2] -= amp_sv[0]; jamp_sv[26] += amp_sv[0]; jamp_sv[72] += amp_sv[0]; @@ -25893,7 +25893,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1153 - FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25902,7 +25902,7 @@ namespace mg5amcCpu jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25911,7 +25911,7 @@ namespace mg5amcCpu jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -25927,7 +25927,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1154 - FFV1_0( w_fp[3], w_fp[17], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[17], w_fp[27], COUPs[1], 1.0, &_fp[0] ); jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25936,7 +25936,7 @@ namespace mg5amcCpu jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[68], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[68], w_fp[27], COUPs[1], 1.0, &_fp[0] ); jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -25945,7 +25945,7 @@ namespace mg5amcCpu jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[29], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[29], w_fp[27], COUPs[1], 1.0, &_fp[0] ); jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -25961,7 +25961,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1155 - FFV1_0( w_fp[59], w_fp[2], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[59], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25970,7 +25970,7 @@ namespace mg5amcCpu jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[111], w_fp[2], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[111], w_fp[2], 
w_fp[27], COUPs[1], 1.0, &_fp[0] ); jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; @@ -25979,7 +25979,7 @@ namespace mg5amcCpu jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[98], w_fp[2], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[98], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -25992,12 +25992,12 @@ namespace mg5amcCpu // *** DIAGRAM 1156 OF 1240 *** // Wavefunction(s) for diagram number 1156 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 0., 0., w_fp[98] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 0., 0., w_fp[27] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 0., 0., w_fp[111] ); + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[98] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[27] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] ); // Amplitude(s) for diagram number 1156 - VVVV1_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[4] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; jamp_sv[18] -= amp_sv[0]; @@ -26014,7 +26014,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[102] += amp_sv[0]; jamp_sv[103] -= amp_sv[0]; - VVVV3_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[4] += amp_sv[0]; jamp_sv[18] -= amp_sv[0]; jamp_sv[42] -= amp_sv[0]; @@ -26031,7 +26031,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[92] -= amp_sv[0]; jamp_sv[102] += amp_sv[0]; - VVVV4_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[5] += amp_sv[0]; jamp_sv[19] -= amp_sv[0]; jamp_sv[43] -= amp_sv[0]; @@ -26048,7 +26048,7 @@ namespace mg5amcCpu jamp_sv[82] += amp_sv[0]; jamp_sv[92] -= amp_sv[0]; jamp_sv[103] += amp_sv[0]; - VVVV1_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[18] -= amp_sv[0]; jamp_sv[19] += amp_sv[0]; jamp_sv[28] += amp_sv[0]; @@ -26065,7 +26065,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[96] += amp_sv[0]; jamp_sv[97] -= amp_sv[0]; - VVVV3_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[18] -= amp_sv[0]; jamp_sv[28] += amp_sv[0]; jamp_sv[42] -= amp_sv[0]; @@ -26082,7 +26082,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[96] += amp_sv[0]; - VVVV4_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[19] -= amp_sv[0]; jamp_sv[29] += amp_sv[0]; jamp_sv[43] -= amp_sv[0]; @@ -26099,7 +26099,7 @@ namespace mg5amcCpu jamp_sv[82] += amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[97] += amp_sv[0]; - VVVV1_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[4] -= amp_sv[0]; jamp_sv[5] += amp_sv[0]; jamp_sv[28] += amp_sv[0]; @@ -26116,7 +26116,7 @@ namespace mg5amcCpu jamp_sv[97] -= amp_sv[0]; jamp_sv[102] 
-= amp_sv[0]; jamp_sv[103] += amp_sv[0]; - VVVV3_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[4] -= amp_sv[0]; jamp_sv[28] += amp_sv[0]; jamp_sv[49] += amp_sv[0]; @@ -26133,7 +26133,7 @@ namespace mg5amcCpu jamp_sv[92] += amp_sv[0]; jamp_sv[96] += amp_sv[0]; jamp_sv[102] -= amp_sv[0]; - VVVV4_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[5] -= amp_sv[0]; jamp_sv[29] += amp_sv[0]; jamp_sv[49] += amp_sv[0]; @@ -26154,12 +26154,12 @@ namespace mg5amcCpu // *** DIAGRAM 1157 OF 1240 *** // Wavefunction(s) for diagram number 1157 - VVV1P0_1( w_fp[98], w_fp[4], COUPs[0], 0., 0., w_fp[59] ); - VVV1P0_1( w_fp[27], w_fp[4], COUPs[0], 0., 0., w_fp[29] ); - VVV1P0_1( w_fp[111], w_fp[4], COUPs[0], 0., 0., w_fp[68] ); + VVV1P0_1( w_fp[98], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[59] ); + VVV1P0_1( w_fp[27], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[29] ); + VVV1P0_1( w_fp[111], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[68] ); // Amplitude(s) for diagram number 1157 - VVV1_0( w_fp[8], w_fp[5], w_fp[59], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[59], COUPs[0], 1.0, &_fp[0] ); jamp_sv[4] += amp_sv[0]; jamp_sv[18] -= amp_sv[0]; jamp_sv[42] -= amp_sv[0]; @@ -26176,7 +26176,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[92] -= amp_sv[0]; jamp_sv[102] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[29], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[29], COUPs[0], 1.0, &_fp[0] ); jamp_sv[18] -= amp_sv[0]; jamp_sv[28] += amp_sv[0]; jamp_sv[42] -= amp_sv[0]; @@ -26193,7 +26193,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[96] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[68], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[68], COUPs[0], 1.0, &_fp[0] ); jamp_sv[4] -= amp_sv[0]; jamp_sv[28] += amp_sv[0]; jamp_sv[49] += amp_sv[0]; @@ -26214,12 +26214,12 @@ namespace mg5amcCpu // *** DIAGRAM 1158 OF 1240 *** // Wavefunction(s) for diagram number 1158 - VVV1P0_1( w_fp[98], w_fp[5], COUPs[0], 0., 0., w_fp[17] ); - VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 0., 0., w_fp[21] ); - VVV1P0_1( w_fp[111], w_fp[5], COUPs[0], 0., 0., w_fp[71] ); + VVV1P0_1( w_fp[98], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[17] ); + VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[21] ); + VVV1P0_1( w_fp[111], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[71] ); // Amplitude(s) for diagram number 1158 - VVV1_0( w_fp[8], w_fp[4], w_fp[17], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[17], COUPs[0], 1.0, &_fp[0] ); jamp_sv[5] += amp_sv[0]; jamp_sv[19] -= amp_sv[0]; jamp_sv[43] -= amp_sv[0]; @@ -26236,7 +26236,7 @@ namespace mg5amcCpu jamp_sv[82] += amp_sv[0]; jamp_sv[92] -= amp_sv[0]; jamp_sv[103] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &_fp[0] ); jamp_sv[19] -= amp_sv[0]; jamp_sv[29] += amp_sv[0]; jamp_sv[43] -= amp_sv[0]; @@ -26253,7 +26253,7 @@ namespace mg5amcCpu jamp_sv[82] += amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[97] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[71], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[71], COUPs[0], 1.0, &_fp[0] ); jamp_sv[5] -= amp_sv[0]; jamp_sv[29] += amp_sv[0]; jamp_sv[49] += amp_sv[0]; @@ -26277,7 +26277,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1159 - VVV1_0( w_fp[98], w_fp[8], w_fp[24], COUPs[0], &_fp[0] ); + VVV1_0( 
w_fp[98], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); jamp_sv[4] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; jamp_sv[18] -= amp_sv[0]; @@ -26294,7 +26294,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[102] += amp_sv[0]; jamp_sv[103] -= amp_sv[0]; - VVV1_0( w_fp[27], w_fp[8], w_fp[24], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[27], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); jamp_sv[18] -= amp_sv[0]; jamp_sv[19] += amp_sv[0]; jamp_sv[28] += amp_sv[0]; @@ -26311,7 +26311,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[96] += amp_sv[0]; jamp_sv[97] -= amp_sv[0]; - VVV1_0( w_fp[111], w_fp[8], w_fp[24], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[111], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); jamp_sv[4] -= amp_sv[0]; jamp_sv[5] += amp_sv[0]; jamp_sv[28] += amp_sv[0]; @@ -26332,22 +26332,22 @@ namespace mg5amcCpu // *** DIAGRAM 1160 OF 1240 *** // Wavefunction(s) for diagram number 1160 - FFV1_2( w_fp[3], w_fp[98], COUPs[1], cIPD[0], cIPD[1], w_fp[16] ); - FFV1_2( w_fp[3], w_fp[27], COUPs[1], cIPD[0], cIPD[1], w_fp[20] ); - FFV1_2( w_fp[3], w_fp[111], COUPs[1], cIPD[0], cIPD[1], w_fp[60] ); + FFV1_2( w_fp[3], w_fp[98], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] ); + FFV1_2( w_fp[3], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); // Amplitude(s) for diagram number 1160 - FFV1_0( w_fp[16], w_fp[33], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); jamp_sv[60] += amp_sv[0]; jamp_sv[61] -= amp_sv[0]; jamp_sv[63] -= amp_sv[0]; jamp_sv[65] += amp_sv[0]; - FFV1_0( w_fp[20], w_fp[33], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[20], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); jamp_sv[61] -= amp_sv[0]; jamp_sv[62] += amp_sv[0]; jamp_sv[63] -= amp_sv[0]; jamp_sv[64] += amp_sv[0]; - FFV1_0( w_fp[60], w_fp[33], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); jamp_sv[60] -= amp_sv[0]; jamp_sv[62] += amp_sv[0]; jamp_sv[64] += amp_sv[0]; @@ -26359,7 +26359,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1161 - FFV1_0( w_fp[3], w_fp[33], w_fp[17], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[17], COUPs[1], 1.0, &_fp[0] ); jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; @@ -26368,7 +26368,7 @@ namespace mg5amcCpu jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; @@ -26377,7 +26377,7 @@ namespace mg5amcCpu jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[71], COUPs[1], 1.0, &_fp[0] ); jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -26393,17 +26393,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1162 - FFV1_0( w_fp[38], w_fp[33], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[33], w_fp[98], COUPs[1], 1.0, &_fp[0] ); jamp_sv[49] += amp_sv[0]; jamp_sv[52] -= amp_sv[0]; jamp_sv[58] -= amp_sv[0]; jamp_sv[68] += 
amp_sv[0]; - FFV1_0( w_fp[38], w_fp[33], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[33], w_fp[27], COUPs[1], 1.0, &_fp[0] ); jamp_sv[52] -= amp_sv[0]; jamp_sv[55] += amp_sv[0]; jamp_sv[58] -= amp_sv[0]; jamp_sv[66] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[33], w_fp[111], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[33], w_fp[111], COUPs[1], 1.0, &_fp[0] ); jamp_sv[49] -= amp_sv[0]; jamp_sv[55] += amp_sv[0]; jamp_sv[66] += amp_sv[0]; @@ -26415,17 +26415,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1163 - FFV1_0( w_fp[16], w_fp[39], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); jamp_sv[84] += amp_sv[0]; jamp_sv[85] -= amp_sv[0]; jamp_sv[87] -= amp_sv[0]; jamp_sv[89] += amp_sv[0]; - FFV1_0( w_fp[20], w_fp[39], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[20], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); jamp_sv[85] -= amp_sv[0]; jamp_sv[86] += amp_sv[0]; jamp_sv[87] -= amp_sv[0]; jamp_sv[88] += amp_sv[0]; - FFV1_0( w_fp[60], w_fp[39], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); jamp_sv[84] -= amp_sv[0]; jamp_sv[86] += amp_sv[0]; jamp_sv[88] += amp_sv[0]; @@ -26437,7 +26437,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1164 - FFV1_0( w_fp[3], w_fp[39], w_fp[59], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[59], COUPs[1], 1.0, &_fp[0] ); jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; @@ -26446,7 +26446,7 @@ namespace mg5amcCpu jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &_fp[0] ); jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; @@ -26455,7 +26455,7 @@ namespace mg5amcCpu jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[68], COUPs[1], 1.0, &_fp[0] ); jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -26471,17 +26471,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1165 - FFV1_0( w_fp[46], w_fp[39], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[39], w_fp[98], COUPs[1], 1.0, &_fp[0] ); jamp_sv[73] += amp_sv[0]; jamp_sv[76] -= amp_sv[0]; jamp_sv[82] -= amp_sv[0]; jamp_sv[92] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[39], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); jamp_sv[76] -= amp_sv[0]; jamp_sv[79] += amp_sv[0]; jamp_sv[82] -= amp_sv[0]; jamp_sv[90] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[39], w_fp[111], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[39], w_fp[111], COUPs[1], 1.0, &_fp[0] ); jamp_sv[73] -= amp_sv[0]; jamp_sv[79] += amp_sv[0]; jamp_sv[90] += amp_sv[0]; @@ -26490,22 +26490,22 @@ namespace mg5amcCpu // *** DIAGRAM 1166 OF 1240 *** // Wavefunction(s) for diagram number 1166 - FFV1_1( w_fp[2], w_fp[98], COUPs[1], cIPD[0], cIPD[1], w_fp[23] ); - FFV1_1( w_fp[2], w_fp[27], COUPs[1], cIPD[0], cIPD[1], w_fp[98] ); - FFV1_1( w_fp[2], w_fp[111], COUPs[1], cIPD[0], cIPD[1], w_fp[27] ); + FFV1_1( w_fp[2], w_fp[98], COUPs[1], 1.0, cIPD[0], 
cIPD[1], w_fp[23] ); + FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); + FFV1_1( w_fp[2], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] ); // Amplitude(s) for diagram number 1166 - FFV1_0( w_fp[46], w_fp[23], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); jamp_sv[5] += amp_sv[0]; jamp_sv[19] -= amp_sv[0]; jamp_sv[43] -= amp_sv[0]; jamp_sv[103] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[98], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[98], w_fp[5], COUPs[1], 1.0, &_fp[0] ); jamp_sv[19] -= amp_sv[0]; jamp_sv[29] += amp_sv[0]; jamp_sv[43] -= amp_sv[0]; jamp_sv[97] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[27], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[27], w_fp[5], COUPs[1], 1.0, &_fp[0] ); jamp_sv[5] -= amp_sv[0]; jamp_sv[29] += amp_sv[0]; jamp_sv[97] += amp_sv[0]; @@ -26517,7 +26517,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1167 - FFV1_0( w_fp[46], w_fp[2], w_fp[17], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; @@ -26526,7 +26526,7 @@ namespace mg5amcCpu jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; @@ -26535,7 +26535,7 @@ namespace mg5amcCpu jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -26551,17 +26551,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1168 - FFV1_0( w_fp[38], w_fp[23], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] ); jamp_sv[4] += amp_sv[0]; jamp_sv[18] -= amp_sv[0]; jamp_sv[42] -= amp_sv[0]; jamp_sv[102] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[98], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[98], w_fp[4], COUPs[1], 1.0, &_fp[0] ); jamp_sv[18] -= amp_sv[0]; jamp_sv[28] += amp_sv[0]; jamp_sv[42] -= amp_sv[0]; jamp_sv[96] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[27], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[27], w_fp[4], COUPs[1], 1.0, &_fp[0] ); jamp_sv[4] -= amp_sv[0]; jamp_sv[28] += amp_sv[0]; jamp_sv[96] += amp_sv[0]; @@ -26573,7 +26573,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1169 - FFV1_0( w_fp[38], w_fp[2], w_fp[59], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; @@ -26582,7 +26582,7 @@ namespace mg5amcCpu jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] -= cxtype( 0, 1 ) * 
amp_sv[0]; jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; @@ -26591,7 +26591,7 @@ namespace mg5amcCpu jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -26607,7 +26607,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1170 - FFV1_0( w_fp[3], w_fp[23], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[23], w_fp[24], COUPs[1], 1.0, &_fp[0] ); jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; @@ -26616,7 +26616,7 @@ namespace mg5amcCpu jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[98], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[98], w_fp[24], COUPs[1], 1.0, &_fp[0] ); jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -26625,7 +26625,7 @@ namespace mg5amcCpu jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[27], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[27], w_fp[24], COUPs[1], 1.0, &_fp[0] ); jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -26641,7 +26641,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1171 - FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; @@ -26650,7 +26650,7 @@ namespace mg5amcCpu jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[20], w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[20], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; @@ -26659,7 +26659,7 @@ namespace mg5amcCpu jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[60], w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -26672,25 +26672,25 @@ namespace mg5amcCpu // *** DIAGRAM 1172 OF 1240 *** // Wavefunction(s) for diagram number 1172 - VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[60] ); - VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[24] ); - VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[20] ); - FFV1_2( w_fp[3], w_fp[60], COUPs[1], cIPD[0], cIPD[1], w_fp[16] ); - FFV1_2( w_fp[3], w_fp[24], COUPs[1], cIPD[0], cIPD[1], w_fp[27] ); - FFV1_2( w_fp[3], w_fp[20], COUPs[1], cIPD[0], cIPD[1], w_fp[98] ); + VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[60] ); + VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 
0., w_fp[24] ); + VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[20] ); + FFV1_2( w_fp[3], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] ); + FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); // Amplitude(s) for diagram number 1172 - FFV1_0( w_fp[16], w_fp[77], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); jamp_sv[42] += amp_sv[0]; jamp_sv[43] -= amp_sv[0]; jamp_sv[45] -= amp_sv[0]; jamp_sv[47] += amp_sv[0]; - FFV1_0( w_fp[27], w_fp[77], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[27], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); jamp_sv[43] -= amp_sv[0]; jamp_sv[44] += amp_sv[0]; jamp_sv[45] -= amp_sv[0]; jamp_sv[46] += amp_sv[0]; - FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); jamp_sv[42] -= amp_sv[0]; jamp_sv[44] += amp_sv[0]; jamp_sv[46] += amp_sv[0]; @@ -26699,12 +26699,12 @@ namespace mg5amcCpu // *** DIAGRAM 1173 OF 1240 *** // Wavefunction(s) for diagram number 1173 - VVV1P0_1( w_fp[60], w_fp[6], COUPs[0], 0., 0., w_fp[23] ); - VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 0., 0., w_fp[68] ); - VVV1P0_1( w_fp[20], w_fp[6], COUPs[0], 0., 0., w_fp[29] ); + VVV1P0_1( w_fp[60], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[23] ); + VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[68] ); + VVV1P0_1( w_fp[20], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] ); // Amplitude(s) for diagram number 1173 - FFV1_0( w_fp[3], w_fp[77], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[23], COUPs[1], 1.0, &_fp[0] ); jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; @@ -26713,7 +26713,7 @@ namespace mg5amcCpu jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[68], COUPs[1], 1.0, &_fp[0] ); jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; @@ -26722,7 +26722,7 @@ namespace mg5amcCpu jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -26738,17 +26738,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1174 - FFV1_0( w_fp[41], w_fp[77], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[77], w_fp[60], COUPs[1], 1.0, &_fp[0] ); jamp_sv[24] += amp_sv[0]; jamp_sv[26] -= amp_sv[0]; jamp_sv[32] -= amp_sv[0]; jamp_sv[38] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); jamp_sv[26] -= amp_sv[0]; jamp_sv[30] += amp_sv[0]; jamp_sv[32] -= amp_sv[0]; jamp_sv[36] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[77], w_fp[20], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[77], w_fp[20], COUPs[1], 1.0, &_fp[0] ); jamp_sv[24] -= amp_sv[0]; jamp_sv[30] += amp_sv[0]; jamp_sv[36] += amp_sv[0]; @@ -26757,22 +26757,22 @@ namespace mg5amcCpu // *** DIAGRAM 1175 OF 1240 *** // Wavefunction(s) for diagram number 1175 - 
FFV1_1( w_fp[2], w_fp[60], COUPs[1], cIPD[0], cIPD[1], w_fp[59] ); - FFV1_1( w_fp[2], w_fp[24], COUPs[1], cIPD[0], cIPD[1], w_fp[71] ); - FFV1_1( w_fp[2], w_fp[20], COUPs[1], cIPD[0], cIPD[1], w_fp[21] ); + FFV1_1( w_fp[2], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] ); + FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); + FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); // Amplitude(s) for diagram number 1175 - FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &_fp[0] ); jamp_sv[9] += amp_sv[0]; jamp_sv[15] -= amp_sv[0]; jamp_sv[61] -= amp_sv[0]; jamp_sv[85] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[71], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[71], w_fp[6], COUPs[1], 1.0, &_fp[0] ); jamp_sv[15] -= amp_sv[0]; jamp_sv[51] += amp_sv[0]; jamp_sv[61] -= amp_sv[0]; jamp_sv[75] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[21], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[21], w_fp[6], COUPs[1], 1.0, &_fp[0] ); jamp_sv[9] -= amp_sv[0]; jamp_sv[51] += amp_sv[0]; jamp_sv[75] += amp_sv[0]; @@ -26784,7 +26784,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1176 - FFV1_0( w_fp[52], w_fp[2], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; @@ -26793,7 +26793,7 @@ namespace mg5amcCpu jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; @@ -26802,7 +26802,7 @@ namespace mg5amcCpu jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -26818,17 +26818,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1177 - FFV1_0( w_fp[52], w_fp[47], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] ); jamp_sv[99] += amp_sv[0]; jamp_sv[101] -= amp_sv[0]; jamp_sv[112] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[47], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); jamp_sv[101] -= amp_sv[0]; jamp_sv[109] += amp_sv[0]; jamp_sv[112] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[47], w_fp[20], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[47], w_fp[20], COUPs[1], 1.0, &_fp[0] ); jamp_sv[99] -= amp_sv[0]; jamp_sv[109] += amp_sv[0]; jamp_sv[115] += amp_sv[0]; @@ -26840,7 +26840,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1178 - FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &_fp[0] ); jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; @@ -26849,7 +26849,7 @@ namespace mg5amcCpu jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[84] -= 
cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[71], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[71], w_fp[72], COUPs[1], 1.0, &_fp[0] ); jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -26858,7 +26858,7 @@ namespace mg5amcCpu jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[21], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[21], w_fp[72], COUPs[1], 1.0, &_fp[0] ); jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -26874,7 +26874,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1179 - FFV1_0( w_fp[16], w_fp[2], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; @@ -26883,7 +26883,7 @@ namespace mg5amcCpu jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[27], w_fp[2], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[27], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; @@ -26892,7 +26892,7 @@ namespace mg5amcCpu jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -26908,7 +26908,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1180 - VVV1_0( w_fp[60], w_fp[72], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[60], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] ); jamp_sv[8] -= amp_sv[0]; jamp_sv[9] += amp_sv[0]; jamp_sv[14] += amp_sv[0]; @@ -26925,7 +26925,7 @@ namespace mg5amcCpu jamp_sv[103] += amp_sv[0]; jamp_sv[105] += amp_sv[0]; jamp_sv[107] -= amp_sv[0]; - VVV1_0( w_fp[24], w_fp[72], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[24], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] ); jamp_sv[14] += amp_sv[0]; jamp_sv[15] -= amp_sv[0]; jamp_sv[43] -= amp_sv[0]; @@ -26942,7 +26942,7 @@ namespace mg5amcCpu jamp_sv[104] -= amp_sv[0]; jamp_sv[105] += amp_sv[0]; jamp_sv[106] -= amp_sv[0]; - VVV1_0( w_fp[20], w_fp[72], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[20], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] ); jamp_sv[8] += amp_sv[0]; jamp_sv[9] -= amp_sv[0]; jamp_sv[42] -= amp_sv[0]; @@ -26966,7 +26966,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1181 - VVVV1_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[8] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; jamp_sv[24] -= amp_sv[0]; @@ -26983,7 +26983,7 @@ namespace mg5amcCpu jamp_sv[107] += amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[118] -= amp_sv[0]; - VVVV3_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[8] += amp_sv[0]; jamp_sv[9] -= amp_sv[0]; jamp_sv[14] -= 
amp_sv[0]; @@ -27000,7 +27000,7 @@ namespace mg5amcCpu jamp_sv[103] -= amp_sv[0]; jamp_sv[105] -= amp_sv[0]; jamp_sv[107] += amp_sv[0]; - VVVV4_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[9] -= amp_sv[0]; jamp_sv[15] += amp_sv[0]; jamp_sv[24] += amp_sv[0]; @@ -27017,7 +27017,7 @@ namespace mg5amcCpu jamp_sv[101] -= amp_sv[0]; jamp_sv[112] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - VVVV1_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[14] -= amp_sv[0]; jamp_sv[26] += amp_sv[0]; jamp_sv[30] -= amp_sv[0]; @@ -27034,7 +27034,7 @@ namespace mg5amcCpu jamp_sv[109] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[115] -= amp_sv[0]; - VVVV3_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[14] -= amp_sv[0]; jamp_sv[15] += amp_sv[0]; jamp_sv[43] += amp_sv[0]; @@ -27051,7 +27051,7 @@ namespace mg5amcCpu jamp_sv[104] += amp_sv[0]; jamp_sv[105] -= amp_sv[0]; jamp_sv[106] += amp_sv[0]; - VVVV4_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[15] += amp_sv[0]; jamp_sv[26] -= amp_sv[0]; jamp_sv[30] += amp_sv[0]; @@ -27068,7 +27068,7 @@ namespace mg5amcCpu jamp_sv[109] += amp_sv[0]; jamp_sv[112] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVVV1_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[8] -= amp_sv[0]; jamp_sv[24] += amp_sv[0]; jamp_sv[30] -= amp_sv[0]; @@ -27085,7 +27085,7 @@ namespace mg5amcCpu jamp_sv[109] -= amp_sv[0]; jamp_sv[115] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - VVVV3_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[8] -= amp_sv[0]; jamp_sv[9] += amp_sv[0]; jamp_sv[42] += amp_sv[0]; @@ -27102,7 +27102,7 @@ namespace mg5amcCpu jamp_sv[104] += amp_sv[0]; jamp_sv[106] += amp_sv[0]; jamp_sv[107] -= amp_sv[0]; - VVVV4_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[9] += amp_sv[0]; jamp_sv[24] -= amp_sv[0]; jamp_sv[30] += amp_sv[0]; @@ -27123,12 +27123,12 @@ namespace mg5amcCpu // *** DIAGRAM 1182 OF 1240 *** // Wavefunction(s) for diagram number 1182 - VVV1P0_1( w_fp[60], w_fp[1], COUPs[0], 0., 0., w_fp[72] ); - VVV1P0_1( w_fp[24], w_fp[1], COUPs[0], 0., 0., w_fp[60] ); - VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 0., 0., w_fp[24] ); + VVV1P0_1( w_fp[60], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[72] ); + VVV1P0_1( w_fp[24], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[60] ); + VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[24] ); // Amplitude(s) for diagram number 1182 - VVV1_0( w_fp[8], w_fp[6], w_fp[72], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[72], COUPs[0], 1.0, &_fp[0] ); jamp_sv[8] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; jamp_sv[24] -= amp_sv[0]; @@ -27145,7 +27145,7 @@ namespace mg5amcCpu jamp_sv[107] += amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[118] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &_fp[0] ); jamp_sv[14] -= amp_sv[0]; jamp_sv[26] += amp_sv[0]; jamp_sv[30] -= amp_sv[0]; @@ -27162,7 +27162,7 @@ namespace mg5amcCpu jamp_sv[109] 
-= amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[115] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[24], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[24], COUPs[0], 1.0, &_fp[0] ); jamp_sv[8] -= amp_sv[0]; jamp_sv[24] += amp_sv[0]; jamp_sv[30] -= amp_sv[0]; @@ -27186,7 +27186,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1183 - VVV1_0( w_fp[1], w_fp[8], w_fp[23], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[23], COUPs[0], 1.0, &_fp[0] ); jamp_sv[9] -= amp_sv[0]; jamp_sv[15] += amp_sv[0]; jamp_sv[24] += amp_sv[0]; @@ -27203,7 +27203,7 @@ namespace mg5amcCpu jamp_sv[101] -= amp_sv[0]; jamp_sv[112] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[68], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[68], COUPs[0], 1.0, &_fp[0] ); jamp_sv[15] += amp_sv[0]; jamp_sv[26] -= amp_sv[0]; jamp_sv[30] += amp_sv[0]; @@ -27220,7 +27220,7 @@ namespace mg5amcCpu jamp_sv[109] += amp_sv[0]; jamp_sv[112] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[29], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] ); jamp_sv[9] += amp_sv[0]; jamp_sv[24] -= amp_sv[0]; jamp_sv[30] += amp_sv[0]; @@ -27244,7 +27244,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1184 - FFV1_0( w_fp[3], w_fp[47], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[72], COUPs[1], 1.0, &_fp[0] ); jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; @@ -27253,7 +27253,7 @@ namespace mg5amcCpu jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] ); jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; @@ -27262,7 +27262,7 @@ namespace mg5amcCpu jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; @@ -27278,17 +27278,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1185 - FFV1_0( w_fp[16], w_fp[47], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[102] += amp_sv[0]; jamp_sv[103] -= amp_sv[0]; jamp_sv[105] -= amp_sv[0]; jamp_sv[107] += amp_sv[0]; - FFV1_0( w_fp[27], w_fp[47], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[27], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[103] -= amp_sv[0]; jamp_sv[104] += amp_sv[0]; jamp_sv[105] -= amp_sv[0]; jamp_sv[106] += amp_sv[0]; - FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[102] -= amp_sv[0]; jamp_sv[104] += amp_sv[0]; jamp_sv[106] += amp_sv[0]; @@ -27300,7 +27300,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1186 - FFV1_0( w_fp[41], w_fp[2], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; @@ -27309,7 
+27309,7 @@ namespace mg5amcCpu jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; @@ -27318,7 +27318,7 @@ namespace mg5amcCpu jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; @@ -27334,17 +27334,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1187 - FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[8] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; jamp_sv[60] -= amp_sv[0]; jamp_sv[84] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[71], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[71], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[14] -= amp_sv[0]; jamp_sv[50] += amp_sv[0]; jamp_sv[60] -= amp_sv[0]; jamp_sv[74] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[21], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[21], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[8] -= amp_sv[0]; jamp_sv[50] += amp_sv[0]; jamp_sv[74] += amp_sv[0]; @@ -27353,25 +27353,25 @@ namespace mg5amcCpu // *** DIAGRAM 1188 OF 1240 *** // Wavefunction(s) for diagram number 1188 - VVVV1P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[21] ); - VVVV3P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[71] ); - VVVV4P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[59] ); - FFV1_2( w_fp[3], w_fp[21], COUPs[1], cIPD[0], cIPD[1], w_fp[24] ); - FFV1_2( w_fp[3], w_fp[71], COUPs[1], cIPD[0], cIPD[1], w_fp[60] ); - FFV1_2( w_fp[3], w_fp[59], COUPs[1], cIPD[0], cIPD[1], w_fp[72] ); + VVVV1P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[71] ); + VVVV4P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[59] ); + FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] ); + FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); + FFV1_2( w_fp[3], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] ); // Amplitude(s) for diagram number 1188 - FFV1_0( w_fp[24], w_fp[77], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[24], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); jamp_sv[36] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; jamp_sv[39] -= amp_sv[0]; jamp_sv[41] += amp_sv[0]; - FFV1_0( w_fp[60], w_fp[77], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); jamp_sv[37] -= amp_sv[0]; jamp_sv[38] += amp_sv[0]; jamp_sv[39] -= amp_sv[0]; jamp_sv[40] += amp_sv[0]; - FFV1_0( w_fp[72], w_fp[77], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[72], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); jamp_sv[36] -= amp_sv[0]; jamp_sv[38] += amp_sv[0]; jamp_sv[40] += amp_sv[0]; @@ -27380,12 +27380,12 @@ namespace mg5amcCpu // *** DIAGRAM 1189 OF 1240 *** // Wavefunction(s) for diagram number 1189 - VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 0., 0., w_fp[98] ); - VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 0., 0., w_fp[27] ); - VVV1P0_1( w_fp[59], 
w_fp[5], COUPs[0], 0., 0., w_fp[16] ); + VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[98] ); + VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[27] ); + VVV1P0_1( w_fp[59], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] ); // Amplitude(s) for diagram number 1189 - FFV1_0( w_fp[3], w_fp[77], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[98], COUPs[1], 1.0, &_fp[0] ); jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; @@ -27394,7 +27394,7 @@ namespace mg5amcCpu jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; @@ -27403,7 +27403,7 @@ namespace mg5amcCpu jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &_fp[0] ); jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -27419,17 +27419,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1190 - FFV1_0( w_fp[38], w_fp[77], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[77], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[25] += amp_sv[0]; jamp_sv[28] -= amp_sv[0]; jamp_sv[34] -= amp_sv[0]; jamp_sv[44] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[77], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[77], w_fp[71], COUPs[1], 1.0, &_fp[0] ); jamp_sv[28] -= amp_sv[0]; jamp_sv[31] += amp_sv[0]; jamp_sv[34] -= amp_sv[0]; jamp_sv[42] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[77], w_fp[59], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[77], w_fp[59], COUPs[1], 1.0, &_fp[0] ); jamp_sv[25] -= amp_sv[0]; jamp_sv[31] += amp_sv[0]; jamp_sv[42] += amp_sv[0]; @@ -27438,22 +27438,22 @@ namespace mg5amcCpu // *** DIAGRAM 1191 OF 1240 *** // Wavefunction(s) for diagram number 1191 - FFV1_1( w_fp[2], w_fp[21], COUPs[1], cIPD[0], cIPD[1], w_fp[29] ); - FFV1_1( w_fp[2], w_fp[71], COUPs[1], cIPD[0], cIPD[1], w_fp[68] ); - FFV1_1( w_fp[2], w_fp[59], COUPs[1], cIPD[0], cIPD[1], w_fp[23] ); + FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] ); + FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); + FFV1_1( w_fp[2], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); // Amplitude(s) for diagram number 1191 - FFV1_0( w_fp[52], w_fp[29], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[29], w_fp[5], COUPs[1], 1.0, &_fp[0] ); jamp_sv[11] += amp_sv[0]; jamp_sv[21] -= amp_sv[0]; jamp_sv[67] -= amp_sv[0]; jamp_sv[109] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[68], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[68], w_fp[5], COUPs[1], 1.0, &_fp[0] ); jamp_sv[21] -= amp_sv[0]; jamp_sv[53] += amp_sv[0]; jamp_sv[67] -= amp_sv[0]; jamp_sv[99] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[23], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); jamp_sv[11] -= amp_sv[0]; jamp_sv[53] += amp_sv[0]; jamp_sv[99] += amp_sv[0]; @@ -27465,7 +27465,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1192 - FFV1_0( w_fp[52], w_fp[2], w_fp[98], COUPs[1], &_fp[0] ); 
+ FFV1_0( w_fp[52], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; @@ -27474,7 +27474,7 @@ namespace mg5amcCpu jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; @@ -27483,7 +27483,7 @@ namespace mg5amcCpu jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -27499,17 +27499,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1193 - FFV1_0( w_fp[52], w_fp[39], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[75] += amp_sv[0]; jamp_sv[77] -= amp_sv[0]; jamp_sv[88] -= amp_sv[0]; jamp_sv[94] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[39], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); jamp_sv[77] -= amp_sv[0]; jamp_sv[85] += amp_sv[0]; jamp_sv[88] -= amp_sv[0]; jamp_sv[91] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[39], w_fp[59], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[39], w_fp[59], COUPs[1], 1.0, &_fp[0] ); jamp_sv[75] -= amp_sv[0]; jamp_sv[85] += amp_sv[0]; jamp_sv[91] += amp_sv[0]; @@ -27521,7 +27521,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1194 - FFV1_0( w_fp[3], w_fp[29], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[29], w_fp[66], COUPs[1], 1.0, &_fp[0] ); jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; @@ -27530,7 +27530,7 @@ namespace mg5amcCpu jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[68], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[68], w_fp[66], COUPs[1], 1.0, &_fp[0] ); jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -27539,7 +27539,7 @@ namespace mg5amcCpu jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[23], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[23], w_fp[66], COUPs[1], 1.0, &_fp[0] ); jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -27555,7 +27555,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1195 - FFV1_0( w_fp[24], w_fp[2], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[24], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; @@ -27564,7 +27564,7 @@ namespace mg5amcCpu jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( 
w_fp[60], w_fp[2], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; @@ -27573,7 +27573,7 @@ namespace mg5amcCpu jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[72], w_fp[2], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[72], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -27589,7 +27589,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1196 - VVV1_0( w_fp[21], w_fp[66], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[21], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); jamp_sv[10] -= amp_sv[0]; jamp_sv[11] += amp_sv[0]; jamp_sv[20] += amp_sv[0]; @@ -27606,7 +27606,7 @@ namespace mg5amcCpu jamp_sv[83] -= amp_sv[0]; jamp_sv[108] -= amp_sv[0]; jamp_sv[109] += amp_sv[0]; - VVV1_0( w_fp[71], w_fp[66], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[71], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); jamp_sv[20] += amp_sv[0]; jamp_sv[21] -= amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -27623,7 +27623,7 @@ namespace mg5amcCpu jamp_sv[82] -= amp_sv[0]; jamp_sv[98] -= amp_sv[0]; jamp_sv[99] += amp_sv[0]; - VVV1_0( w_fp[59], w_fp[66], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[59], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); jamp_sv[10] += amp_sv[0]; jamp_sv[11] -= amp_sv[0]; jamp_sv[36] -= amp_sv[0]; @@ -27647,7 +27647,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1197 - VVVV1_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[10] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; jamp_sv[25] -= amp_sv[0]; @@ -27664,7 +27664,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[94] -= amp_sv[0]; jamp_sv[108] += amp_sv[0]; - VVVV3_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[10] += amp_sv[0]; jamp_sv[11] -= amp_sv[0]; jamp_sv[20] -= amp_sv[0]; @@ -27681,7 +27681,7 @@ namespace mg5amcCpu jamp_sv[83] += amp_sv[0]; jamp_sv[108] += amp_sv[0]; jamp_sv[109] -= amp_sv[0]; - VVVV4_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[11] -= amp_sv[0]; jamp_sv[21] += amp_sv[0]; jamp_sv[25] += amp_sv[0]; @@ -27698,7 +27698,7 @@ namespace mg5amcCpu jamp_sv[88] -= amp_sv[0]; jamp_sv[94] += amp_sv[0]; jamp_sv[109] -= amp_sv[0]; - VVVV1_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[20] -= amp_sv[0]; jamp_sv[28] += amp_sv[0]; jamp_sv[31] -= amp_sv[0]; @@ -27715,7 +27715,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[91] -= amp_sv[0]; jamp_sv[98] += amp_sv[0]; - VVVV3_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[20] -= amp_sv[0]; jamp_sv[21] += amp_sv[0]; jamp_sv[37] += amp_sv[0]; @@ -27732,7 +27732,7 @@ namespace mg5amcCpu jamp_sv[82] += amp_sv[0]; jamp_sv[98] += amp_sv[0]; jamp_sv[99] -= amp_sv[0]; - VVVV4_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], 
COUPs[2], 1.0, &_fp[0] ); jamp_sv[21] += amp_sv[0]; jamp_sv[28] -= amp_sv[0]; jamp_sv[31] += amp_sv[0]; @@ -27749,7 +27749,7 @@ namespace mg5amcCpu jamp_sv[88] -= amp_sv[0]; jamp_sv[91] += amp_sv[0]; jamp_sv[99] -= amp_sv[0]; - VVVV1_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[10] -= amp_sv[0]; jamp_sv[25] += amp_sv[0]; jamp_sv[31] -= amp_sv[0]; @@ -27766,7 +27766,7 @@ namespace mg5amcCpu jamp_sv[94] += amp_sv[0]; jamp_sv[98] += amp_sv[0]; jamp_sv[108] -= amp_sv[0]; - VVVV3_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[10] -= amp_sv[0]; jamp_sv[11] += amp_sv[0]; jamp_sv[36] += amp_sv[0]; @@ -27783,7 +27783,7 @@ namespace mg5amcCpu jamp_sv[99] -= amp_sv[0]; jamp_sv[108] -= amp_sv[0]; jamp_sv[109] += amp_sv[0]; - VVVV4_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[11] += amp_sv[0]; jamp_sv[25] -= amp_sv[0]; jamp_sv[31] += amp_sv[0]; @@ -27804,12 +27804,12 @@ namespace mg5amcCpu // *** DIAGRAM 1198 OF 1240 *** // Wavefunction(s) for diagram number 1198 - VVV1P0_1( w_fp[21], w_fp[1], COUPs[0], 0., 0., w_fp[66] ); - VVV1P0_1( w_fp[71], w_fp[1], COUPs[0], 0., 0., w_fp[21] ); - VVV1P0_1( w_fp[59], w_fp[1], COUPs[0], 0., 0., w_fp[71] ); + VVV1P0_1( w_fp[21], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[66] ); + VVV1P0_1( w_fp[71], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] ); + VVV1P0_1( w_fp[59], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[71] ); // Amplitude(s) for diagram number 1198 - VVV1_0( w_fp[8], w_fp[5], w_fp[66], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[66], COUPs[0], 1.0, &_fp[0] ); jamp_sv[10] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; jamp_sv[25] -= amp_sv[0]; @@ -27826,7 +27826,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[94] -= amp_sv[0]; jamp_sv[108] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] ); jamp_sv[20] -= amp_sv[0]; jamp_sv[28] += amp_sv[0]; jamp_sv[31] -= amp_sv[0]; @@ -27843,7 +27843,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[91] -= amp_sv[0]; jamp_sv[98] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[71], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[71], COUPs[0], 1.0, &_fp[0] ); jamp_sv[10] -= amp_sv[0]; jamp_sv[25] += amp_sv[0]; jamp_sv[31] -= amp_sv[0]; @@ -27867,7 +27867,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1199 - VVV1_0( w_fp[1], w_fp[8], w_fp[98], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[98], COUPs[0], 1.0, &_fp[0] ); jamp_sv[11] -= amp_sv[0]; jamp_sv[21] += amp_sv[0]; jamp_sv[25] += amp_sv[0]; @@ -27884,7 +27884,7 @@ namespace mg5amcCpu jamp_sv[88] -= amp_sv[0]; jamp_sv[94] += amp_sv[0]; jamp_sv[109] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[27], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); jamp_sv[21] += amp_sv[0]; jamp_sv[28] -= amp_sv[0]; jamp_sv[31] += amp_sv[0]; @@ -27901,7 +27901,7 @@ namespace mg5amcCpu jamp_sv[88] -= amp_sv[0]; jamp_sv[91] += amp_sv[0]; jamp_sv[99] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[16], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[16], COUPs[0], 1.0, &_fp[0] ); jamp_sv[11] += amp_sv[0]; jamp_sv[25] -= amp_sv[0]; jamp_sv[31] += amp_sv[0]; @@ -27925,7 +27925,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram 
number 1200 - FFV1_0( w_fp[3], w_fp[39], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[66], COUPs[1], 1.0, &_fp[0] ); jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; @@ -27934,7 +27934,7 @@ namespace mg5amcCpu jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; @@ -27943,7 +27943,7 @@ namespace mg5amcCpu jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; @@ -27959,17 +27959,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1201 - FFV1_0( w_fp[24], w_fp[39], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[24], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[78] += amp_sv[0]; jamp_sv[79] -= amp_sv[0]; jamp_sv[81] -= amp_sv[0]; jamp_sv[83] += amp_sv[0]; - FFV1_0( w_fp[60], w_fp[39], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[79] -= amp_sv[0]; jamp_sv[80] += amp_sv[0]; jamp_sv[81] -= amp_sv[0]; jamp_sv[82] += amp_sv[0]; - FFV1_0( w_fp[72], w_fp[39], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[72], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[78] -= amp_sv[0]; jamp_sv[80] += amp_sv[0]; jamp_sv[82] += amp_sv[0]; @@ -27981,7 +27981,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1202 - FFV1_0( w_fp[38], w_fp[2], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; @@ -27990,7 +27990,7 @@ namespace mg5amcCpu jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; @@ -27999,7 +27999,7 @@ namespace mg5amcCpu jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; @@ -28015,17 +28015,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1203 - FFV1_0( w_fp[38], w_fp[29], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[29], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[10] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; jamp_sv[66] -= amp_sv[0]; jamp_sv[108] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[68], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[68], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[20] -= 
amp_sv[0]; jamp_sv[52] += amp_sv[0]; jamp_sv[66] -= amp_sv[0]; jamp_sv[98] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[23], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[23], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[10] -= amp_sv[0]; jamp_sv[52] += amp_sv[0]; jamp_sv[98] += amp_sv[0]; @@ -28034,25 +28034,25 @@ namespace mg5amcCpu // *** DIAGRAM 1204 OF 1240 *** // Wavefunction(s) for diagram number 1204 - VVVV1P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[23] ); - VVVV3P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[68] ); - VVVV4P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[29] ); - FFV1_2( w_fp[3], w_fp[23], COUPs[1], cIPD[0], cIPD[1], w_fp[71] ); - FFV1_2( w_fp[3], w_fp[68], COUPs[1], cIPD[0], cIPD[1], w_fp[21] ); - FFV1_2( w_fp[3], w_fp[29], COUPs[1], cIPD[0], cIPD[1], w_fp[66] ); + VVVV1P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] ); + VVVV3P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[68] ); + VVVV4P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[29] ); + FFV1_2( w_fp[3], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); + FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] ); // Amplitude(s) for diagram number 1204 - FFV1_0( w_fp[71], w_fp[77], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[71], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); jamp_sv[30] += amp_sv[0]; jamp_sv[31] -= amp_sv[0]; jamp_sv[33] -= amp_sv[0]; jamp_sv[35] += amp_sv[0]; - FFV1_0( w_fp[21], w_fp[77], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); jamp_sv[31] -= amp_sv[0]; jamp_sv[32] += amp_sv[0]; jamp_sv[33] -= amp_sv[0]; jamp_sv[34] += amp_sv[0]; - FFV1_0( w_fp[66], w_fp[77], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[66], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); jamp_sv[30] -= amp_sv[0]; jamp_sv[32] += amp_sv[0]; jamp_sv[34] += amp_sv[0]; @@ -28061,12 +28061,12 @@ namespace mg5amcCpu // *** DIAGRAM 1205 OF 1240 *** // Wavefunction(s) for diagram number 1205 - VVV1P0_1( w_fp[23], w_fp[4], COUPs[0], 0., 0., w_fp[72] ); - VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 0., 0., w_fp[60] ); - VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 0., 0., w_fp[24] ); + VVV1P0_1( w_fp[23], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[72] ); + VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[60] ); + VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[24] ); // Amplitude(s) for diagram number 1205 - FFV1_0( w_fp[3], w_fp[77], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[72], COUPs[1], 1.0, &_fp[0] ); jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; @@ -28075,7 +28075,7 @@ namespace mg5amcCpu jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[60], COUPs[1], 1.0, &_fp[0] ); jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; @@ -28084,7 +28084,7 @@ namespace mg5amcCpu jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] 
-= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; @@ -28100,17 +28100,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1206 - FFV1_0( w_fp[46], w_fp[77], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[77], w_fp[23], COUPs[1], 1.0, &_fp[0] ); jamp_sv[27] += amp_sv[0]; jamp_sv[29] -= amp_sv[0]; jamp_sv[40] -= amp_sv[0]; jamp_sv[46] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[77], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[77], w_fp[68], COUPs[1], 1.0, &_fp[0] ); jamp_sv[29] -= amp_sv[0]; jamp_sv[37] += amp_sv[0]; jamp_sv[40] -= amp_sv[0]; jamp_sv[43] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); jamp_sv[27] -= amp_sv[0]; jamp_sv[37] += amp_sv[0]; jamp_sv[43] += amp_sv[0]; @@ -28119,22 +28119,22 @@ namespace mg5amcCpu // *** DIAGRAM 1207 OF 1240 *** // Wavefunction(s) for diagram number 1207 - FFV1_1( w_fp[2], w_fp[23], COUPs[1], cIPD[0], cIPD[1], w_fp[77] ); - FFV1_1( w_fp[2], w_fp[68], COUPs[1], cIPD[0], cIPD[1], w_fp[16] ); - FFV1_1( w_fp[2], w_fp[29], COUPs[1], cIPD[0], cIPD[1], w_fp[27] ); + FFV1_1( w_fp[2], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] ); + FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] ); // Amplitude(s) for diagram number 1207 - FFV1_0( w_fp[52], w_fp[77], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); jamp_sv[17] += amp_sv[0]; jamp_sv[23] -= amp_sv[0]; jamp_sv[91] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[16], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); jamp_sv[23] -= amp_sv[0]; jamp_sv[77] += amp_sv[0]; jamp_sv[91] -= amp_sv[0]; jamp_sv[101] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[27], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[27], w_fp[4], COUPs[1], 1.0, &_fp[0] ); jamp_sv[17] -= amp_sv[0]; jamp_sv[77] += amp_sv[0]; jamp_sv[101] += amp_sv[0]; @@ -28146,7 +28146,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1208 - FFV1_0( w_fp[52], w_fp[2], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; @@ -28155,7 +28155,7 @@ namespace mg5amcCpu jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; @@ -28164,7 +28164,7 @@ namespace mg5amcCpu jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; @@ -28180,17 +28180,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1209 - FFV1_0( w_fp[52], w_fp[33], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); jamp_sv[51] += amp_sv[0]; 
jamp_sv[53] -= amp_sv[0]; jamp_sv[64] -= amp_sv[0]; jamp_sv[70] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[33], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); jamp_sv[53] -= amp_sv[0]; jamp_sv[61] += amp_sv[0]; jamp_sv[64] -= amp_sv[0]; jamp_sv[67] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[33], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); jamp_sv[51] -= amp_sv[0]; jamp_sv[61] += amp_sv[0]; jamp_sv[67] += amp_sv[0]; @@ -28202,7 +28202,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1210 - FFV1_0( w_fp[3], w_fp[77], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[61], COUPs[1], 1.0, &_fp[0] ); jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; @@ -28211,7 +28211,7 @@ namespace mg5amcCpu jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[16], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[16], w_fp[61], COUPs[1], 1.0, &_fp[0] ); jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -28220,7 +28220,7 @@ namespace mg5amcCpu jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[27], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[27], w_fp[61], COUPs[1], 1.0, &_fp[0] ); jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -28236,7 +28236,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1211 - FFV1_0( w_fp[71], w_fp[2], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[71], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; @@ -28245,7 +28245,7 @@ namespace mg5amcCpu jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[21], w_fp[2], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; @@ -28254,7 +28254,7 @@ namespace mg5amcCpu jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[66], w_fp[2], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[66], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -28270,7 +28270,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1212 - VVV1_0( w_fp[23], w_fp[61], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[23], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); jamp_sv[16] -= amp_sv[0]; jamp_sv[17] += amp_sv[0]; jamp_sv[22] += amp_sv[0]; @@ -28287,7 +28287,7 @@ namespace mg5amcCpu jamp_sv[91] -= amp_sv[0]; jamp_sv[114] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[68], w_fp[61], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[68], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); jamp_sv[22] += amp_sv[0]; jamp_sv[23] -= amp_sv[0]; jamp_sv[31] -= amp_sv[0]; @@ 
-28304,7 +28304,7 @@ namespace mg5amcCpu jamp_sv[91] -= amp_sv[0]; jamp_sv[100] -= amp_sv[0]; jamp_sv[101] += amp_sv[0]; - VVV1_0( w_fp[29], w_fp[61], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[29], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); jamp_sv[16] += amp_sv[0]; jamp_sv[17] -= amp_sv[0]; jamp_sv[30] -= amp_sv[0]; @@ -28328,7 +28328,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1213 - VVVV1_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[16] += amp_sv[0]; jamp_sv[22] -= amp_sv[0]; jamp_sv[27] -= amp_sv[0]; @@ -28345,7 +28345,7 @@ namespace mg5amcCpu jamp_sv[70] -= amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[114] += amp_sv[0]; - VVVV3_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[16] += amp_sv[0]; jamp_sv[17] -= amp_sv[0]; jamp_sv[22] -= amp_sv[0]; @@ -28362,7 +28362,7 @@ namespace mg5amcCpu jamp_sv[91] += amp_sv[0]; jamp_sv[114] += amp_sv[0]; jamp_sv[115] -= amp_sv[0]; - VVVV4_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[17] -= amp_sv[0]; jamp_sv[23] += amp_sv[0]; jamp_sv[27] += amp_sv[0]; @@ -28379,7 +28379,7 @@ namespace mg5amcCpu jamp_sv[70] += amp_sv[0]; jamp_sv[91] += amp_sv[0]; jamp_sv[115] -= amp_sv[0]; - VVVV1_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[22] -= amp_sv[0]; jamp_sv[29] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -28396,7 +28396,7 @@ namespace mg5amcCpu jamp_sv[76] += amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[100] += amp_sv[0]; - VVVV3_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[22] -= amp_sv[0]; jamp_sv[23] += amp_sv[0]; jamp_sv[31] += amp_sv[0]; @@ -28413,7 +28413,7 @@ namespace mg5amcCpu jamp_sv[91] += amp_sv[0]; jamp_sv[100] += amp_sv[0]; jamp_sv[101] -= amp_sv[0]; - VVVV4_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[23] += amp_sv[0]; jamp_sv[29] -= amp_sv[0]; jamp_sv[31] += amp_sv[0]; @@ -28430,7 +28430,7 @@ namespace mg5amcCpu jamp_sv[77] -= amp_sv[0]; jamp_sv[91] += amp_sv[0]; jamp_sv[101] -= amp_sv[0]; - VVVV1_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[16] -= amp_sv[0]; jamp_sv[27] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -28447,7 +28447,7 @@ namespace mg5amcCpu jamp_sv[76] += amp_sv[0]; jamp_sv[100] += amp_sv[0]; jamp_sv[114] -= amp_sv[0]; - VVVV3_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[16] -= amp_sv[0]; jamp_sv[17] += amp_sv[0]; jamp_sv[30] += amp_sv[0]; @@ -28464,7 +28464,7 @@ namespace mg5amcCpu jamp_sv[101] -= amp_sv[0]; jamp_sv[114] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVVV4_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[17] += amp_sv[0]; jamp_sv[27] -= amp_sv[0]; jamp_sv[30] += amp_sv[0]; @@ -28485,12 +28485,12 @@ namespace mg5amcCpu // *** DIAGRAM 1214 OF 1240 *** // Wavefunction(s) for diagram number 1214 - VVV1P0_1( w_fp[23], w_fp[1], 
COUPs[0], 0., 0., w_fp[61] ); - VVV1P0_1( w_fp[68], w_fp[1], COUPs[0], 0., 0., w_fp[23] ); - VVV1P0_1( w_fp[29], w_fp[1], COUPs[0], 0., 0., w_fp[68] ); + VVV1P0_1( w_fp[23], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[61] ); + VVV1P0_1( w_fp[68], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[23] ); + VVV1P0_1( w_fp[29], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[68] ); // Amplitude(s) for diagram number 1214 - VVV1_0( w_fp[8], w_fp[4], w_fp[61], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[61], COUPs[0], 1.0, &_fp[0] ); jamp_sv[16] += amp_sv[0]; jamp_sv[22] -= amp_sv[0]; jamp_sv[27] -= amp_sv[0]; @@ -28507,7 +28507,7 @@ namespace mg5amcCpu jamp_sv[70] -= amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[114] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &_fp[0] ); jamp_sv[22] -= amp_sv[0]; jamp_sv[29] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -28524,7 +28524,7 @@ namespace mg5amcCpu jamp_sv[76] += amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[100] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[68], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[68], COUPs[0], 1.0, &_fp[0] ); jamp_sv[16] -= amp_sv[0]; jamp_sv[27] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -28548,7 +28548,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1215 - VVV1_0( w_fp[1], w_fp[8], w_fp[72], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[72], COUPs[0], 1.0, &_fp[0] ); jamp_sv[17] -= amp_sv[0]; jamp_sv[23] += amp_sv[0]; jamp_sv[27] += amp_sv[0]; @@ -28565,7 +28565,7 @@ namespace mg5amcCpu jamp_sv[70] += amp_sv[0]; jamp_sv[91] += amp_sv[0]; jamp_sv[115] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[60], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[60], COUPs[0], 1.0, &_fp[0] ); jamp_sv[23] += amp_sv[0]; jamp_sv[29] -= amp_sv[0]; jamp_sv[31] += amp_sv[0]; @@ -28582,7 +28582,7 @@ namespace mg5amcCpu jamp_sv[77] -= amp_sv[0]; jamp_sv[91] += amp_sv[0]; jamp_sv[101] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[24], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); jamp_sv[17] += amp_sv[0]; jamp_sv[27] -= amp_sv[0]; jamp_sv[30] += amp_sv[0]; @@ -28606,7 +28606,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1216 - FFV1_0( w_fp[3], w_fp[33], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[61], COUPs[1], 1.0, &_fp[0] ); jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; @@ -28615,7 +28615,7 @@ namespace mg5amcCpu jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; @@ -28624,7 +28624,7 @@ namespace mg5amcCpu jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; @@ -28640,17 +28640,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1217 - FFV1_0( w_fp[71], w_fp[33], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( 
w_fp[71], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[54] += amp_sv[0]; jamp_sv[55] -= amp_sv[0]; jamp_sv[57] -= amp_sv[0]; jamp_sv[59] += amp_sv[0]; - FFV1_0( w_fp[21], w_fp[33], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[55] -= amp_sv[0]; jamp_sv[56] += amp_sv[0]; jamp_sv[57] -= amp_sv[0]; jamp_sv[58] += amp_sv[0]; - FFV1_0( w_fp[66], w_fp[33], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[66], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[54] -= amp_sv[0]; jamp_sv[56] += amp_sv[0]; jamp_sv[58] += amp_sv[0]; @@ -28662,7 +28662,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1218 - FFV1_0( w_fp[46], w_fp[2], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; @@ -28671,7 +28671,7 @@ namespace mg5amcCpu jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; @@ -28680,7 +28680,7 @@ namespace mg5amcCpu jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; @@ -28696,17 +28696,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1219 - FFV1_0( w_fp[46], w_fp[77], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[77], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[16] += amp_sv[0]; jamp_sv[22] -= amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[114] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[16], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[16], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[22] -= amp_sv[0]; jamp_sv[76] += amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[100] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[27], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[27], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[16] -= amp_sv[0]; jamp_sv[76] += amp_sv[0]; jamp_sv[100] += amp_sv[0]; @@ -28718,7 +28718,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1220 - VVVV1_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -28735,7 +28735,7 @@ namespace mg5amcCpu jamp_sv[107] -= amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -28752,7 +28752,7 @@ namespace mg5amcCpu jamp_sv[107] -= amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[18] -= amp_sv[0]; jamp_sv[19] += amp_sv[0]; jamp_sv[21] += 
amp_sv[0]; @@ -28769,7 +28769,7 @@ namespace mg5amcCpu jamp_sv[97] -= amp_sv[0]; jamp_sv[99] -= amp_sv[0]; jamp_sv[101] += amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[2] -= amp_sv[0]; jamp_sv[6] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -28786,7 +28786,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[117] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[2] -= amp_sv[0]; jamp_sv[6] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -28803,7 +28803,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[117] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[19] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; jamp_sv[21] += amp_sv[0]; @@ -28820,7 +28820,7 @@ namespace mg5amcCpu jamp_sv[98] += amp_sv[0]; jamp_sv[99] -= amp_sv[0]; jamp_sv[100] += amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] -= amp_sv[0]; jamp_sv[6] += amp_sv[0]; jamp_sv[12] += amp_sv[0]; @@ -28837,7 +28837,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[117] += amp_sv[0]; jamp_sv[119] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[0] -= amp_sv[0]; jamp_sv[6] += amp_sv[0]; jamp_sv[12] += amp_sv[0]; @@ -28854,7 +28854,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[117] += amp_sv[0]; jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); jamp_sv[18] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; jamp_sv[22] -= amp_sv[0]; @@ -28875,12 +28875,12 @@ namespace mg5amcCpu // *** DIAGRAM 1221 OF 1240 *** // Wavefunction(s) for diagram number 1221 - VVV1P0_1( w_fp[0], w_fp[73], COUPs[0], 0., 0., w_fp[27] ); - VVV1P0_1( w_fp[0], w_fp[79], COUPs[0], 0., 0., w_fp[1] ); - VVV1P0_1( w_fp[0], w_fp[80], COUPs[0], 0., 0., w_fp[16] ); + VVV1P0_1( w_fp[0], w_fp[73], COUPs[0], 1.0, 0., 0., w_fp[27] ); + VVV1P0_1( w_fp[0], w_fp[79], COUPs[0], 1.0, 0., 0., w_fp[1] ); + VVV1P0_1( w_fp[0], w_fp[80], COUPs[0], 1.0, 0., 0., w_fp[16] ); // Amplitude(s) for diagram number 1221 - VVV1_0( w_fp[8], w_fp[6], w_fp[27], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[27], COUPs[0], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -28897,7 +28897,7 @@ namespace mg5amcCpu jamp_sv[107] -= amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[1], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[1], COUPs[0], 1.0, &_fp[0] ); jamp_sv[2] -= amp_sv[0]; jamp_sv[6] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -28914,7 +28914,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[117] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &_fp[0] ); jamp_sv[0] -= amp_sv[0]; jamp_sv[6] += amp_sv[0]; jamp_sv[12] += amp_sv[0]; @@ -28938,7 +28938,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1222 - 
VVV1_0( w_fp[73], w_fp[6], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[73], w_fp[6], w_fp[56], COUPs[0], 1.0, &_fp[0] ); jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -28955,7 +28955,7 @@ namespace mg5amcCpu jamp_sv[107] -= amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[79], w_fp[6], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[79], w_fp[6], w_fp[56], COUPs[0], 1.0, &_fp[0] ); jamp_sv[2] -= amp_sv[0]; jamp_sv[6] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -28972,7 +28972,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[117] += amp_sv[0]; - VVV1_0( w_fp[80], w_fp[6], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[80], w_fp[6], w_fp[56], COUPs[0], 1.0, &_fp[0] ); jamp_sv[0] -= amp_sv[0]; jamp_sv[6] += amp_sv[0]; jamp_sv[12] += amp_sv[0]; @@ -28996,7 +28996,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1223 - FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &_fp[0] ); jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; @@ -29005,7 +29005,7 @@ namespace mg5amcCpu jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; @@ -29014,7 +29014,7 @@ namespace mg5amcCpu jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &_fp[0] ); jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -29030,17 +29030,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1224 - FFV1_0( w_fp[3], w_fp[113], w_fp[73], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[113], w_fp[73], COUPs[1], 1.0, &_fp[0] ); jamp_sv[96] += amp_sv[0]; jamp_sv[97] -= amp_sv[0]; jamp_sv[99] -= amp_sv[0]; jamp_sv[101] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[113], w_fp[79], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[113], w_fp[79], COUPs[1], 1.0, &_fp[0] ); jamp_sv[97] -= amp_sv[0]; jamp_sv[98] += amp_sv[0]; jamp_sv[99] -= amp_sv[0]; jamp_sv[100] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[113], w_fp[80], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[113], w_fp[80], COUPs[1], 1.0, &_fp[0] ); jamp_sv[96] -= amp_sv[0]; jamp_sv[98] += amp_sv[0]; jamp_sv[100] += amp_sv[0]; @@ -29052,7 +29052,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1225 - FFV1_0( w_fp[41], w_fp[2], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; @@ -29061,7 +29061,7 @@ namespace mg5amcCpu jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[1], COUPs[1], 1.0, &_fp[0] ); jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; 
jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; @@ -29070,7 +29070,7 @@ namespace mg5amcCpu jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -29086,17 +29086,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1226 - FFV1_0( w_fp[62], w_fp[2], w_fp[73], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[2], w_fp[73], COUPs[1], 1.0, &_fp[0] ); jamp_sv[32] += amp_sv[0]; jamp_sv[38] -= amp_sv[0]; jamp_sv[62] -= amp_sv[0]; jamp_sv[86] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[2], w_fp[79], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] ); jamp_sv[38] -= amp_sv[0]; jamp_sv[56] += amp_sv[0]; jamp_sv[62] -= amp_sv[0]; jamp_sv[80] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[2], w_fp[80], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] ); jamp_sv[32] -= amp_sv[0]; jamp_sv[56] += amp_sv[0]; jamp_sv[80] += amp_sv[0]; @@ -29108,7 +29108,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1227 - VVVV1_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; jamp_sv[10] -= amp_sv[0]; @@ -29125,7 +29125,7 @@ namespace mg5amcCpu jamp_sv[89] -= amp_sv[0]; jamp_sv[95] += amp_sv[0]; jamp_sv[110] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; jamp_sv[10] -= amp_sv[0]; @@ -29142,7 +29142,7 @@ namespace mg5amcCpu jamp_sv[89] -= amp_sv[0]; jamp_sv[95] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[12] -= amp_sv[0]; jamp_sv[13] += amp_sv[0]; jamp_sv[15] += amp_sv[0]; @@ -29159,7 +29159,7 @@ namespace mg5amcCpu jamp_sv[77] += amp_sv[0]; jamp_sv[110] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[4] -= amp_sv[0]; jamp_sv[7] += amp_sv[0]; jamp_sv[10] -= amp_sv[0]; @@ -29176,7 +29176,7 @@ namespace mg5amcCpu jamp_sv[89] -= amp_sv[0]; jamp_sv[93] += amp_sv[0]; jamp_sv[104] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[4] -= amp_sv[0]; jamp_sv[7] += amp_sv[0]; jamp_sv[10] -= amp_sv[0]; @@ -29193,7 +29193,7 @@ namespace mg5amcCpu jamp_sv[89] -= amp_sv[0]; jamp_sv[93] += amp_sv[0]; jamp_sv[105] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[13] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; jamp_sv[15] += amp_sv[0]; @@ -29210,7 +29210,7 @@ namespace mg5amcCpu jamp_sv[76] += amp_sv[0]; jamp_sv[104] += amp_sv[0]; jamp_sv[105] -= amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[1] -= amp_sv[0]; jamp_sv[7] 
+= amp_sv[0]; jamp_sv[18] += amp_sv[0]; @@ -29227,7 +29227,7 @@ namespace mg5amcCpu jamp_sv[95] -= amp_sv[0]; jamp_sv[104] -= amp_sv[0]; jamp_sv[110] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[1] -= amp_sv[0]; jamp_sv[7] += amp_sv[0]; jamp_sv[12] += amp_sv[0]; @@ -29244,7 +29244,7 @@ namespace mg5amcCpu jamp_sv[95] -= amp_sv[0]; jamp_sv[105] -= amp_sv[0]; jamp_sv[111] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); jamp_sv[12] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; jamp_sv[16] -= amp_sv[0]; @@ -29265,12 +29265,12 @@ namespace mg5amcCpu // *** DIAGRAM 1228 OF 1240 *** // Wavefunction(s) for diagram number 1228 - VVV1P0_1( w_fp[0], w_fp[57], COUPs[0], 0., 0., w_fp[62] ); - VVV1P0_1( w_fp[0], w_fp[81], COUPs[0], 0., 0., w_fp[80] ); - VVV1P0_1( w_fp[0], w_fp[82], COUPs[0], 0., 0., w_fp[79] ); + VVV1P0_1( w_fp[0], w_fp[57], COUPs[0], 1.0, 0., 0., w_fp[62] ); + VVV1P0_1( w_fp[0], w_fp[81], COUPs[0], 1.0, 0., 0., w_fp[80] ); + VVV1P0_1( w_fp[0], w_fp[82], COUPs[0], 1.0, 0., 0., w_fp[79] ); // Amplitude(s) for diagram number 1228 - VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; jamp_sv[10] -= amp_sv[0]; @@ -29287,7 +29287,7 @@ namespace mg5amcCpu jamp_sv[89] -= amp_sv[0]; jamp_sv[95] += amp_sv[0]; jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[80], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[80], COUPs[0], 1.0, &_fp[0] ); jamp_sv[4] -= amp_sv[0]; jamp_sv[7] += amp_sv[0]; jamp_sv[10] -= amp_sv[0]; @@ -29304,7 +29304,7 @@ namespace mg5amcCpu jamp_sv[89] -= amp_sv[0]; jamp_sv[93] += amp_sv[0]; jamp_sv[104] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[79], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[79], COUPs[0], 1.0, &_fp[0] ); jamp_sv[1] -= amp_sv[0]; jamp_sv[7] += amp_sv[0]; jamp_sv[18] += amp_sv[0]; @@ -29328,7 +29328,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1229 - VVV1_0( w_fp[57], w_fp[5], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[57], w_fp[5], w_fp[56], COUPs[0], 1.0, &_fp[0] ); jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; jamp_sv[10] -= amp_sv[0]; @@ -29345,7 +29345,7 @@ namespace mg5amcCpu jamp_sv[89] -= amp_sv[0]; jamp_sv[95] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; - VVV1_0( w_fp[81], w_fp[5], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[81], w_fp[5], w_fp[56], COUPs[0], 1.0, &_fp[0] ); jamp_sv[4] -= amp_sv[0]; jamp_sv[7] += amp_sv[0]; jamp_sv[10] -= amp_sv[0]; @@ -29362,7 +29362,7 @@ namespace mg5amcCpu jamp_sv[89] -= amp_sv[0]; jamp_sv[93] += amp_sv[0]; jamp_sv[105] -= amp_sv[0]; - VVV1_0( w_fp[82], w_fp[5], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[82], w_fp[5], w_fp[56], COUPs[0], 1.0, &_fp[0] ); jamp_sv[1] -= amp_sv[0]; jamp_sv[7] += amp_sv[0]; jamp_sv[12] += amp_sv[0]; @@ -29386,7 +29386,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1230 - FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], 1.0, &_fp[0] ); jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; @@ -29395,7 +29395,7 @@ namespace mg5amcCpu jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[95] += 
cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[80], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[80], COUPs[1], 1.0, &_fp[0] ); jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; @@ -29404,7 +29404,7 @@ namespace mg5amcCpu jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[79], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[79], COUPs[1], 1.0, &_fp[0] ); jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -29420,17 +29420,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1231 - FFV1_0( w_fp[3], w_fp[102], w_fp[57], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[102], w_fp[57], COUPs[1], 1.0, &_fp[0] ); jamp_sv[72] += amp_sv[0]; jamp_sv[73] -= amp_sv[0]; jamp_sv[75] -= amp_sv[0]; jamp_sv[77] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[102], w_fp[81], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[102], w_fp[81], COUPs[1], 1.0, &_fp[0] ); jamp_sv[73] -= amp_sv[0]; jamp_sv[74] += amp_sv[0]; jamp_sv[75] -= amp_sv[0]; jamp_sv[76] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[102], w_fp[82], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[102], w_fp[82], COUPs[1], 1.0, &_fp[0] ); jamp_sv[72] -= amp_sv[0]; jamp_sv[74] += amp_sv[0]; jamp_sv[76] += amp_sv[0]; @@ -29442,7 +29442,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1232 - FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; @@ -29451,7 +29451,7 @@ namespace mg5amcCpu jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[80], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] ); jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; @@ -29460,7 +29460,7 @@ namespace mg5amcCpu jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[79], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] ); jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -29476,17 +29476,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1233 - FFV1_0( w_fp[104], w_fp[2], w_fp[57], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[2], w_fp[57], COUPs[1], 1.0, &_fp[0] ); jamp_sv[34] += amp_sv[0]; jamp_sv[44] -= amp_sv[0]; jamp_sv[68] -= amp_sv[0]; jamp_sv[110] += amp_sv[0]; - FFV1_0( w_fp[104], w_fp[2], w_fp[81], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[2], w_fp[81], COUPs[1], 1.0, &_fp[0] ); jamp_sv[44] -= amp_sv[0]; jamp_sv[58] += amp_sv[0]; jamp_sv[68] -= amp_sv[0]; jamp_sv[104] += amp_sv[0]; - FFV1_0( w_fp[104], w_fp[2], w_fp[82], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[2], w_fp[82], COUPs[1], 1.0, &_fp[0] ); jamp_sv[34] -= amp_sv[0]; jamp_sv[58] += amp_sv[0]; jamp_sv[104] += amp_sv[0]; @@ -29498,7 +29498,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1234 - VVVV1_0( w_fp[0], 
w_fp[55], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; jamp_sv[16] -= amp_sv[0]; @@ -29515,7 +29515,7 @@ namespace mg5amcCpu jamp_sv[71] += amp_sv[0]; jamp_sv[92] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; jamp_sv[6] -= amp_sv[0]; @@ -29532,7 +29532,7 @@ namespace mg5amcCpu jamp_sv[71] += amp_sv[0]; jamp_sv[93] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[6] -= amp_sv[0]; jamp_sv[7] += amp_sv[0]; jamp_sv[9] += amp_sv[0]; @@ -29549,7 +29549,7 @@ namespace mg5amcCpu jamp_sv[93] += amp_sv[0]; jamp_sv[116] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[5] -= amp_sv[0]; jamp_sv[13] += amp_sv[0]; jamp_sv[16] -= amp_sv[0]; @@ -29566,7 +29566,7 @@ namespace mg5amcCpu jamp_sv[82] -= amp_sv[0]; jamp_sv[92] += amp_sv[0]; jamp_sv[106] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[5] -= amp_sv[0]; jamp_sv[7] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -29583,7 +29583,7 @@ namespace mg5amcCpu jamp_sv[83] -= amp_sv[0]; jamp_sv[93] += amp_sv[0]; jamp_sv[107] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[7] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; jamp_sv[9] += amp_sv[0]; @@ -29600,7 +29600,7 @@ namespace mg5amcCpu jamp_sv[93] += amp_sv[0]; jamp_sv[106] += amp_sv[0]; jamp_sv[107] -= amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[3] -= amp_sv[0]; jamp_sv[13] += amp_sv[0]; jamp_sv[19] += amp_sv[0]; @@ -29617,7 +29617,7 @@ namespace mg5amcCpu jamp_sv[82] -= amp_sv[0]; jamp_sv[106] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[3] -= amp_sv[0]; jamp_sv[6] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -29634,7 +29634,7 @@ namespace mg5amcCpu jamp_sv[83] -= amp_sv[0]; jamp_sv[107] -= amp_sv[0]; jamp_sv[117] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); jamp_sv[6] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; jamp_sv[10] -= amp_sv[0]; @@ -29655,12 +29655,12 @@ namespace mg5amcCpu // *** DIAGRAM 1235 OF 1240 *** // Wavefunction(s) for diagram number 1235 - VVV1P0_1( w_fp[0], w_fp[55], COUPs[0], 0., 0., w_fp[104] ); - VVV1P0_1( w_fp[0], w_fp[83], COUPs[0], 0., 0., w_fp[82] ); - VVV1P0_1( w_fp[0], w_fp[84], COUPs[0], 0., 0., w_fp[81] ); + VVV1P0_1( w_fp[0], w_fp[55], COUPs[0], 1.0, 0., 0., w_fp[104] ); + VVV1P0_1( w_fp[0], w_fp[83], COUPs[0], 1.0, 0., 0., w_fp[82] ); + VVV1P0_1( w_fp[0], w_fp[84], COUPs[0], 1.0, 0., 0., w_fp[81] ); // Amplitude(s) for diagram number 1235 - VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], &_fp[0] ); + VVV1_0( 
w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &_fp[0] ); jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; jamp_sv[16] -= amp_sv[0]; @@ -29677,7 +29677,7 @@ namespace mg5amcCpu jamp_sv[71] += amp_sv[0]; jamp_sv[92] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[82], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[82], COUPs[0], 1.0, &_fp[0] ); jamp_sv[5] -= amp_sv[0]; jamp_sv[13] += amp_sv[0]; jamp_sv[16] -= amp_sv[0]; @@ -29694,7 +29694,7 @@ namespace mg5amcCpu jamp_sv[82] -= amp_sv[0]; jamp_sv[92] += amp_sv[0]; jamp_sv[106] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[81], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[81], COUPs[0], 1.0, &_fp[0] ); jamp_sv[3] -= amp_sv[0]; jamp_sv[13] += amp_sv[0]; jamp_sv[19] += amp_sv[0]; @@ -29718,7 +29718,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1236 - VVV1_0( w_fp[55], w_fp[4], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[55], w_fp[4], w_fp[56], COUPs[0], 1.0, &_fp[0] ); jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; jamp_sv[6] -= amp_sv[0]; @@ -29735,7 +29735,7 @@ namespace mg5amcCpu jamp_sv[71] += amp_sv[0]; jamp_sv[93] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; - VVV1_0( w_fp[83], w_fp[4], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[83], w_fp[4], w_fp[56], COUPs[0], 1.0, &_fp[0] ); jamp_sv[5] -= amp_sv[0]; jamp_sv[7] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -29752,7 +29752,7 @@ namespace mg5amcCpu jamp_sv[83] -= amp_sv[0]; jamp_sv[93] += amp_sv[0]; jamp_sv[107] -= amp_sv[0]; - VVV1_0( w_fp[84], w_fp[4], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[84], w_fp[4], w_fp[56], COUPs[0], 1.0, &_fp[0] ); jamp_sv[3] -= amp_sv[0]; jamp_sv[6] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -29776,7 +29776,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1237 - FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; @@ -29785,7 +29785,7 @@ namespace mg5amcCpu jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[82], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[82], COUPs[1], 1.0, &_fp[0] ); jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; @@ -29794,7 +29794,7 @@ namespace mg5amcCpu jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[81], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[81], COUPs[1], 1.0, &_fp[0] ); jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -29810,17 +29810,17 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1238 - FFV1_0( w_fp[3], w_fp[114], w_fp[55], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[114], w_fp[55], COUPs[1], 1.0, &_fp[0] ); jamp_sv[48] += amp_sv[0]; jamp_sv[49] -= amp_sv[0]; jamp_sv[51] -= amp_sv[0]; jamp_sv[53] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[114], w_fp[83], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[114], w_fp[83], COUPs[1], 1.0, &_fp[0] ); jamp_sv[49] -= amp_sv[0]; jamp_sv[50] += amp_sv[0]; jamp_sv[51] -= amp_sv[0]; jamp_sv[52] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[114], w_fp[84], COUPs[1], &_fp[0] ); + FFV1_0( 
w_fp[3], w_fp[114], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
jamp_sv[48] -= amp_sv[0];
jamp_sv[50] += amp_sv[0];
jamp_sv[52] += amp_sv[0];
@@ -29832,7 +29832,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 1239
- FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
@@ -29841,7 +29841,7 @@ namespace mg5amcCpu
jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[82], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[46], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
@@ -29850,7 +29850,7 @@ namespace mg5amcCpu
jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[81], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[46], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -29866,17 +29866,17 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 1240
- FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
jamp_sv[40] += amp_sv[0];
jamp_sv[46] -= amp_sv[0];
jamp_sv[92] -= amp_sv[0];
jamp_sv[116] += amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
jamp_sv[46] -= amp_sv[0];
jamp_sv[82] += amp_sv[0];
jamp_sv[92] -= amp_sv[0];
jamp_sv[106] += amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], &amp_fp[0] );
+ FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
jamp_sv[40] -= amp_sv[0];
jamp_sv[82] += amp_sv[0];
jamp_sv[106] += amp_sv[0];
@@ -30480,12 +30480,12 @@ namespace mg5amcCpu
fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation)
- { /* clang-format on */
- fptype allMEsLast = 0;
+ { /* clang-format on */
const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
- allMEs[ievt] = 0;
for( int ihel = 0; ihel < ncomb; ihel++ )
{
+ // NEW IMPLEMENTATION OF GETGOODHEL (#630): RESET THE RUNNING SUM OVER HELICITIES TO 0 BEFORE ADDING A NEW HELICITY
+ allMEs[ievt] = 0;
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
#else
calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv );
#endif
- if( allMEs[ievt] != allMEsLast )
+ if( allMEs[ievt] != 0 ) // NEW IMPLEMENTATION OF GETGOODHEL (#630): COMPARE EACH HELICITY CONTRIBUTION TO 0
{
//if ( !isGoodHel[ihel] ) std::cout << "sigmaKin_getGoodHel ihel=" << ihel << " TRUE" << std::endl;
isGoodHel[ihel] = true;
}
- allMEsLast = allMEs[ievt]; // running sum up to helicity ihel for event ievt
}
}
#else
@@ -30518,19 +30517,11 @@ namespace mg5amcCpu
//assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
// Allocate arrays at build time to contain at least 16 events (or at least neppV events if neppV>16, e.g. in future VPUs)
constexpr int maxtry0 = std::max( 16, neppV ); // 16, but at least neppV (otherwise the npagV loop does not even start)
- fptype allMEsLast[maxtry0] = { 0 }; // allocated at build time: maxtry0 must be a constexpr
// Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV)
assert( nevt >= neppV );
const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
- // nprocesses>1 was last observed for "mirror processes" in uux_ttx in the 270 branch (see issue #343 and PRs #360 and #396)
+ // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754)
constexpr int nprocesses = 1;
- static_assert( nprocesses == 1, "Assume nprocesses == 1" );
- // process_id corresponds to the index of DSIG1 Fortran functions (must be 1 because cudacpp is unable to handle DSIG2)
+ static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" );
constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter
static_assert( process_id == 1, "Assume process_id == 1" );
}
@@ -30724,23 +30720,26 @@ namespace mg5amcCpu
}
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Event-by-event random choice of color #402
- const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0
- fptype targetamp[ncolor] = { 0 };
- for( int icolC = 0; icolC < ncolor; icolC++ )
+ if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
{
- if( icolC == 0 )
- targetamp[icolC] = 0;
- else
- targetamp[icolC] = targetamp[icolC - 1];
- if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
- }
- //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
- for( int icolC = 0; icolC < ncolor; icolC++ )
- {
- if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
+ const unsigned int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0
+ fptype targetamp[ncolor] = { 0 };
+ for( int icolC = 0; icolC < ncolor; icolC++ )
{
- allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
- break;
+ if( icolC == 0 )
+ targetamp[icolC] = 0;
+ else
+ targetamp[icolC] = targetamp[icolC - 1];
+ if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
+ }
+ //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
+ for( int icolC = 0; icolC < ncolor; icolC++ )
+ {
+ if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
+ {
+ allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+ break;
+ }
}
}
#endif
@@ -30835,57 +30834,60 @@ namespace mg5amcCpu
#endif
}
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
- const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0
// Event-by-event random choice of color #402
- fptype_sv targetamp[ncolor] = { 0 };
- for( int icolC = 0; icolC < ncolor; icolC++ )
+ if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
{
- if( icolC == 0 )
- targetamp[icolC] = fptype_sv{ 0 };
- else
- targetamp[icolC] = targetamp[icolC - 1];
- if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
- }
+ const unsigned int channelIdC = channelId - 1; // coloramps.h
uses the C array indexing starting at 0 + fptype_sv targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = fptype_sv{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + fptype_sv targetamp2[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp2[icolC] = fptype_sv{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + } +#endif + for( int ieppV = 0; ieppV < neppV; ++ieppV ) + { + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { #if defined MGONGPU_CPPSIMD - const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); #endif - if( okcol ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( okcol ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + const int ievt2 = ievt00 + ieppV + neppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + { + allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #endif + } } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index be0f0bc396..fff95b66e2 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 
3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f index 2d3c5725be..d2a61fa2ac 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f index e6d2fc3099..f22dfbf5e6 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -39,6 +39,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C LOCAL VARIABLES C INTEGER I,ITYPE,LP,IPROC + DOUBLE PRECISION QSCALE DOUBLE PRECISION G1 DOUBLE PRECISION G2 DOUBLE PRECISION XPQ(-7:7),PD(0:MAXPROC) @@ -126,11 +127,24 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(1))).GE.1) THEN !LP=SIGN(1,LPP(IB(1))) - G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)),DSQRT(Q2FACT(IB(1)))) + IF (DSQRT(Q2FACT(IB(1))).EQ.0D0) THEN + QSCALE=0D0 + DO I=3,NEXTERNAL + QSCALE=QSCALE+DSQRT(MAX(0D0,(PP(0,I)+PP(3,I))*(PP(0,I) + $ -PP(3,I)))) + ENDDO + QSCALE=QSCALE/2D0 + ELSE + QSCALE=DSQRT(Q2FACT(IB(1))) + ENDIF + G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN !LP=SIGN(1,LPP(IB(2))) - G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)),DSQRT(Q2FACT(IB(2)))) + IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN + QSCALE=DSQRT(Q2FACT(IB(2))) + ENDIF + G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF PD(0) = 0D0 IPROC = 0 @@ -202,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08
C By the MadGraph5_aMC@NLO Development Team
C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
C
@@ -249,6 +263,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT,
C
C LOCAL VARIABLES
C
+ DOUBLE PRECISION QSCALE
INTEGER I,ITYPE,LP,IPROC
DOUBLE PRECISION G1(VECSIZE_MEMMAX)
DOUBLE PRECISION G2(VECSIZE_MEMMAX)
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc
index 1bad694d1c..7cac5ab47b 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc
@@ -29,7 +29,9 @@
#include
#include
+#include <fenv.h> // for feenableexcept
#include
+#include <csignal> // for signal and SIGFPE
#include
#include
#include
@@ -74,6 +76,23 @@ usage( char* argv0, int ret = 1 )
return ret;
}
+#ifdef __CUDACC__
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  inline void FPEhandler( int sig )
+  {
+#ifdef __CUDACC__
+    std::cerr << "Floating Point Exception (GPU)" << std::endl;
+#else
+    std::cerr << "Floating Point Exception (CPU)" << std::endl;
+#endif
+    exit( 0 );
+  }
+}
+
int main( int argc, char** argv )
{
@@ -84,6 +103,18 @@ main( int argc, char** argv )
using namespace mg5amcCpu;
#endif
+  // Enable FPEs (test #701 and #733 - except on MacOS where feenableexcept is not defined #730)
+#ifndef __APPLE__
+  const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" );
+  const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" );
+  if( enableFPE )
+  {
+    std::cout << "WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions" << std::endl;
+    feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701
+    signal( SIGFPE, FPEhandler );
+  }
+#endif
+
// DEFAULTS FOR COMMAND LINE ARGUMENTS
bool verbose = false;
bool debug = false;
@@ -103,12 +134,14 @@
CurandHost = 1,
CurandDevice = 2
};
-#ifdef __CUDACC__
-  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU
-#elif not defined MGONGPU_HAS_NO_CURAND
-  RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
+#ifdef MGONGPU_HAS_NO_CURAND
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785)
+#elif defined __HIPCC__
+#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random
+#elif defined __CUDACC__
+  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand
#else
-  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU
+  RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
#endif
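// [Editor's note - a minimal sketch, not part of the upstream patch: the #ifdef cascade
//  above selects the default random-number generator from the build configuration. The
//  helper name pickDefaultRndGen is hypothetical; the branches restate the chain shown
//  in this diff.
//    RandomNumberMode pickDefaultRndGen()
//    {
//  #ifdef MGONGPU_HAS_NO_CURAND
//      return RandomNumberMode::CommonRandom; // only supported mode without curand (PR #784 and #785)
//  #elif defined __CUDACC__
//      return RandomNumberMode::CurandDevice; // NVidia GPU build with curand
//  #else
//      return RandomNumberMode::CurandHost;   // CPU build with curand
//  #endif
//    }
//  The __HIPCC__ case needs no runtime default: HIP builds must define
//  MGONGPU_HAS_NO_CURAND, which the #error above enforces at compile time.]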
// Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
enum class RamboSamplingMode
@@ -146,18 +179,20 @@
}
else if( arg == "--curdev" )
{
-#ifdef __CUDACC__
-      rndgen = RandomNumberMode::CurandDevice;
+#ifndef __CUDACC__
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
+#elif defined MGONGPU_HAS_NO_CURAND
+      throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
#else
-      throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" );
+      rndgen = RandomNumberMode::CurandDevice;
#endif
}
else if( arg == "--curhst" )
{
-#ifndef MGONGPU_HAS_NO_CURAND
-      rndgen = RandomNumberMode::CurandHost;
-#else
+#ifdef MGONGPU_HAS_NO_CURAND
throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" );
+#else
+      rndgen = RandomNumberMode::CurandHost;
#endif
}
else if( arg == "--common" )
@@ -278,10 +313,10 @@ main( int argc, char** argv )
const std::string procKey = "0a ProcInit";
timermap.start( procKey );
-  // Create a process object
+  // Create a process object, read param card and set parameters
+  // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
+  // FIXME: the CPPProcess should really be a singleton? (for instance, in bridge mode this will be called twice here?)
CPPProcess process( verbose );
-
-  // Read param_card and set parameters
process.initProc( "../../Cards/param_card.dat" );
const fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
//const fptype energy = 91.2; // Ecms = 91.2 GeV (Z peak)
@@ -389,30 +424,26 @@
{
prnk.reset( new CommonRandomNumberKernel( hstRndmom ) );
}
-#ifndef MGONGPU_HAS_NO_CURAND
else if( rndgen == RandomNumberMode::CurandHost )
{
+#ifdef MGONGPU_HAS_NO_CURAND
+    throw std::runtime_error( "INTERNAL ERROR! CurandHost is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement)
+#else
const bool onDevice = false;
prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) );
+#endif
}
-#ifdef __CUDACC__
else
{
+#ifdef MGONGPU_HAS_NO_CURAND
+    throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement)
+#elif defined __CUDACC__
const bool onDevice = true;
prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) );
-  }
#else
-  else
-  {
-    throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement)
-  }
+    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
#endif
-#else
-  else
-  {
-    throw std::logic_error( "This application was built without Curand support" ); // INTERNAL ERROR (no path to this statement)
}
-#endif
// --- 0c. Create rambo sampling kernel [keep this in 0c for the moment]
std::unique_ptr<SamplingKernelBase> prsk;
@@ -747,7 +778,7 @@
wrkflwtxt += "HIP:";
#else
wrkflwtxt += "CPP:";
-#endif
+#endif /* clang-format off */
// -- DOUBLE or FLOAT?
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537)
@@ -757,7 +788,7 @@
wrkflwtxt += "FLT+";
#else
wrkflwtxt += "???+"; // no path to this statement
-#endif
+#endif /* clang-format on */
// -- CUCOMPLEX or THRUST or STD complex numbers?
#ifdef __CUDACC__
#if defined MGONGPU_CUCXTYPE_CUCOMPLEX
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/counters.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/counters.cc
index 71fa817036..3bbdec9387 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/counters.cc
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/counters.cc
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#include "timer.h"
#define TIMERTYPE std::chrono::high_resolution_clock
@@ -36,13 +36,10 @@ extern "C"
static mgOnGpu::Timer<TIMERTYPE> program_timer;
static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> matrix1_timer;
-  static float matrix1_totaltime = 0;
static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
static float smatrix1_totaltime = 0;
static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int matrix1_counter = 0;
static int smatrix1_counter = 0;
static int smatrix1multi_counter[nimplC] = { 0 };
@@ -52,19 +49,6 @@ extern "C"
return;
}
-  void counters_matrix1_start_()
-  {
-    matrix1_counter++;
-    matrix1_timer.Start();
-    return;
-  }
-
-  void counters_matrix1_stop_()
-  {
-    matrix1_totaltime += matrix1_timer.GetDuration();
-    return;
-  }
-
void counters_smatrix1_start_()
{
smatrix1_counter++;
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f
index 7da1a11e92..41dbc97183 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f
@@ -1,7 +1,7 @@
SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
$ ICOL)
C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
C By the MadGraph5_aMC@NLO Development Team
C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
C
@@ -413,7 +413,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
C By the MadGraph5_aMC@NLO Development Team
C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
C
@@ -455,7 +455,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
C LOCAL VARIABLES
C
INTEGER I,J,M,N
- COMPLEX*16 ZTEMP, TMP_JAMP(2768)
+ COMPLEX*16 ZTEMP, TMP_JAMP(3030)
REAL*8 CF(NCOLOR,NCOLOR)
COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO)
COMPLEX*16 W(6,NWAVEFUNCS)
@@ -9862,7 +9862,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
C ----------
C BEGIN CODE
C ----------
- call counters_matrix1_start()
IF (FIRST) THEN
FIRST=.FALSE.
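C     [Editor's note - a minimal sketch, not part of the upstream patch: the
C      regenerated block below factors amplitude combinations that recur across
C      color flows into TMP_JAMP temporaries, each computed once and reused as
C      often as the "used N times" tags indicate; this diff renumbers them and
C      resizes the array from TMP_JAMP(2768) to TMP_JAMP(3030). The pattern,
C      with hypothetical JAMP indices for illustration:
C        TMP_JAMP(1) = AMP(12) + AMP(14)       ! computed once
C        JAMP(3,1) = JAMP(3,1) + TMP_JAMP(1)   ! then reused in several flows
C        JAMP(8,1) = JAMP(8,1) - TMP_JAMP(1)
C      ]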
IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO @@ -13559,5081 +13558,5242 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(:,:) = (0D0,0D0) C JAMPs contributing to orders ALL_ORDERS=1 - TMP_JAMP(210) = AMP(1370) - AMP(1382) ! used 16 times - TMP_JAMP(209) = AMP(1358) - AMP(1361) ! used 16 times - TMP_JAMP(208) = AMP(1329) + AMP(1811) ! used 16 times - TMP_JAMP(207) = AMP(1327) - AMP(1810) ! used 16 times - TMP_JAMP(206) = AMP(1322) + AMP(1388) ! used 16 times - TMP_JAMP(205) = AMP(1320) + AMP(1324) ! used 16 times - TMP_JAMP(204) = AMP(1310) - AMP(1313) ! used 16 times - TMP_JAMP(203) = AMP(1309) + AMP(1311) ! used 16 times - TMP_JAMP(202) = AMP(1103) + AMP(1359) ! used 16 times - TMP_JAMP(201) = AMP(1102) + AMP(1486) ! used 16 times - TMP_JAMP(200) = AMP(1098) - AMP(1817) ! used 16 times - TMP_JAMP(199) = AMP(1096) + AMP(1816) ! used 16 times - TMP_JAMP(198) = AMP(1089) - AMP(1487) ! used 16 times - TMP_JAMP(197) = AMP(1085) + AMP(1087) ! used 16 times - TMP_JAMP(196) = AMP(1084) - AMP(1100) ! used 16 times - TMP_JAMP(195) = AMP(1080) + AMP(1082) ! used 16 times - TMP_JAMP(194) = AMP(1079) - AMP(1099) ! used 16 times - TMP_JAMP(193) = AMP(1076) + AMP(1078) ! used 16 times - TMP_JAMP(192) = AMP(45) + AMP(1323) ! used 16 times - TMP_JAMP(191) = AMP(43) - AMP(1371) ! used 16 times - TMP_JAMP(190) = AMP(41) - AMP(1492) ! used 16 times - TMP_JAMP(189) = AMP(37) + AMP(40) ! used 16 times - TMP_JAMP(188) = AMP(29) - AMP(1326) ! used 16 times - TMP_JAMP(187) = AMP(25) + AMP(28) ! used 16 times - TMP_JAMP(186) = AMP(18) + AMP(1493) ! used 16 times - TMP_JAMP(185) = AMP(16) + AMP(26) ! used 16 times - TMP_JAMP(184) = AMP(5) - AMP(38) ! used 16 times - TMP_JAMP(183) = AMP(2) + AMP(4) ! used 16 times - TMP_JAMP(182) = AMP(1378) + AMP(1383) ! used 16 times - TMP_JAMP(181) = AMP(1367) + AMP(1369) ! used 16 times - TMP_JAMP(180) = AMP(1332) - AMP(1677) ! used 16 times - TMP_JAMP(179) = AMP(1330) - AMP(1389) ! used 16 times - TMP_JAMP(178) = AMP(1325) - AMP(1675) ! used 16 times - TMP_JAMP(177) = AMP(1319) + AMP(1321) ! used 16 times - TMP_JAMP(176) = AMP(1315) - AMP(1318) ! used 16 times - TMP_JAMP(175) = AMP(1314) + AMP(1316) ! used 16 times - TMP_JAMP(174) = AMP(579) - AMP(1380) ! used 16 times - TMP_JAMP(173) = AMP(577) - AMP(1489) ! used 16 times - TMP_JAMP(172) = AMP(573) + AMP(1683) ! used 16 times - TMP_JAMP(171) = AMP(571) + AMP(576) ! used 16 times - TMP_JAMP(170) = AMP(567) + AMP(1490) ! used 16 times - TMP_JAMP(169) = AMP(566) + AMP(1681) ! used 16 times - TMP_JAMP(168) = AMP(556) - AMP(559) ! used 16 times - TMP_JAMP(167) = AMP(555) + AMP(557) ! used 16 times - TMP_JAMP(166) = AMP(554) - AMP(574) ! used 16 times - TMP_JAMP(165) = AMP(551) + AMP(553) ! used 16 times - TMP_JAMP(164) = AMP(1377) + AMP(1865) ! used 16 times - TMP_JAMP(163) = AMP(1357) + AMP(1360) ! used 16 times - TMP_JAMP(162) = AMP(1176) - AMP(1814) ! used 16 times - TMP_JAMP(161) = AMP(1174) + AMP(1813) ! used 16 times - TMP_JAMP(160) = AMP(1173) - AMP(1871) ! used 16 times - TMP_JAMP(159) = AMP(1163) + AMP(1165) ! used 16 times - TMP_JAMP(158) = AMP(1159) + AMP(1161) ! used 16 times - TMP_JAMP(157) = AMP(1158) + AMP(1160) ! used 16 times - TMP_JAMP(156) = AMP(636) - AMP(1172) ! used 16 times - TMP_JAMP(155) = AMP(634) + AMP(1376) ! used 16 times - TMP_JAMP(154) = AMP(575) + AMP(578) ! used 16 times - TMP_JAMP(153) = AMP(565) - AMP(1166) ! used 16 times - TMP_JAMP(152) = AMP(521) + AMP(524) ! used 16 times - TMP_JAMP(151) = AMP(520) + AMP(523) ! used 16 times - TMP_JAMP(150) = AMP(1375) - AMP(1864) ! 
used 16 times - TMP_JAMP(149) = AMP(1368) + AMP(1372) ! used 16 times - TMP_JAMP(148) = AMP(1171) + AMP(1870) ! used 16 times - TMP_JAMP(147) = AMP(1170) - AMP(1542) ! used 16 times - TMP_JAMP(146) = AMP(1167) - AMP(1541) ! used 16 times - TMP_JAMP(145) = AMP(1164) + AMP(1168) ! used 16 times - TMP_JAMP(144) = AMP(1154) + AMP(1156) ! used 16 times - TMP_JAMP(143) = AMP(1153) + AMP(1155) ! used 16 times - TMP_JAMP(142) = AMP(42) - AMP(1374) ! used 16 times - TMP_JAMP(141) = AMP(24) + AMP(1548) ! used 16 times - TMP_JAMP(140) = AMP(22) + AMP(39) ! used 16 times - TMP_JAMP(139) = AMP(17) + AMP(1547) ! used 16 times - TMP_JAMP(138) = AMP(7) + AMP(9) ! used 16 times - TMP_JAMP(137) = AMP(6) + AMP(8) ! used 16 times - TMP_JAMP(136) = AMP(1379) - AMP(1785) ! used 16 times - TMP_JAMP(135) = AMP(1365) + AMP(1780) ! used 16 times - TMP_JAMP(134) = AMP(1362) + AMP(1364) ! used 16 times - TMP_JAMP(133) = AMP(1101) + AMP(1104) ! used 16 times - TMP_JAMP(132) = AMP(732) + AMP(1680) ! used 16 times - TMP_JAMP(131) = AMP(731) + AMP(1791) ! used 16 times - TMP_JAMP(130) = AMP(725) + AMP(1678) ! used 16 times - TMP_JAMP(129) = AMP(724) - AMP(1088) ! used 16 times - TMP_JAMP(128) = AMP(722) - AMP(1782) ! used 16 times - TMP_JAMP(127) = AMP(719) + AMP(721) ! used 16 times - TMP_JAMP(126) = AMP(715) - AMP(718) ! used 16 times - TMP_JAMP(125) = AMP(714) + AMP(716) ! used 16 times - TMP_JAMP(124) = AMP(681) + AMP(684) ! used 16 times - TMP_JAMP(123) = AMP(679) + AMP(682) ! used 16 times - TMP_JAMP(122) = AMP(1373) - AMP(1783) ! used 16 times - TMP_JAMP(121) = AMP(1363) - AMP(1366) ! used 16 times - TMP_JAMP(120) = AMP(729) + AMP(1545) ! used 16 times - TMP_JAMP(119) = AMP(728) + AMP(1789) ! used 16 times - TMP_JAMP(118) = AMP(726) + AMP(1544) ! used 16 times - TMP_JAMP(117) = AMP(720) - AMP(723) ! used 16 times - TMP_JAMP(116) = AMP(710) + AMP(712) ! used 16 times - TMP_JAMP(115) = AMP(709) + AMP(711) ! used 16 times - TMP_JAMP(114) = AMP(1346) + AMP(1385) ! used 16 times - TMP_JAMP(113) = AMP(1334) - AMP(1337) ! used 16 times - TMP_JAMP(112) = AMP(1181) + AMP(1336) ! used 16 times - TMP_JAMP(111) = AMP(1180) + AMP(1540) ! used 16 times - TMP_JAMP(110) = AMP(1162) - AMP(1178) ! used 16 times - TMP_JAMP(109) = AMP(1157) - AMP(1177) ! used 16 times - TMP_JAMP(108) = AMP(44) + AMP(1347) ! used 16 times - TMP_JAMP(107) = AMP(36) - AMP(1546) ! used 16 times - TMP_JAMP(106) = AMP(31) + AMP(34) ! used 16 times - TMP_JAMP(105) = AMP(10) - AMP(33) ! used 16 times - TMP_JAMP(104) = AMP(1354) - AMP(1386) ! used 16 times - TMP_JAMP(103) = AMP(1343) + AMP(1345) ! used 16 times - TMP_JAMP(102) = AMP(738) - AMP(1355) ! used 16 times - TMP_JAMP(101) = AMP(736) - AMP(1543) ! used 16 times - TMP_JAMP(100) = AMP(730) + AMP(735) ! used 16 times - TMP_JAMP(99) = AMP(713) - AMP(733) ! used 16 times - TMP_JAMP(98) = AMP(1353) + AMP(1838) ! used 16 times - TMP_JAMP(97) = AMP(1333) + AMP(1335) ! used 16 times - TMP_JAMP(96) = AMP(1095) - AMP(1844) ! used 16 times - TMP_JAMP(95) = AMP(1081) + AMP(1083) ! used 16 times - TMP_JAMP(94) = AMP(795) - AMP(1094) ! used 16 times - TMP_JAMP(93) = AMP(793) + AMP(1352) ! used 16 times - TMP_JAMP(92) = AMP(734) + AMP(737) ! used 16 times - TMP_JAMP(91) = AMP(680) + AMP(683) ! used 16 times - TMP_JAMP(90) = AMP(1351) - AMP(1837) ! used 16 times - TMP_JAMP(89) = AMP(1344) + AMP(1348) ! used 16 times - TMP_JAMP(88) = AMP(1093) + AMP(1843) ! used 16 times - TMP_JAMP(87) = AMP(1092) - AMP(1488) ! used 16 times - TMP_JAMP(86) = AMP(1086) + AMP(1090) ! used 16 times - TMP_JAMP(85) = AMP(1075) + AMP(1077) ! 
used 16 times - TMP_JAMP(84) = AMP(35) - AMP(1350) ! used 16 times - TMP_JAMP(83) = AMP(21) + AMP(1494) ! used 16 times - TMP_JAMP(82) = AMP(19) + AMP(32) ! used 16 times - TMP_JAMP(81) = AMP(1) + AMP(3) ! used 16 times - TMP_JAMP(80) = AMP(1356) - AMP(1731) ! used 16 times - TMP_JAMP(79) = AMP(1341) + AMP(1726) ! used 16 times - TMP_JAMP(78) = AMP(1338) + AMP(1340) ! used 16 times - TMP_JAMP(77) = AMP(1179) + AMP(1182) ! used 16 times - TMP_JAMP(76) = AMP(572) + AMP(1737) ! used 16 times - TMP_JAMP(75) = AMP(563) - AMP(1728) ! used 16 times - TMP_JAMP(74) = AMP(560) + AMP(562) ! used 16 times - TMP_JAMP(73) = AMP(522) + AMP(525) ! used 16 times - TMP_JAMP(72) = AMP(1349) - AMP(1729) ! used 16 times - TMP_JAMP(71) = AMP(1339) - AMP(1342) ! used 16 times - TMP_JAMP(70) = AMP(570) + AMP(1491) ! used 16 times - TMP_JAMP(69) = AMP(569) + AMP(1735) ! used 16 times - TMP_JAMP(68) = AMP(561) - AMP(564) ! used 16 times - TMP_JAMP(67) = AMP(550) + AMP(552) ! used 16 times - TMP_JAMP(66) = AMP(1317) + AMP(1672) ! used 16 times - TMP_JAMP(65) = AMP(1259) + AMP(1312) ! used 16 times - TMP_JAMP(64) = AMP(1257) + AMP(1260) ! used 16 times - TMP_JAMP(63) = AMP(1251) - AMP(1868) ! used 16 times - TMP_JAMP(62) = AMP(1240) - AMP(1256) ! used 16 times - TMP_JAMP(61) = AMP(1237) + AMP(1239) ! used 16 times - TMP_JAMP(60) = AMP(635) - AMP(1250) ! used 16 times - TMP_JAMP(59) = AMP(558) - AMP(1674) ! used 16 times - TMP_JAMP(58) = AMP(540) + AMP(543) ! used 16 times - TMP_JAMP(57) = AMP(539) + AMP(542) ! used 16 times - TMP_JAMP(56) = AMP(1249) + AMP(1867) ! used 16 times - TMP_JAMP(55) = AMP(1242) + AMP(1246) ! used 16 times - TMP_JAMP(54) = AMP(727) - AMP(1247) ! used 16 times - TMP_JAMP(53) = AMP(717) - AMP(1673) ! used 16 times - TMP_JAMP(52) = AMP(699) + AMP(702) ! used 16 times - TMP_JAMP(51) = AMP(697) + AMP(700) ! used 16 times - TMP_JAMP(50) = AMP(1254) - AMP(1841) ! used 16 times - TMP_JAMP(49) = AMP(1236) + AMP(1238) ! used 16 times - TMP_JAMP(48) = AMP(794) - AMP(1253) ! used 16 times - TMP_JAMP(47) = AMP(698) + AMP(701) ! used 16 times - TMP_JAMP(46) = AMP(1252) + AMP(1840) ! used 16 times - TMP_JAMP(45) = AMP(1241) + AMP(1243) ! used 16 times - TMP_JAMP(44) = AMP(568) - AMP(1244) ! used 16 times - TMP_JAMP(43) = AMP(538) + AMP(541) ! used 16 times - TMP_JAMP(42) = AMP(1258) + AMP(1594) ! used 16 times - TMP_JAMP(41) = AMP(1248) - AMP(1596) ! used 16 times - TMP_JAMP(40) = AMP(1235) - AMP(1255) ! used 16 times - TMP_JAMP(39) = AMP(1231) + AMP(1233) ! used 16 times - TMP_JAMP(38) = AMP(30) - AMP(1600) ! used 16 times - TMP_JAMP(37) = AMP(23) + AMP(1602) ! used 16 times - TMP_JAMP(36) = AMP(15) - AMP(27) ! used 16 times - TMP_JAMP(35) = AMP(11) + AMP(13) ! used 16 times - TMP_JAMP(34) = AMP(1245) - AMP(1595) ! used 16 times - TMP_JAMP(33) = AMP(1232) + AMP(1234) ! used 16 times - TMP_JAMP(32) = AMP(20) + AMP(1601) ! used 16 times - TMP_JAMP(31) = AMP(12) + AMP(14) ! used 16 times - TMP_JAMP(30) = AMP(954) - AMP(1097) ! used 16 times - TMP_JAMP(29) = AMP(952) + AMP(1328) ! used 16 times - TMP_JAMP(28) = AMP(897) - AMP(1331) ! used 16 times - TMP_JAMP(27) = AMP(893) + AMP(896) ! used 16 times - TMP_JAMP(26) = AMP(890) + AMP(1788) ! used 16 times - TMP_JAMP(25) = AMP(889) + AMP(894) ! used 16 times - TMP_JAMP(24) = AMP(881) - AMP(1781) ! used 16 times - TMP_JAMP(23) = AMP(878) + AMP(880) ! used 16 times - TMP_JAMP(22) = AMP(840) + AMP(843) ! used 16 times - TMP_JAMP(21) = AMP(839) + AMP(842) ! used 16 times - TMP_JAMP(20) = AMP(953) - AMP(1175) ! used 16 times - TMP_JAMP(19) = AMP(887) + AMP(1786) ! 
used 16 times - TMP_JAMP(18) = AMP(886) - AMP(1169) ! used 16 times - TMP_JAMP(17) = AMP(879) - AMP(882) ! used 16 times - TMP_JAMP(16) = AMP(857) + AMP(860) ! used 16 times - TMP_JAMP(15) = AMP(856) + AMP(859) ! used 16 times - TMP_JAMP(14) = AMP(891) + AMP(1734) ! used 16 times - TMP_JAMP(13) = AMP(876) - AMP(1727) ! used 16 times - TMP_JAMP(12) = AMP(873) + AMP(875) ! used 16 times - TMP_JAMP(11) = AMP(858) + AMP(861) ! used 16 times - TMP_JAMP(10) = AMP(884) + AMP(1732) ! used 16 times - TMP_JAMP(9) = AMP(883) - AMP(1091) ! used 16 times - TMP_JAMP(8) = AMP(874) - AMP(877) ! used 16 times - TMP_JAMP(7) = AMP(838) + AMP(841) ! used 16 times - TMP_JAMP(6) = AMP(895) - AMP(1597) ! used 16 times - TMP_JAMP(5) = AMP(888) + AMP(1599) ! used 16 times - TMP_JAMP(4) = AMP(872) - AMP(892) ! used 16 times - TMP_JAMP(3) = AMP(868) + AMP(870) ! used 16 times - TMP_JAMP(2) = AMP(885) + AMP(1598) ! used 16 times - TMP_JAMP(1) = AMP(869) + AMP(871) ! used 16 times - TMP_JAMP(315) = TMP_JAMP(186) + TMP_JAMP(183) ! used 16 times - TMP_JAMP(314) = TMP_JAMP(188) + TMP_JAMP(185) ! used 16 times - TMP_JAMP(313) = TMP_JAMP(190) - TMP_JAMP(184) ! used 16 times - TMP_JAMP(312) = TMP_JAMP(191) + TMP_JAMP(189) ! used 16 times - TMP_JAMP(311) = TMP_JAMP(192) - TMP_JAMP(187) ! used 16 times - TMP_JAMP(310) = TMP_JAMP(198) + TMP_JAMP(193) ! used 16 times - TMP_JAMP(309) = TMP_JAMP(199) - TMP_JAMP(197) ! used 16 times - TMP_JAMP(308) = TMP_JAMP(200) + TMP_JAMP(195) ! used 16 times - TMP_JAMP(307) = TMP_JAMP(201) - TMP_JAMP(194) ! used 16 times - TMP_JAMP(306) = TMP_JAMP(202) - TMP_JAMP(196) ! used 16 times - TMP_JAMP(305) = TMP_JAMP(206) - TMP_JAMP(204) ! used 16 times - TMP_JAMP(304) = TMP_JAMP(207) - TMP_JAMP(205) ! used 16 times - TMP_JAMP(303) = TMP_JAMP(208) + TMP_JAMP(203) ! used 16 times - TMP_JAMP(302) = TMP_JAMP(210) - TMP_JAMP(209) ! used 16 times - TMP_JAMP(301) = TMP_JAMP(169) - TMP_JAMP(168) ! used 16 times - TMP_JAMP(300) = TMP_JAMP(170) + TMP_JAMP(165) ! used 16 times - TMP_JAMP(299) = TMP_JAMP(172) + TMP_JAMP(167) ! used 16 times - TMP_JAMP(298) = TMP_JAMP(173) - TMP_JAMP(166) ! used 16 times - TMP_JAMP(297) = TMP_JAMP(174) + TMP_JAMP(171) ! used 16 times - TMP_JAMP(296) = TMP_JAMP(178) - TMP_JAMP(176) ! used 16 times - TMP_JAMP(295) = TMP_JAMP(179) - TMP_JAMP(177) ! used 16 times - TMP_JAMP(294) = TMP_JAMP(180) + TMP_JAMP(175) ! used 16 times - TMP_JAMP(293) = TMP_JAMP(182) - TMP_JAMP(181) ! used 16 times - TMP_JAMP(292) = TMP_JAMP(153) + TMP_JAMP(151) ! used 16 times - TMP_JAMP(291) = TMP_JAMP(155) - TMP_JAMP(154) ! used 16 times - TMP_JAMP(290) = TMP_JAMP(156) + TMP_JAMP(152) ! used 16 times - TMP_JAMP(289) = TMP_JAMP(160) + TMP_JAMP(158) ! used 16 times - TMP_JAMP(288) = TMP_JAMP(161) - TMP_JAMP(159) ! used 16 times - TMP_JAMP(287) = TMP_JAMP(162) + TMP_JAMP(157) ! used 16 times - TMP_JAMP(286) = TMP_JAMP(164) + TMP_JAMP(163) ! used 16 times - TMP_JAMP(285) = TMP_JAMP(139) + TMP_JAMP(138) ! used 16 times - TMP_JAMP(284) = TMP_JAMP(141) + TMP_JAMP(137) ! used 16 times - TMP_JAMP(283) = TMP_JAMP(142) + TMP_JAMP(140) ! used 16 times - TMP_JAMP(282) = TMP_JAMP(146) + TMP_JAMP(144) ! used 16 times - TMP_JAMP(281) = TMP_JAMP(147) + TMP_JAMP(143) ! used 16 times - TMP_JAMP(280) = TMP_JAMP(148) - TMP_JAMP(145) ! used 16 times - TMP_JAMP(279) = TMP_JAMP(150) - TMP_JAMP(149) ! used 16 times - TMP_JAMP(278) = TMP_JAMP(128) + TMP_JAMP(124) ! used 16 times - TMP_JAMP(277) = TMP_JAMP(129) + TMP_JAMP(123) ! used 16 times - TMP_JAMP(276) = TMP_JAMP(130) - TMP_JAMP(126) ! 
used 16 times - TMP_JAMP(275) = TMP_JAMP(131) + TMP_JAMP(127) ! used 16 times - TMP_JAMP(274) = TMP_JAMP(132) + TMP_JAMP(125) ! used 16 times - TMP_JAMP(273) = TMP_JAMP(135) + TMP_JAMP(133) ! used 16 times - TMP_JAMP(272) = TMP_JAMP(136) + TMP_JAMP(134) ! used 16 times - TMP_JAMP(271) = TMP_JAMP(118) + TMP_JAMP(116) ! used 16 times - TMP_JAMP(270) = TMP_JAMP(119) - TMP_JAMP(117) ! used 16 times - TMP_JAMP(269) = TMP_JAMP(120) + TMP_JAMP(115) ! used 16 times - TMP_JAMP(268) = TMP_JAMP(122) - TMP_JAMP(121) ! used 16 times - TMP_JAMP(267) = TMP_JAMP(107) - TMP_JAMP(105) ! used 16 times - TMP_JAMP(266) = TMP_JAMP(108) - TMP_JAMP(106) ! used 16 times - TMP_JAMP(265) = TMP_JAMP(111) - TMP_JAMP(109) ! used 16 times - TMP_JAMP(264) = TMP_JAMP(112) - TMP_JAMP(110) ! used 16 times - TMP_JAMP(263) = TMP_JAMP(114) - TMP_JAMP(113) ! used 16 times - TMP_JAMP(262) = TMP_JAMP(101) - TMP_JAMP(99) ! used 16 times - TMP_JAMP(261) = TMP_JAMP(102) + TMP_JAMP(100) ! used 16 times - TMP_JAMP(260) = TMP_JAMP(104) - TMP_JAMP(103) ! used 16 times - TMP_JAMP(259) = TMP_JAMP(93) - TMP_JAMP(92) ! used 16 times - TMP_JAMP(258) = TMP_JAMP(94) + TMP_JAMP(91) ! used 16 times - TMP_JAMP(257) = TMP_JAMP(96) + TMP_JAMP(95) ! used 16 times - TMP_JAMP(256) = TMP_JAMP(98) + TMP_JAMP(97) ! used 16 times - TMP_JAMP(255) = TMP_JAMP(83) + TMP_JAMP(81) ! used 16 times - TMP_JAMP(254) = TMP_JAMP(84) + TMP_JAMP(82) ! used 16 times - TMP_JAMP(253) = TMP_JAMP(87) + TMP_JAMP(85) ! used 16 times - TMP_JAMP(252) = TMP_JAMP(88) - TMP_JAMP(86) ! used 16 times - TMP_JAMP(251) = TMP_JAMP(90) - TMP_JAMP(89) ! used 16 times - TMP_JAMP(250) = TMP_JAMP(75) + TMP_JAMP(73) ! used 16 times - TMP_JAMP(249) = TMP_JAMP(76) + TMP_JAMP(74) ! used 16 times - TMP_JAMP(248) = TMP_JAMP(79) + TMP_JAMP(77) ! used 16 times - TMP_JAMP(247) = TMP_JAMP(80) + TMP_JAMP(78) ! used 16 times - TMP_JAMP(246) = TMP_JAMP(69) - TMP_JAMP(68) ! used 16 times - TMP_JAMP(245) = TMP_JAMP(70) + TMP_JAMP(67) ! used 16 times - TMP_JAMP(244) = TMP_JAMP(72) - TMP_JAMP(71) ! used 16 times - TMP_JAMP(243) = TMP_JAMP(59) + TMP_JAMP(58) ! used 16 times - TMP_JAMP(242) = TMP_JAMP(60) + TMP_JAMP(57) ! used 16 times - TMP_JAMP(241) = TMP_JAMP(63) + TMP_JAMP(61) ! used 16 times - TMP_JAMP(240) = TMP_JAMP(65) - TMP_JAMP(62) ! used 16 times - TMP_JAMP(239) = TMP_JAMP(66) + TMP_JAMP(64) ! used 16 times - TMP_JAMP(238) = TMP_JAMP(53) + TMP_JAMP(52) ! used 16 times - TMP_JAMP(237) = TMP_JAMP(54) + TMP_JAMP(51) ! used 16 times - TMP_JAMP(236) = TMP_JAMP(56) - TMP_JAMP(55) ! used 16 times - TMP_JAMP(235) = TMP_JAMP(48) + TMP_JAMP(47) ! used 16 times - TMP_JAMP(234) = TMP_JAMP(50) + TMP_JAMP(49) ! used 16 times - TMP_JAMP(233) = TMP_JAMP(44) + TMP_JAMP(43) ! used 16 times - TMP_JAMP(232) = TMP_JAMP(46) - TMP_JAMP(45) ! used 16 times - TMP_JAMP(231) = TMP_JAMP(37) + TMP_JAMP(35) ! used 16 times - TMP_JAMP(230) = TMP_JAMP(38) - TMP_JAMP(36) ! used 16 times - TMP_JAMP(229) = TMP_JAMP(41) + TMP_JAMP(39) ! used 16 times - TMP_JAMP(228) = TMP_JAMP(42) - TMP_JAMP(40) ! used 16 times - TMP_JAMP(227) = TMP_JAMP(32) + TMP_JAMP(31) ! used 16 times - TMP_JAMP(226) = TMP_JAMP(34) + TMP_JAMP(33) ! used 16 times - TMP_JAMP(225) = TMP_JAMP(24) + TMP_JAMP(22) ! used 16 times - TMP_JAMP(224) = TMP_JAMP(26) + TMP_JAMP(23) ! used 16 times - TMP_JAMP(223) = TMP_JAMP(28) + TMP_JAMP(25) ! used 16 times - TMP_JAMP(222) = TMP_JAMP(29) - TMP_JAMP(27) ! used 16 times - TMP_JAMP(221) = TMP_JAMP(30) + TMP_JAMP(21) ! used 16 times - TMP_JAMP(220) = TMP_JAMP(18) + TMP_JAMP(15) ! 
used 16 times - TMP_JAMP(219) = TMP_JAMP(19) - TMP_JAMP(17) ! used 16 times - TMP_JAMP(218) = TMP_JAMP(20) + TMP_JAMP(16) ! used 16 times - TMP_JAMP(217) = TMP_JAMP(13) + TMP_JAMP(11) ! used 16 times - TMP_JAMP(216) = TMP_JAMP(14) + TMP_JAMP(12) ! used 16 times - TMP_JAMP(215) = TMP_JAMP(9) + TMP_JAMP(7) ! used 16 times - TMP_JAMP(214) = TMP_JAMP(10) - TMP_JAMP(8) ! used 16 times - TMP_JAMP(213) = TMP_JAMP(5) + TMP_JAMP(3) ! used 16 times - TMP_JAMP(212) = TMP_JAMP(6) - TMP_JAMP(4) ! used 16 times - TMP_JAMP(211) = TMP_JAMP(2) + TMP_JAMP(1) ! used 16 times - TMP_JAMP(405) = TMP_JAMP(302) - AMP(1390) ! used 16 times - TMP_JAMP(404) = TMP_JAMP(303) + AMP(1822) ! used 16 times - TMP_JAMP(403) = TMP_JAMP(304) - AMP(1819) ! used 16 times - TMP_JAMP(402) = TMP_JAMP(305) + AMP(1392) ! used 16 times - TMP_JAMP(401) = TMP_JAMP(307) + AMP(1501) ! used 16 times - TMP_JAMP(400) = TMP_JAMP(308) - AMP(1824) ! used 16 times - TMP_JAMP(399) = TMP_JAMP(309) + AMP(1821) ! used 16 times - TMP_JAMP(398) = TMP_JAMP(310) - AMP(1495) ! used 16 times - TMP_JAMP(397) = TMP_JAMP(311) + AMP(1387) ! used 16 times - TMP_JAMP(396) = TMP_JAMP(312) + AMP(1381) ! used 16 times - TMP_JAMP(395) = TMP_JAMP(313) - AMP(1503) ! used 16 times - TMP_JAMP(394) = TMP_JAMP(315) + AMP(1497) ! used 16 times - TMP_JAMP(393) = TMP_JAMP(293) + AMP(1393) ! used 16 times - TMP_JAMP(392) = TMP_JAMP(294) - AMP(1687) ! used 16 times - TMP_JAMP(391) = TMP_JAMP(295) - AMP(1395) ! used 16 times - TMP_JAMP(390) = TMP_JAMP(296) - AMP(1684) ! used 16 times - TMP_JAMP(389) = TMP_JAMP(298) - AMP(1502) ! used 16 times - TMP_JAMP(388) = TMP_JAMP(299) + AMP(1689) ! used 16 times - TMP_JAMP(387) = TMP_JAMP(300) + AMP(1496) ! used 16 times - TMP_JAMP(386) = TMP_JAMP(301) + AMP(1686) ! used 16 times - TMP_JAMP(385) = TMP_JAMP(286) + AMP(1876) ! used 16 times - TMP_JAMP(384) = TMP_JAMP(287) - AMP(1823) ! used 16 times - TMP_JAMP(383) = TMP_JAMP(288) + AMP(1820) ! used 16 times - TMP_JAMP(382) = TMP_JAMP(289) - AMP(1878) ! used 16 times - TMP_JAMP(381) = TMP_JAMP(290) + AMP(1872) ! used 16 times - TMP_JAMP(380) = TMP_JAMP(291) + AMP(1866) ! used 16 times - TMP_JAMP(379) = TMP_JAMP(279) - AMP(1873) ! used 16 times - TMP_JAMP(378) = TMP_JAMP(280) + AMP(1875) ! used 16 times - TMP_JAMP(377) = TMP_JAMP(281) - AMP(1552) ! used 16 times - TMP_JAMP(376) = TMP_JAMP(282) - AMP(1549) ! used 16 times - TMP_JAMP(375) = TMP_JAMP(284) + AMP(1554) ! used 16 times - TMP_JAMP(374) = TMP_JAMP(285) + AMP(1551) ! used 16 times - TMP_JAMP(373) = TMP_JAMP(272) - AMP(1795) ! used 16 times - TMP_JAMP(372) = TMP_JAMP(273) - AMP(1784) ! used 16 times - TMP_JAMP(371) = TMP_JAMP(274) + AMP(1688) ! used 16 times - TMP_JAMP(370) = TMP_JAMP(275) + AMP(1797) ! used 16 times - TMP_JAMP(369) = TMP_JAMP(276) + AMP(1685) ! used 16 times - TMP_JAMP(368) = TMP_JAMP(278) + AMP(1790) ! used 16 times - TMP_JAMP(367) = TMP_JAMP(268) - AMP(1792) ! used 16 times - TMP_JAMP(366) = TMP_JAMP(269) + AMP(1553) ! used 16 times - TMP_JAMP(365) = TMP_JAMP(270) + AMP(1794) ! used 16 times - TMP_JAMP(364) = TMP_JAMP(271) + AMP(1550) ! used 16 times - TMP_JAMP(363) = TMP_JAMP(263) + AMP(1391) ! used 16 times - TMP_JAMP(362) = TMP_JAMP(265) + AMP(1555) ! used 16 times - TMP_JAMP(361) = TMP_JAMP(266) + AMP(1384) ! used 16 times - TMP_JAMP(360) = TMP_JAMP(267) - AMP(1557) ! used 16 times - TMP_JAMP(359) = TMP_JAMP(260) - AMP(1394) ! used 16 times - TMP_JAMP(358) = TMP_JAMP(262) - AMP(1556) ! used 16 times - TMP_JAMP(357) = TMP_JAMP(256) + AMP(1849) ! used 16 times - TMP_JAMP(356) = TMP_JAMP(257) - AMP(1851) ! 
used 16 times - TMP_JAMP(355) = TMP_JAMP(258) + AMP(1845) ! used 16 times - TMP_JAMP(354) = TMP_JAMP(259) + AMP(1839) ! used 16 times - TMP_JAMP(353) = TMP_JAMP(251) - AMP(1846) ! used 16 times - TMP_JAMP(352) = TMP_JAMP(252) + AMP(1848) ! used 16 times - TMP_JAMP(351) = TMP_JAMP(253) - AMP(1498) ! used 16 times - TMP_JAMP(350) = TMP_JAMP(255) + AMP(1500) ! used 16 times - TMP_JAMP(349) = TMP_JAMP(247) - AMP(1741) ! used 16 times - TMP_JAMP(348) = TMP_JAMP(248) - AMP(1730) ! used 16 times - TMP_JAMP(347) = TMP_JAMP(249) + AMP(1743) ! used 16 times - TMP_JAMP(346) = TMP_JAMP(250) + AMP(1736) ! used 16 times - TMP_JAMP(345) = TMP_JAMP(244) - AMP(1738) ! used 16 times - TMP_JAMP(344) = TMP_JAMP(245) + AMP(1499) ! used 16 times - TMP_JAMP(343) = TMP_JAMP(246) + AMP(1740) ! used 16 times - TMP_JAMP(342) = TMP_JAMP(239) - AMP(1676) ! used 16 times - TMP_JAMP(341) = TMP_JAMP(241) - AMP(1877) ! used 16 times - TMP_JAMP(340) = TMP_JAMP(242) + AMP(1869) ! used 16 times - TMP_JAMP(339) = TMP_JAMP(243) + AMP(1682) ! used 16 times - TMP_JAMP(338) = TMP_JAMP(236) + AMP(1874) ! used 16 times - TMP_JAMP(337) = TMP_JAMP(238) + AMP(1679) ! used 16 times - TMP_JAMP(336) = TMP_JAMP(234) - AMP(1850) ! used 16 times - TMP_JAMP(335) = TMP_JAMP(235) + AMP(1842) ! used 16 times - TMP_JAMP(334) = TMP_JAMP(232) + AMP(1847) ! used 16 times - TMP_JAMP(333) = TMP_JAMP(228) + AMP(1609) ! used 16 times - TMP_JAMP(332) = TMP_JAMP(229) - AMP(1606) ! used 16 times - TMP_JAMP(331) = TMP_JAMP(230) - AMP(1611) ! used 16 times - TMP_JAMP(330) = TMP_JAMP(231) + AMP(1608) ! used 16 times - TMP_JAMP(329) = TMP_JAMP(226) - AMP(1603) ! used 16 times - TMP_JAMP(328) = TMP_JAMP(227) + AMP(1605) ! used 16 times - TMP_JAMP(327) = TMP_JAMP(221) + AMP(1818) ! used 16 times - TMP_JAMP(326) = TMP_JAMP(222) + AMP(1812) ! used 16 times - TMP_JAMP(325) = TMP_JAMP(224) + AMP(1796) ! used 16 times - TMP_JAMP(324) = TMP_JAMP(225) + AMP(1787) ! used 16 times - TMP_JAMP(323) = TMP_JAMP(218) + AMP(1815) ! used 16 times - TMP_JAMP(322) = TMP_JAMP(219) + AMP(1793) ! used 16 times - TMP_JAMP(321) = TMP_JAMP(216) + AMP(1742) ! used 16 times - TMP_JAMP(320) = TMP_JAMP(217) + AMP(1733) ! used 16 times - TMP_JAMP(319) = TMP_JAMP(214) + AMP(1739) ! used 16 times - TMP_JAMP(318) = TMP_JAMP(212) - AMP(1610) ! used 16 times - TMP_JAMP(317) = TMP_JAMP(213) + AMP(1607) ! used 16 times - TMP_JAMP(316) = TMP_JAMP(211) + AMP(1604) ! used 16 times - TMP_JAMP(1030) = AMP(1455) + AMP(1456) ! used 8 times - TMP_JAMP(1029) = AMP(1147) + AMP(1537) ! used 8 times - TMP_JAMP(1028) = AMP(1125) - AMP(1516) ! used 8 times - TMP_JAMP(1027) = AMP(1122) + AMP(1123) ! used 8 times - TMP_JAMP(1026) = AMP(1117) + AMP(1125) ! used 8 times - TMP_JAMP(1025) = AMP(439) - AMP(442) ! used 8 times - TMP_JAMP(1024) = AMP(421) - AMP(424) ! used 8 times - TMP_JAMP(1023) = AMP(420) + AMP(422) ! used 8 times - TMP_JAMP(1022) = AMP(353) + AMP(440) ! used 8 times - TMP_JAMP(1021) = AMP(341) - AMP(353) ! used 8 times - TMP_JAMP(1020) = AMP(339) - AMP(447) ! used 8 times - TMP_JAMP(1019) = AMP(337) + AMP(339) ! used 8 times - TMP_JAMP(1018) = AMP(152) - AMP(1539) ! used 8 times - TMP_JAMP(1017) = AMP(151) + AMP(157) ! used 8 times - TMP_JAMP(1016) = AMP(139) - AMP(159) ! used 8 times - TMP_JAMP(1015) = AMP(99) - AMP(1458) ! used 8 times - TMP_JAMP(1014) = AMP(90) + AMP(1518) ! used 8 times - TMP_JAMP(1013) = AMP(88) + AMP(99) ! used 8 times - TMP_JAMP(1012) = AMP(84) + AMP(90) ! used 8 times - TMP_JAMP(1011) = TMP_JAMP(306) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(341) ! 
used 8 times - TMP_JAMP(1010) = TMP_JAMP(314) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(88) ! used 8 times - TMP_JAMP(1009) = TMP_JAMP(394) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(84) ! used 8 times - TMP_JAMP(1008) = TMP_JAMP(395) - TMP_JAMP(394) ! used 8 times - TMP_JAMP(1007) = TMP_JAMP(397) - TMP_JAMP(396) ! used 8 times - TMP_JAMP(1006) = TMP_JAMP(398) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1117) ! used 8 times - TMP_JAMP(1005) = TMP_JAMP(400) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(337) ! used 8 times - TMP_JAMP(1004) = TMP_JAMP(400) - TMP_JAMP(399) ! used 8 times - TMP_JAMP(1003) = TMP_JAMP(401) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1147) ! used 8 times - TMP_JAMP(1002) = TMP_JAMP(401) - TMP_JAMP(398) ! used 8 times - TMP_JAMP(1001) = TMP_JAMP(404) - TMP_JAMP(403) ! used 8 times - TMP_JAMP(1000) = TMP_JAMP(405) + TMP_JAMP(402) ! used 8 times - TMP_JAMP(999) = AMP(1457) - AMP(1690) ! used 8 times - TMP_JAMP(998) = AMP(1453) - AMP(1457) ! used 8 times - TMP_JAMP(997) = AMP(1064) - AMP(1066) ! used 8 times - TMP_JAMP(996) = AMP(1046) - AMP(1048) ! used 8 times - TMP_JAMP(995) = AMP(1044) + AMP(1050) ! used 8 times - TMP_JAMP(994) = AMP(622) - AMP(1538) ! used 8 times - TMP_JAMP(993) = AMP(597) - AMP(599) ! used 8 times - TMP_JAMP(992) = AMP(592) + AMP(600) ! used 8 times - TMP_JAMP(991) = AMP(513) - AMP(1068) ! used 8 times - TMP_JAMP(990) = AMP(504) + AMP(1662) ! used 8 times - TMP_JAMP(989) = AMP(502) + AMP(513) ! used 8 times - TMP_JAMP(988) = AMP(498) + AMP(504) ! used 8 times - TMP_JAMP(987) = TMP_JAMP(297) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(502) ! used 8 times - TMP_JAMP(986) = TMP_JAMP(387) + TMP_JAMP(386) ! used 8 times - TMP_JAMP(985) = TMP_JAMP(388) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(498) ! used 8 times - TMP_JAMP(984) = TMP_JAMP(388) - TMP_JAMP(386) ! used 8 times - TMP_JAMP(983) = TMP_JAMP(389) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(622) ! used 8 times - TMP_JAMP(982) = TMP_JAMP(389) - TMP_JAMP(387) ! used 8 times - TMP_JAMP(981) = TMP_JAMP(390) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1453) ! used 8 times - TMP_JAMP(980) = TMP_JAMP(392) - TMP_JAMP(390) ! used 8 times - TMP_JAMP(979) = TMP_JAMP(393) + TMP_JAMP(391) ! used 8 times - TMP_JAMP(978) = TMP_JAMP(394) + TMP_JAMP(387) ! used 8 times - TMP_JAMP(977) = TMP_JAMP(395) + TMP_JAMP(389) ! used 8 times - TMP_JAMP(976) = TMP_JAMP(396) - TMP_JAMP(393) ! used 8 times - TMP_JAMP(975) = TMP_JAMP(397) + TMP_JAMP(391) ! used 8 times - TMP_JAMP(974) = AMP(1201) - AMP(1826) ! used 8 times - TMP_JAMP(973) = AMP(1200) + AMP(1201) ! used 8 times - TMP_JAMP(972) = AMP(626) + AMP(631) ! used 8 times - TMP_JAMP(971) = AMP(598) - AMP(1202) ! used 8 times - TMP_JAMP(970) = AMP(526) + AMP(598) ! used 8 times - TMP_JAMP(969) = AMP(517) - AMP(633) ! used 8 times - TMP_JAMP(968) = AMP(441) + AMP(463) ! used 8 times - TMP_JAMP(967) = AMP(438) + AMP(441) ! used 8 times - TMP_JAMP(966) = AMP(356) + AMP(358) ! used 8 times - TMP_JAMP(965) = AMP(355) + AMP(357) ! used 8 times - TMP_JAMP(964) = TMP_JAMP(292) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(526) ! used 8 times - TMP_JAMP(963) = TMP_JAMP(380) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(626) ! used 8 times - TMP_JAMP(962) = TMP_JAMP(381) - TMP_JAMP(380) ! 
used 8 times - TMP_JAMP(961) = TMP_JAMP(383) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1200) ! used 8 times - TMP_JAMP(960) = TMP_JAMP(384) - TMP_JAMP(383) ! used 8 times - TMP_JAMP(959) = TMP_JAMP(385) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(438) ! used 8 times - TMP_JAMP(958) = TMP_JAMP(385) + TMP_JAMP(380) ! used 8 times - TMP_JAMP(957) = TMP_JAMP(385) + TMP_JAMP(382) ! used 8 times - TMP_JAMP(956) = TMP_JAMP(387) - TMP_JAMP(292) ! used 8 times - TMP_JAMP(955) = TMP_JAMP(398) - TMP_JAMP(387) ! used 8 times - TMP_JAMP(954) = TMP_JAMP(399) + TMP_JAMP(383) ! used 8 times - TMP_JAMP(953) = TMP_JAMP(400) + TMP_JAMP(384) ! used 8 times - TMP_JAMP(952) = TMP_JAMP(401) - TMP_JAMP(389) ! used 8 times - TMP_JAMP(951) = AMP(1464) + AMP(1465) ! used 8 times - TMP_JAMP(950) = AMP(1212) + AMP(1213) ! used 8 times - TMP_JAMP(949) = AMP(1207) + AMP(1215) ! used 8 times - TMP_JAMP(948) = AMP(1203) - AMP(1570) ! used 8 times - TMP_JAMP(947) = AMP(1195) + AMP(1203) ! used 8 times - TMP_JAMP(946) = AMP(111) - AMP(1467) ! used 8 times - TMP_JAMP(945) = AMP(108) + AMP(1581) ! used 8 times - TMP_JAMP(944) = AMP(106) + AMP(111) ! used 8 times - TMP_JAMP(943) = AMP(102) + AMP(108) ! used 8 times - TMP_JAMP(942) = AMP(89) + AMP(1572) ! used 8 times - TMP_JAMP(941) = AMP(86) + AMP(89) ! used 8 times - TMP_JAMP(940) = TMP_JAMP(283) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(106) ! used 8 times - TMP_JAMP(939) = TMP_JAMP(374) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(86) ! used 8 times - TMP_JAMP(938) = TMP_JAMP(375) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(102) ! used 8 times - TMP_JAMP(937) = TMP_JAMP(376) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1195) ! used 8 times - TMP_JAMP(936) = TMP_JAMP(377) + TMP_JAMP(376) ! used 8 times - TMP_JAMP(935) = TMP_JAMP(379) + TMP_JAMP(378) ! used 8 times - TMP_JAMP(934) = TMP_JAMP(380) + TMP_JAMP(379) ! used 8 times - TMP_JAMP(933) = TMP_JAMP(381) - TMP_JAMP(378) ! used 8 times - TMP_JAMP(932) = TMP_JAMP(394) + TMP_JAMP(374) ! used 8 times - TMP_JAMP(931) = TMP_JAMP(395) + TMP_JAMP(283) ! used 8 times - TMP_JAMP(930) = AMP(1150) + AMP(1774) ! used 8 times - TMP_JAMP(929) = AMP(1067) - AMP(1768) ! used 8 times - TMP_JAMP(928) = AMP(1062) + AMP(1067) ! used 8 times - TMP_JAMP(927) = AMP(758) + AMP(1691) ! used 8 times - TMP_JAMP(926) = AMP(757) - AMP(1124) ! used 8 times - TMP_JAMP(925) = AMP(756) - AMP(758) ! used 8 times - TMP_JAMP(924) = AMP(685) + AMP(757) ! used 8 times - TMP_JAMP(923) = AMP(678) - AMP(1776) ! used 8 times - TMP_JAMP(922) = AMP(663) + AMP(1661) ! used 8 times - TMP_JAMP(921) = AMP(659) + AMP(662) ! used 8 times - TMP_JAMP(920) = AMP(657) + AMP(663) ! used 8 times - TMP_JAMP(919) = TMP_JAMP(277) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(685) ! used 8 times - TMP_JAMP(918) = TMP_JAMP(368) + ((0.000000000000000D+00 + TMP_JAMP(15) = AMP(37) + AMP(40) ! used 16 times + TMP_JAMP(14) = AMP(25) + AMP(28) ! used 16 times + TMP_JAMP(13) = AMP(16) + AMP(26) ! used 16 times + TMP_JAMP(12) = AMP(5) - AMP(38) ! used 16 times + TMP_JAMP(11) = AMP(2) + AMP(4) ! used 16 times + TMP_JAMP(10) = AMP(22) + AMP(39) ! used 16 times + TMP_JAMP(9) = AMP(7) + AMP(9) ! used 16 times + TMP_JAMP(8) = AMP(6) + AMP(8) ! used 16 times + TMP_JAMP(7) = AMP(31) + AMP(34) ! used 16 times + TMP_JAMP(6) = AMP(10) - AMP(33) ! used 16 times + TMP_JAMP(5) = AMP(19) + AMP(32) ! used 16 times + TMP_JAMP(4) = AMP(1) + AMP(3) ! 
used 16 times
+      TMP_JAMP(3) = AMP(15) - AMP(27) ! used 16 times
+      TMP_JAMP(2) = AMP(11) + AMP(13) ! used 16 times
+      TMP_JAMP(1) = AMP(12) + AMP(14) ! used 16 times
+      TMP_JAMP(30) = TMP_JAMP(15) + AMP(43) ! used 16 times
+      TMP_JAMP(29) = TMP_JAMP(14) - AMP(45) ! used 16 times
+      TMP_JAMP(28) = TMP_JAMP(13) + AMP(29) ! used 16 times
+      TMP_JAMP(27) = TMP_JAMP(12) - AMP(41) ! used 16 times
+      TMP_JAMP(26) = TMP_JAMP(11) + AMP(18) ! used 16 times
+      TMP_JAMP(25) = TMP_JAMP(10) + AMP(42) ! used 16 times
+      TMP_JAMP(24) = TMP_JAMP(9) + AMP(17) ! used 16 times
+      TMP_JAMP(23) = TMP_JAMP(8) + AMP(24) ! used 16 times
+      TMP_JAMP(22) = TMP_JAMP(7) - AMP(44) ! used 16 times
+      TMP_JAMP(21) = TMP_JAMP(6) - AMP(36) ! used 16 times
+      TMP_JAMP(20) = TMP_JAMP(5) + AMP(35) ! used 16 times
+      TMP_JAMP(19) = TMP_JAMP(4) + AMP(21) ! used 16 times
+      TMP_JAMP(18) = TMP_JAMP(3) - AMP(30) ! used 16 times
+      TMP_JAMP(17) = TMP_JAMP(2) + AMP(23) ! used 16 times
+      TMP_JAMP(16) = TMP_JAMP(1) + AMP(20) ! used 16 times
+      TMP_JAMP(113) = TMP_JAMP(30) + TMP_JAMP(29) ! used 8 times
+      TMP_JAMP(112) = TMP_JAMP(30) - TMP_JAMP(22) ! used 8 times
+      TMP_JAMP(111) = TMP_JAMP(30) - TMP_JAMP(25) ! used 8 times
+      TMP_JAMP(110) = TMP_JAMP(29) + TMP_JAMP(28) ! used 8 times
+      TMP_JAMP(109) = TMP_JAMP(29) + TMP_JAMP(22) ! used 8 times
+      TMP_JAMP(108) = TMP_JAMP(28) - TMP_JAMP(26) ! used 8 times
+      TMP_JAMP(107) = TMP_JAMP(28) + TMP_JAMP(24) ! used 8 times
+      TMP_JAMP(106) = TMP_JAMP(27) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(152) ! used 8 times
+      TMP_JAMP(105) = TMP_JAMP(27) + TMP_JAMP(26) ! used 8 times
+      TMP_JAMP(104) = TMP_JAMP(27) - TMP_JAMP(25) ! used 8 times
+      TMP_JAMP(103) = TMP_JAMP(27) - TMP_JAMP(19) ! used 8 times
+      TMP_JAMP(102) = TMP_JAMP(27) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(153) ! used 8 times
+      TMP_JAMP(101) = TMP_JAMP(26) + TMP_JAMP(24) ! used 8 times
+      TMP_JAMP(100) = TMP_JAMP(26) + TMP_JAMP(19) ! used 8 times
+      TMP_JAMP(99) = TMP_JAMP(25) - TMP_JAMP(23) ! used 8 times
+      TMP_JAMP(98) = TMP_JAMP(25) + TMP_JAMP(17) ! used 8 times
+      TMP_JAMP(97) = TMP_JAMP(24) + TMP_JAMP(23) ! used 8 times
+      TMP_JAMP(96) = TMP_JAMP(23) + TMP_JAMP(17) ! used 8 times
+      TMP_JAMP(95) = TMP_JAMP(22) + TMP_JAMP(20) ! used 8 times
+      TMP_JAMP(94) = TMP_JAMP(21) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(146) ! used 8 times
+      TMP_JAMP(93) = TMP_JAMP(21) - TMP_JAMP(20) ! used 8 times
+      TMP_JAMP(92) = TMP_JAMP(21) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(147) ! used 8 times
+      TMP_JAMP(91) = TMP_JAMP(20) - TMP_JAMP(19) ! used 8 times
+      TMP_JAMP(90) = TMP_JAMP(20) + TMP_JAMP(16) ! used 8 times
+      TMP_JAMP(89) = TMP_JAMP(19) + TMP_JAMP(16) ! used 8 times
+      TMP_JAMP(88) = TMP_JAMP(18) - TMP_JAMP(17) ! used 8 times
+      TMP_JAMP(87) = TMP_JAMP(18) + TMP_JAMP(16) ! used 8 times
+      TMP_JAMP(86) = TMP_JAMP(18) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(140) ! used 8 times
+      TMP_JAMP(85) = TMP_JAMP(17) + TMP_JAMP(16) ! used 8 times
+      TMP_JAMP(84) = AMP(439) - AMP(442) ! used 8 times
+      TMP_JAMP(83) = AMP(421) - AMP(424) ! used 8 times
+      TMP_JAMP(82) = AMP(420) + AMP(422) ! used 8 times
+      TMP_JAMP(81) = AMP(341) - AMP(353) ! used 8 times
+      TMP_JAMP(80) = AMP(337) + AMP(339) ! used 8 times
+      TMP_JAMP(79) = AMP(151) + AMP(157) ! used 8 times
+      TMP_JAMP(78) = AMP(139) - AMP(159) ! used 8 times
+      TMP_JAMP(77) = AMP(88) + AMP(99) ! used 8 times
+      TMP_JAMP(76) = AMP(84) + AMP(90) ! used 8 times
+      TMP_JAMP(75) = AMP(438) + AMP(441) ! used 8 times
+      TMP_JAMP(74) = AMP(356) + AMP(358) ! used 8 times
+      TMP_JAMP(73) = AMP(355) + AMP(357) ! used 8 times
+      TMP_JAMP(72) = AMP(106) + AMP(111) ! used 8 times
+      TMP_JAMP(71) = AMP(102) + AMP(108) ! used 8 times
+      TMP_JAMP(70) = AMP(86) + AMP(89) ! used 8 times
+      TMP_JAMP(69) = AMP(430) - AMP(433) ! used 8 times
+      TMP_JAMP(68) = AMP(359) - AMP(371) ! used 8 times
+      TMP_JAMP(67) = AMP(145) - AMP(158) ! used 8 times
+      TMP_JAMP(66) = AMP(429) + AMP(431) ! used 8 times
+      TMP_JAMP(65) = AMP(338) + AMP(340) ! used 8 times
+      TMP_JAMP(64) = AMP(118) + AMP(123) ! used 8 times
+      TMP_JAMP(63) = AMP(114) + AMP(120) ! used 8 times
+      TMP_JAMP(62) = AMP(377) - AMP(389) ! used 8 times
+      TMP_JAMP(61) = AMP(374) + AMP(376) ! used 8 times
+      TMP_JAMP(60) = AMP(373) + AMP(375) ! used 8 times
+      TMP_JAMP(59) = AMP(104) + AMP(107) ! used 8 times
+      TMP_JAMP(58) = AMP(116) + AMP(119) ! used 8 times
+      TMP_JAMP(57) = AMP(70) + AMP(81) ! used 8 times
+      TMP_JAMP(56) = AMP(66) + AMP(72) ! used 8 times
+      TMP_JAMP(55) = AMP(68) + AMP(71) ! used 8 times
+      TMP_JAMP(54) = AMP(130) + AMP(135) ! used 8 times
+      TMP_JAMP(53) = AMP(126) + AMP(132) ! used 8 times
+      TMP_JAMP(52) = AMP(128) + AMP(131) ! used 8 times
+      TMP_JAMP(51) = AMP(52) + AMP(63) ! used 8 times
+      TMP_JAMP(50) = AMP(48) + AMP(54) ! used 8 times
+      TMP_JAMP(49) = AMP(50) + AMP(53) ! used 8 times
+      TMP_JAMP(48) = AMP(149) + AMP(154) ! used 8 times
+      TMP_JAMP(47) = AMP(137) - AMP(156) ! used 8 times
+      TMP_JAMP(46) = AMP(143) - AMP(155) ! used 8 times
+      TMP_JAMP(45) = AMP(280) - AMP(283) ! used 8 times
+      TMP_JAMP(44) = AMP(262) - AMP(265) ! used 8 times
+      TMP_JAMP(43) = AMP(261) + AMP(263) ! used 8 times
+      TMP_JAMP(42) = AMP(182) - AMP(194) ! used 8 times
+      TMP_JAMP(41) = AMP(178) + AMP(180) ! used 8 times
+      TMP_JAMP(40) = AMP(279) + AMP(282) ! used 8 times
+      TMP_JAMP(39) = AMP(197) + AMP(199) ! used 8 times
+      TMP_JAMP(38) = AMP(196) + AMP(198) ! used 8 times
+      TMP_JAMP(37) = AMP(271) - AMP(274) ! used 8 times
+      TMP_JAMP(36) = AMP(200) - AMP(212) ! used 8 times
+      TMP_JAMP(35) = AMP(270) + AMP(272) ! used 8 times
+      TMP_JAMP(34) = AMP(179) + AMP(181) ! used 8 times
+      TMP_JAMP(33) = AMP(218) - AMP(230) ! used 8 times
+      TMP_JAMP(32) = AMP(215) + AMP(217) ! used 8 times
+      TMP_JAMP(31) = AMP(214) + AMP(216) ! used 8 times
+      TMP_JAMP(140) = TMP_JAMP(82) + AMP(445) ! used 8 times
+      TMP_JAMP(139) = TMP_JAMP(81) - AMP(440) ! used 8 times
+      TMP_JAMP(138) = TMP_JAMP(80) - AMP(447) ! used 8 times
+      TMP_JAMP(137) = TMP_JAMP(75) + AMP(463) ! used 8 times
+      TMP_JAMP(136) = TMP_JAMP(74) - AMP(465) ! used 8 times
+      TMP_JAMP(135) = TMP_JAMP(73) - AMP(446) ! used 8 times
+      TMP_JAMP(134) = TMP_JAMP(68) - AMP(432) ! used 8 times
+      TMP_JAMP(133) = TMP_JAMP(66) + AMP(454) ! used 8 times
+      TMP_JAMP(132) = TMP_JAMP(65) - AMP(456) ! used 8 times
+      TMP_JAMP(131) = TMP_JAMP(62) - AMP(423) ! used 8 times
+      TMP_JAMP(130) = TMP_JAMP(61) - AMP(464) ! used 8 times
+      TMP_JAMP(129) = TMP_JAMP(60) - AMP(455) ! used 8 times
+      TMP_JAMP(128) = TMP_JAMP(45) + AMP(316) ! used 8 times
+      TMP_JAMP(127) = TMP_JAMP(44) - AMP(318) ! used 8 times
+      TMP_JAMP(126) = TMP_JAMP(43) + AMP(286) ! used 8 times
+      TMP_JAMP(125) = TMP_JAMP(42) - AMP(281) ! used 8 times
+      TMP_JAMP(124) = TMP_JAMP(41) - AMP(288) ! used 8 times
+      TMP_JAMP(123) = TMP_JAMP(40) + AMP(304) ! used 8 times
+      TMP_JAMP(122) = TMP_JAMP(39) - AMP(306) ! used 8 times
+      TMP_JAMP(121) = TMP_JAMP(38) - AMP(287) ! used 8 times
+      TMP_JAMP(120) = TMP_JAMP(37) - AMP(317) ! used 8 times
+      TMP_JAMP(119) = TMP_JAMP(36) - AMP(273) ! used 8 times
+      TMP_JAMP(118) = TMP_JAMP(35) + AMP(295) ! used 8 times
+      TMP_JAMP(117) = TMP_JAMP(34) - AMP(297) ! used 8 times
+      TMP_JAMP(116) = TMP_JAMP(33) - AMP(264) ! used 8 times
+      TMP_JAMP(115) = TMP_JAMP(32) - AMP(305) ! used 8 times
+      TMP_JAMP(114) = TMP_JAMP(31) - AMP(296) ! used 8 times
+      TMP_JAMP(312) = TMP_JAMP(140) + TMP_JAMP(138) ! used 4 times
+      TMP_JAMP(311) = TMP_JAMP(140) - TMP_JAMP(135) ! used 4 times
+      TMP_JAMP(310) = TMP_JAMP(139) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(352) ! used 4 times
+      TMP_JAMP(309) = TMP_JAMP(139) - TMP_JAMP(138) ! used 4 times
+      TMP_JAMP(308) = TMP_JAMP(139) - TMP_JAMP(137) ! used 4 times
+      TMP_JAMP(307) = TMP_JAMP(139) + TMP_JAMP(132) ! used 4 times
+      TMP_JAMP(306) = TMP_JAMP(139) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(354) ! used 4 times
+      TMP_JAMP(305) = TMP_JAMP(138) + TMP_JAMP(135) ! used 4 times
+      TMP_JAMP(304) = TMP_JAMP(138) + TMP_JAMP(132) ! used 4 times
+      TMP_JAMP(303) = TMP_JAMP(137) + TMP_JAMP(136) ! used 4 times
+      TMP_JAMP(302) = TMP_JAMP(137) - TMP_JAMP(84) ! used 4 times
+      TMP_JAMP(301) = TMP_JAMP(136) + TMP_JAMP(135) ! used 4 times
+      TMP_JAMP(300) = TMP_JAMP(136) + TMP_JAMP(130) ! used 4 times
+      TMP_JAMP(299) = TMP_JAMP(134) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(370) ! used 4 times
+      TMP_JAMP(298) = TMP_JAMP(134) - TMP_JAMP(133) ! used 4 times
+      TMP_JAMP(297) = TMP_JAMP(134) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(372) ! used 4 times
+      TMP_JAMP(296) = TMP_JAMP(133) + TMP_JAMP(132) ! used 4 times
+      TMP_JAMP(295) = TMP_JAMP(133) - TMP_JAMP(129) ! used 4 times
+      TMP_JAMP(294) = TMP_JAMP(132) + TMP_JAMP(129) ! used 4 times
+      TMP_JAMP(293) = TMP_JAMP(131) + TMP_JAMP(130) ! used 4 times
+      TMP_JAMP(292) = TMP_JAMP(131) - TMP_JAMP(129) ! used 4 times
+      TMP_JAMP(291) = TMP_JAMP(131) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(388) ! used 4 times
+      TMP_JAMP(290) = TMP_JAMP(130) + TMP_JAMP(129) ! used 4 times
+      TMP_JAMP(289) = TMP_JAMP(128) + TMP_JAMP(127) ! used 4 times
+      TMP_JAMP(288) = TMP_JAMP(128) - TMP_JAMP(120) ! used 4 times
+      TMP_JAMP(287) = TMP_JAMP(128) - TMP_JAMP(123) ! used 4 times
+      TMP_JAMP(286) = TMP_JAMP(127) + TMP_JAMP(126) ! used 4 times
+      TMP_JAMP(285) = TMP_JAMP(127) + TMP_JAMP(120) ! used 4 times
+      TMP_JAMP(284) = TMP_JAMP(126) + TMP_JAMP(124) ! used 4 times
+      TMP_JAMP(283) = TMP_JAMP(126) - TMP_JAMP(121) ! used 4 times
+      TMP_JAMP(282) = TMP_JAMP(125) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(195) ! used 4 times
+      TMP_JAMP(281) = TMP_JAMP(125) - TMP_JAMP(124) ! used 4 times
+      TMP_JAMP(280) = TMP_JAMP(125) - TMP_JAMP(123) ! used 4 times
+      TMP_JAMP(279) = TMP_JAMP(125) + TMP_JAMP(117) ! used 4 times
+      TMP_JAMP(278) = TMP_JAMP(125) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(193) ! used 4 times
+      TMP_JAMP(277) = TMP_JAMP(124) + TMP_JAMP(121) ! used 4 times
+      TMP_JAMP(276) = TMP_JAMP(124) + TMP_JAMP(117) ! used 4 times
+      TMP_JAMP(275) = TMP_JAMP(123) + TMP_JAMP(122) ! used 4 times
+      TMP_JAMP(274) = TMP_JAMP(123) - TMP_JAMP(115) ! used 4 times
+      TMP_JAMP(273) = TMP_JAMP(122) + TMP_JAMP(121) ! used 4 times
+      TMP_JAMP(272) = TMP_JAMP(122) + TMP_JAMP(115) ! used 4 times
+      TMP_JAMP(271) = TMP_JAMP(120) + TMP_JAMP(118) ! used 4 times
+      TMP_JAMP(270) = TMP_JAMP(119) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(213) ! used 4 times
+      TMP_JAMP(269) = TMP_JAMP(119) - TMP_JAMP(118) ! used 4 times
+      TMP_JAMP(268) = TMP_JAMP(119) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(211) ! used 4 times
+      TMP_JAMP(267) = TMP_JAMP(118) + TMP_JAMP(117) ! used 4 times
+      TMP_JAMP(266) = TMP_JAMP(118) - TMP_JAMP(114) ! used 4 times
+      TMP_JAMP(265) = TMP_JAMP(117) + TMP_JAMP(114) ! used 4 times
+      TMP_JAMP(264) = TMP_JAMP(116) + TMP_JAMP(115) ! used 4 times
+      TMP_JAMP(263) = TMP_JAMP(116) - TMP_JAMP(114) ! used 4 times
+      TMP_JAMP(262) = TMP_JAMP(116) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(231) ! used 4 times
+      TMP_JAMP(261) = TMP_JAMP(115) + TMP_JAMP(114) ! used 4 times
+      TMP_JAMP(260) = TMP_JAMP(112) - TMP_JAMP(91) ! used 4 times
+      TMP_JAMP(259) = TMP_JAMP(112) - TMP_JAMP(99) ! used 4 times
+      TMP_JAMP(258) = TMP_JAMP(109) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(67) ! used 4 times
+      TMP_JAMP(257) = TMP_JAMP(109) + TMP_JAMP(107) ! used 4 times
+      TMP_JAMP(256) = TMP_JAMP(108) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(54) ! used 4 times
+      TMP_JAMP(255) = TMP_JAMP(107) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(54) ! used 4 times
+      TMP_JAMP(254) = TMP_JAMP(102) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(51) ! used 4 times
+      TMP_JAMP(253) = TMP_JAMP(101) - TMP_JAMP(91) ! used 4 times
+      TMP_JAMP(252) = TMP_JAMP(97) + TMP_JAMP(26) ! used 4 times
+      TMP_JAMP(251) = TMP_JAMP(97) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(71) ! used 4 times
+      TMP_JAMP(250) = TMP_JAMP(91) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(64) ! used 4 times
+      TMP_JAMP(249) = TMP_JAMP(91) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(56) ! used 4 times
+      TMP_JAMP(248) = TMP_JAMP(86) + AMP(134) ! used 4 times
+      TMP_JAMP(247) = TMP_JAMP(85) + TMP_JAMP(25) ! used 4 times
+      TMP_JAMP(246) = TMP_JAMP(85) + TMP_JAMP(23) ! used 4 times
+      TMP_JAMP(245) = TMP_JAMP(85) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(58) ! used 4 times
+      TMP_JAMP(244) = TMP_JAMP(84) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(437) ! used 4 times
+      TMP_JAMP(243) = TMP_JAMP(84) + TMP_JAMP(83) ! used 4 times
+      TMP_JAMP(242) = TMP_JAMP(84) - TMP_JAMP(69) ! used 4 times
+      TMP_JAMP(241) = TMP_JAMP(83) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(419) ! used 4 times
+      TMP_JAMP(240) = TMP_JAMP(83) + TMP_JAMP(69) ! used 4 times
+      TMP_JAMP(239) = TMP_JAMP(79) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(437) ! used 4 times
+      TMP_JAMP(238) = TMP_JAMP(79) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(150) ! used 4 times
+      TMP_JAMP(237) = TMP_JAMP(79) - TMP_JAMP(67) ! used 4 times
+      TMP_JAMP(236) = TMP_JAMP(78) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(419) ! used 4 times
+      TMP_JAMP(235) = TMP_JAMP(78) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(98) ! used 4 times
+      TMP_JAMP(234) = TMP_JAMP(78) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(138) ! used 4 times
+      TMP_JAMP(233) = TMP_JAMP(77) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(98) ! used 4 times
+      TMP_JAMP(232) = TMP_JAMP(77) - TMP_JAMP(76) ! used 4 times
+      TMP_JAMP(231) = TMP_JAMP(77) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(28) ! used 4 times
+      TMP_JAMP(230) = TMP_JAMP(76) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(85) ! used 4 times
+      TMP_JAMP(229) = TMP_JAMP(76) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(96) ! used 4 times
+      TMP_JAMP(228) = TMP_JAMP(72) - TMP_JAMP(71) ! used 4 times
+      TMP_JAMP(227) = TMP_JAMP(72) + TMP_JAMP(59) ! used 4 times
+      TMP_JAMP(226) = TMP_JAMP(71) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(103) ! used 4 times
+      TMP_JAMP(225) = TMP_JAMP(70) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(93) ! used 4 times
+      TMP_JAMP(224) = TMP_JAMP(70) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(87) ! used 4 times
+      TMP_JAMP(223) = TMP_JAMP(69) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(428) ! used 4 times
+      TMP_JAMP(222) = TMP_JAMP(67) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(428) ! used 4 times
+      TMP_JAMP(221) = TMP_JAMP(67) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(144) ! used 4 times
+      TMP_JAMP(220) = TMP_JAMP(64) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(122) ! used 4 times
+      TMP_JAMP(219) = TMP_JAMP(64) + TMP_JAMP(58) ! used 4 times
+      TMP_JAMP(218) = TMP_JAMP(63) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(96) ! used 4 times
+      TMP_JAMP(217) = TMP_JAMP(63) + TMP_JAMP(58) ! used 4 times
+      TMP_JAMP(216) = TMP_JAMP(63) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(115) ! used 4 times
+      TMP_JAMP(215) = TMP_JAMP(59) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(105) ! used 4 times
+      TMP_JAMP(214) = TMP_JAMP(58) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(117) ! used 4 times
+      TMP_JAMP(213) = TMP_JAMP(57) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(80) ! used 4 times
+      TMP_JAMP(212) = TMP_JAMP(57) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(79) ! used 4 times
+      TMP_JAMP(211) = TMP_JAMP(56) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(67) ! used 4 times
+      TMP_JAMP(210) = TMP_JAMP(55) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(75) ! used 4 times
+      TMP_JAMP(209) = TMP_JAMP(55) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(69) ! used 4 times
+      TMP_JAMP(208) = TMP_JAMP(54) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(134) ! used 4 times
+      TMP_JAMP(207) = TMP_JAMP(54) + TMP_JAMP(52) ! used 4 times
+      TMP_JAMP(206) = TMP_JAMP(53) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(78) ! used 4 times
+      TMP_JAMP(205) = TMP_JAMP(53) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(127) ! used 4 times
+      TMP_JAMP(204) = TMP_JAMP(52) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(129) ! used 4 times
+      TMP_JAMP(203) = TMP_JAMP(51) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(62) ! used 4 times
+      TMP_JAMP(202) = TMP_JAMP(50) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(49) ! used 4 times
+      TMP_JAMP(201) = TMP_JAMP(50) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(60) ! used 4 times
+      TMP_JAMP(200) = TMP_JAMP(49) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(57) ! used 4 times
+      TMP_JAMP(199) = TMP_JAMP(49) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(51) ! used 4 times
+      TMP_JAMP(198) = TMP_JAMP(48) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(109) ! used 4 times
+      TMP_JAMP(197) = TMP_JAMP(48) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(148) ! used 4 times
+      TMP_JAMP(196) = TMP_JAMP(47) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(29) ! used 4 times
+      TMP_JAMP(195) = TMP_JAMP(47) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(133) ! used 4 times
+      TMP_JAMP(194) = TMP_JAMP(46) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(121) ! used 4 times
+      TMP_JAMP(193) = TMP_JAMP(46) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(142) ! used 4 times
+      TMP_JAMP(192) = TMP_JAMP(28) - AMP(97) ! used 4 times
+      TMP_JAMP(191) = TMP_JAMP(25) - AMP(61) ! used 4 times
+      TMP_JAMP(190) = AMP(416) + AMP(451) ! used 4 times
+      TMP_JAMP(189) = AMP(350) - AMP(453) ! used 4 times
+      TMP_JAMP(188) = AMP(85) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(152) ! used 4 times
+      TMP_JAMP(187) = AMP(443) + AMP(466) ! used 4 times
+      TMP_JAMP(186) = AMP(368) - AMP(452) ! used 4 times
+      TMP_JAMP(185) = AMP(361) - AMP(468) ! used 4 times
+      TMP_JAMP(184) = AMP(110) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(152) ! used 4 times
+      TMP_JAMP(183) = AMP(434) + AMP(457) ! used 4 times
+      TMP_JAMP(182) = AMP(343) - AMP(459) ! used 4 times
+      TMP_JAMP(181) = AMP(379) - AMP(467) ! used 4 times
+      TMP_JAMP(180) = AMP(381) - AMP(458) ! used 4 times
+      TMP_JAMP(179) = AMP(97) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(141) ! used 4 times
+      TMP_JAMP(178) = AMP(117) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(141) ! used 4 times
+      TMP_JAMP(177) = AMP(407) + AMP(460) ! used 4 times
+      TMP_JAMP(176) = AMP(347) - AMP(462) ! used 4 times
+      TMP_JAMP(175) = AMP(386) - AMP(461) ! used 4 times
+      TMP_JAMP(174) = AMP(425) + AMP(448) ! used 4 times
+      TMP_JAMP(173) = AMP(345) - AMP(450) ! used 4 times
+      TMP_JAMP(172) = AMP(363) - AMP(449) ! used 4 times
+      TMP_JAMP(171) = AMP(398) + AMP(469) ! used 4 times
+      TMP_JAMP(170) = AMP(365) - AMP(471) ! used 4 times
+      TMP_JAMP(169) = AMP(383) - AMP(470) ! used 4 times
+      TMP_JAMP(168) = AMP(331) + AMP(334) ! used 4 times
+      TMP_JAMP(167) = AMP(325) - AMP(336) ! used 4 times
+      TMP_JAMP(166) = AMP(328) - AMP(335) ! used 4 times
+      TMP_JAMP(165) = AMP(136) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(141) ! used 4 times
+      TMP_JAMP(164) = AMP(266) + AMP(289) ! used 4 times
+      TMP_JAMP(163) = AMP(186) - AMP(291) ! used 4 times
+      TMP_JAMP(162) = AMP(172) + AMP(175) ! used 4 times
+      TMP_JAMP(161) = AMP(166) - AMP(177) ! used 4 times
+      TMP_JAMP(160) = AMP(239) + AMP(310) ! used 4 times
+      TMP_JAMP(159) = AMP(206) - AMP(312) ! used 4 times
+      TMP_JAMP(158) = AMP(204) - AMP(290) ! used 4 times
+      TMP_JAMP(157) = AMP(169) - AMP(176) ! used 4 times
+      TMP_JAMP(156) = AMP(248) + AMP(301) ! used 4 times
+      TMP_JAMP(155) = AMP(188) - AMP(303) ! used 4 times
+      TMP_JAMP(154) = AMP(224) - AMP(311) ! used 4 times
+      TMP_JAMP(153) = AMP(227) - AMP(302) ! used 4 times
+      TMP_JAMP(152) = AMP(275) + AMP(298) ! used 4 times
+      TMP_JAMP(151) = AMP(184) - AMP(300) ! used 4 times
+      TMP_JAMP(150) = AMP(222) - AMP(299) ! used 4 times
+      TMP_JAMP(149) = AMP(257) + AMP(292) ! used 4 times
+      TMP_JAMP(148) = AMP(191) - AMP(294) ! used 4 times
+      TMP_JAMP(147) = AMP(209) - AMP(293) ! used 4 times
+      TMP_JAMP(146) = AMP(284) + AMP(307) ! used 4 times
+      TMP_JAMP(145) = AMP(202) - AMP(309) ! used 4 times
+      TMP_JAMP(144) = AMP(220) - AMP(308) ! used 4 times
+      TMP_JAMP(143) = AMP(278) + AMP(313) ! used 4 times
+      TMP_JAMP(142) = AMP(260) - AMP(315) ! used 4 times
+      TMP_JAMP(141) = AMP(269) - AMP(314) ! used 4 times
+      TMP_JAMP(324) = TMP_JAMP(254) - TMP_JAMP(191) ! used 4 times
+      TMP_JAMP(323) = TMP_JAMP(234) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(86) ! used 4 times
+      TMP_JAMP(322) = TMP_JAMP(224) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(94) ! used 4 times
+      TMP_JAMP(321) = TMP_JAMP(221) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(94) ! used 4 times
+      TMP_JAMP(320) = TMP_JAMP(215) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(88) ! used 4 times
+      TMP_JAMP(319) = TMP_JAMP(212) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(92) ! used 4 times
+      TMP_JAMP(318) = TMP_JAMP(209) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(86) ! used 4 times
+      TMP_JAMP(317) = TMP_JAMP(204) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(92) ! used 4 times
+      TMP_JAMP(316) = TMP_JAMP(198) - TMP_JAMP(72) ! used 4 times
+      TMP_JAMP(315) = TMP_JAMP(197) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(102) ! used 4 times
+      TMP_JAMP(314) = TMP_JAMP(196) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(165) ! used 4 times
+      TMP_JAMP(313) = TMP_JAMP(193) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(92) ! used 4 times
+      TMP_JAMP(325) = TMP_JAMP(190) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(140) ! used 3 times
+      TMP_JAMP(531) = TMP_JAMP(325) + TMP_JAMP(189) ! used 2 times
+      TMP_JAMP(530) = TMP_JAMP(325) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(417) ! used 2 times
+      TMP_JAMP(529) = TMP_JAMP(324) - TMP_JAMP(245) ! used 2 times
+      TMP_JAMP(528) = TMP_JAMP(324) + TMP_JAMP(252) ! used 2 times
+      TMP_JAMP(527) = TMP_JAMP(322) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(253) ! used 2 times
+      TMP_JAMP(526) = TMP_JAMP(321) + TMP_JAMP(299) ! used 2 times
+      TMP_JAMP(525) = TMP_JAMP(321) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(259) ! used 2 times
+      TMP_JAMP(524) = TMP_JAMP(320) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(251) ! used 2 times
+      TMP_JAMP(523) = TMP_JAMP(320) - TMP_JAMP(314) ! used 2 times
+      TMP_JAMP(522) = TMP_JAMP(318) + TMP_JAMP(234) ! used 2 times
+      TMP_JAMP(521) = TMP_JAMP(317) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(249) ! used 2 times
+      TMP_JAMP(520) = TMP_JAMP(316) + TMP_JAMP(314) ! used 2 times
+      TMP_JAMP(519) = TMP_JAMP(316) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(160) ! used 2 times
+      TMP_JAMP(518) = TMP_JAMP(315) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(250) ! used 2 times
+      TMP_JAMP(517) = TMP_JAMP(315) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(256) ! used 2 times
+      TMP_JAMP(516) = TMP_JAMP(315) - TMP_JAMP(278) ! used 2 times
+      TMP_JAMP(515) = TMP_JAMP(314) + TMP_JAMP(214) ! used 2 times
+      TMP_JAMP(514) = TMP_JAMP(313) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(259) ! used 2 times
+      TMP_JAMP(513) = TMP_JAMP(313) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(255) ! used 2 times
+      TMP_JAMP(512) = TMP_JAMP(313) + TMP_JAMP(268) ! used 2 times
+      TMP_JAMP(511) = TMP_JAMP(312) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(174) ! used 2 times
+      TMP_JAMP(510) = TMP_JAMP(311) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(174) ! used 2 times
+      TMP_JAMP(509) = TMP_JAMP(306) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(171) ! used 2 times
+      TMP_JAMP(508) = TMP_JAMP(305) + TMP_JAMP(296) ! used 2 times
+      TMP_JAMP(507) = TMP_JAMP(302) - TMP_JAMP(239) ! used 2 times
+      TMP_JAMP(506) = TMP_JAMP(301) + TMP_JAMP(138) ! used 2 times
+      TMP_JAMP(505) = TMP_JAMP(296) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(176) ! used 2 times
+      TMP_JAMP(504) = TMP_JAMP(296) - TMP_JAMP(242) ! used 2 times
+      TMP_JAMP(503) = TMP_JAMP(293) - AMP(378) ! used 2 times
+      TMP_JAMP(502) = TMP_JAMP(291) + AMP(418) ! used 2 times
+      TMP_JAMP(501) = TMP_JAMP(291) - AMP(426) ! used 2 times
+      TMP_JAMP(500) = TMP_JAMP(290) + TMP_JAMP(136) ! used 2 times
+      TMP_JAMP(499) = TMP_JAMP(290) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(180) ! used 2 times
+      TMP_JAMP(498) = TMP_JAMP(288) - TMP_JAMP(267) ! used 2 times
+      TMP_JAMP(497) = TMP_JAMP(288) - TMP_JAMP(275) ! used 2 times
+      TMP_JAMP(496) = TMP_JAMP(285) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(157) ! used 2 times
+      TMP_JAMP(495) = TMP_JAMP(285) + TMP_JAMP(283) ! used 2 times
+      TMP_JAMP(494) = TMP_JAMP(278) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(146) ! used 2 times
+      TMP_JAMP(493) = TMP_JAMP(277) + TMP_JAMP(267) ! used 2 times
+      TMP_JAMP(492) = TMP_JAMP(273) + TMP_JAMP(124) ! used 2 times
+      TMP_JAMP(491) = TMP_JAMP(267) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(151) ! used 2 times
+      TMP_JAMP(490) = TMP_JAMP(262) - AMP(258) ! used 2 times
+      TMP_JAMP(489) = TMP_JAMP(261) - TMP_JAMP(123) ! used 2 times
+      TMP_JAMP(488) = TMP_JAMP(261) + TMP_JAMP(122) ! used 2 times
+      TMP_JAMP(487) = TMP_JAMP(261) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(153) ! used 2 times
+      TMP_JAMP(486) = TMP_JAMP(257) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(233) ! used 2 times
+      TMP_JAMP(485) = TMP_JAMP(256) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(205) ! used 2 times
+      TMP_JAMP(484) = TMP_JAMP(255) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(204) ! used 2 times
+      TMP_JAMP(483) = TMP_JAMP(250) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(216) ! used 2 times
+      TMP_JAMP(482) = TMP_JAMP(246) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(226) ! used 2 times
+      TMP_JAMP(481) = TMP_JAMP(246) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(219) ! used 2 times
+      TMP_JAMP(480) = TMP_JAMP(240) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(167) ! used 2 times
+      TMP_JAMP(479) = TMP_JAMP(239) - TMP_JAMP(203) ! used 2 times
+      TMP_JAMP(478) = TMP_JAMP(238) - AMP(436) ! used 2 times
+      TMP_JAMP(477) = TMP_JAMP(238) + TMP_JAMP(235) ! used 2 times
+      TMP_JAMP(476) = TMP_JAMP(234) + TMP_JAMP(213) ! used 2 times
+      TMP_JAMP(475) = TMP_JAMP(232) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(188) ! used 2 times
+      TMP_JAMP(474) = TMP_JAMP(231) + TMP_JAMP(225) ! used 2 times
+      TMP_JAMP(473) = TMP_JAMP(230) + TMP_JAMP(228) ! used 2 times
+      TMP_JAMP(472) = TMP_JAMP(229) - TMP_JAMP(217) ! used 2 times
+      TMP_JAMP(471) = TMP_JAMP(227) - TMP_JAMP(211) ! used 2 times
+      TMP_JAMP(470) = TMP_JAMP(226) + AMP(101) ! used 2 times
+      TMP_JAMP(469) = TMP_JAMP(225) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(184) ! used 2 times
+      TMP_JAMP(468) = TMP_JAMP(224) + AMP(82) ! used 2 times
+      TMP_JAMP(467) = TMP_JAMP(223) + AMP(427) ! used 2 times
+      TMP_JAMP(466) = TMP_JAMP(222) + TMP_JAMP(213) ! used 2 times
+      TMP_JAMP(465) = TMP_JAMP(220) - TMP_JAMP(218) ! used 2 times
+      TMP_JAMP(464) = TMP_JAMP(216) - TMP_JAMP(200) ! used 2 times
+      TMP_JAMP(463) = TMP_JAMP(216) - TMP_JAMP(194) ! used 2 times
+      TMP_JAMP(462) = TMP_JAMP(214) + TMP_JAMP(194) ! used 2 times
+      TMP_JAMP(461) = TMP_JAMP(213) - TMP_JAMP(211) ! used 2 times
+      TMP_JAMP(460) = TMP_JAMP(212) + TMP_JAMP(206) ! used 2 times
+      TMP_JAMP(459) = TMP_JAMP(210) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(184) ! used 2 times
+      TMP_JAMP(458) = TMP_JAMP(210) - TMP_JAMP(59) ! used 2 times
+      TMP_JAMP(457) = TMP_JAMP(208) - TMP_JAMP(206) ! used 2 times
+      TMP_JAMP(456) = TMP_JAMP(207) - TMP_JAMP(201) ! used 2 times
+      TMP_JAMP(455) = TMP_JAMP(205) - TMP_JAMP(201) ! used 2 times
+      TMP_JAMP(454) = TMP_JAMP(205) - TMP_JAMP(195) ! used 2 times
+      TMP_JAMP(453) = TMP_JAMP(204) + TMP_JAMP(195) ! used 2 times
+      TMP_JAMP(452) = TMP_JAMP(203) - AMP(397) ! used 2 times
+      TMP_JAMP(451) = TMP_JAMP(203) - TMP_JAMP(79) ! used 2 times
+      TMP_JAMP(450) = TMP_JAMP(202) + AMP(47) ! used 2 times
+      TMP_JAMP(449) = TMP_JAMP(202) + TMP_JAMP(200) ! used 2 times
+      TMP_JAMP(448) = TMP_JAMP(199) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(113) ! used 2 times
+      TMP_JAMP(447) = TMP_JAMP(199) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(107) ! used 2 times
+      TMP_JAMP(446) = TMP_JAMP(195) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(149) ! used 2 times
+      TMP_JAMP(445) = TMP_JAMP(194) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(156) ! used 2 times
+      TMP_JAMP(444) = TMP_JAMP(192) + TMP_JAMP(178) ! used 2 times
+      TMP_JAMP(443) = TMP_JAMP(189) + TMP_JAMP(186) ! used 2 times
+      TMP_JAMP(442) = TMP_JAMP(189) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(351) ! used 2 times
+      TMP_JAMP(441) = TMP_JAMP(187) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(332) ! used 2 times
+      TMP_JAMP(440) = TMP_JAMP(186) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(369) ! used 2 times
+      TMP_JAMP(439) = TMP_JAMP(186) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(367) ! used 2 times
+      TMP_JAMP(438) = TMP_JAMP(185) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(369) ! used 2 times
+      TMP_JAMP(437) = TMP_JAMP(185) + TMP_JAMP(181) ! used 2 times
+      TMP_JAMP(436) = TMP_JAMP(185) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(360) ! used 2 times
+      TMP_JAMP(435) = TMP_JAMP(183) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(435) ! used 2 times
+      TMP_JAMP(434) = TMP_JAMP(183) - TMP_JAMP(180) ! used 2 times
+      TMP_JAMP(433) = TMP_JAMP(182) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(351) ! used 2 times
+      TMP_JAMP(432) = TMP_JAMP(182) + TMP_JAMP(180) ! used 2 times
+      TMP_JAMP(431) = TMP_JAMP(182) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(342) ! used 2 times
+      TMP_JAMP(430) = TMP_JAMP(181) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(137) ! used 2 times
+      TMP_JAMP(429) = TMP_JAMP(180) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(380) ! used 2 times
+      TMP_JAMP(428) = TMP_JAMP(179) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(91) ! used 2 times
+      TMP_JAMP(427) = TMP_JAMP(177) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(406) ! used 2 times
+      TMP_JAMP(426) = TMP_JAMP(177) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(408) ! used 2 times
+      TMP_JAMP(425) = TMP_JAMP(176) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(346) ! used 2 times
+      TMP_JAMP(424) = TMP_JAMP(175) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(387) ! used 2 times
+      TMP_JAMP(423) = TMP_JAMP(175) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(385) ! used 2 times
+      TMP_JAMP(422) = TMP_JAMP(174) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(426) ! used 2 times
+      TMP_JAMP(421) = TMP_JAMP(174) - TMP_JAMP(172) ! used 2 times
+      TMP_JAMP(420) = TMP_JAMP(173) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(348) ! used 2 times
+      TMP_JAMP(419) = TMP_JAMP(173) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(344) ! used 2 times
+      TMP_JAMP(418) = TMP_JAMP(172) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(362) ! used 2 times
+      TMP_JAMP(417) = TMP_JAMP(171) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(397) ! used 2 times
+      TMP_JAMP(416) = TMP_JAMP(171) + TMP_JAMP(170) ! used 2 times
+      TMP_JAMP(415) = TMP_JAMP(170) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(366) ! used 2 times
+      TMP_JAMP(414) = TMP_JAMP(169) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(384) ! used 2 times
+      TMP_JAMP(413) = TMP_JAMP(169) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(382) ! used 2 times
+      TMP_JAMP(412) = TMP_JAMP(168) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(332) ! used 2 times
+      TMP_JAMP(411) = TMP_JAMP(168) + TMP_JAMP(167) ! used 2 times
+      TMP_JAMP(410) = TMP_JAMP(168) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(333) ! used 2 times
+      TMP_JAMP(409) = TMP_JAMP(167) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(327) ! used 2 times
+      TMP_JAMP(408) = TMP_JAMP(166) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(330) ! used 2 times
+      TMP_JAMP(407) = TMP_JAMP(166) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(329) ! used 2 times
+      TMP_JAMP(406) = TMP_JAMP(164) + TMP_JAMP(163) ! used 2 times
+      TMP_JAMP(405) = TMP_JAMP(163) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(185) ! used 2 times
+      TMP_JAMP(404) = TMP_JAMP(163) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(189) ! used 2 times
+      TMP_JAMP(403) = TMP_JAMP(162) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(174) ! used 2 times
+      TMP_JAMP(402) = TMP_JAMP(162) - TMP_JAMP(157) ! used 2 times
+      TMP_JAMP(401) = TMP_JAMP(161) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(168) ! used 2 times
+      TMP_JAMP(400) = TMP_JAMP(161) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(167) ! used 2 times
+      TMP_JAMP(399) = TMP_JAMP(160) + TMP_JAMP(159) ! used 2 times
+      TMP_JAMP(398) = TMP_JAMP(160) - TMP_JAMP(154) ! used 2 times
+      TMP_JAMP(397) = TMP_JAMP(159) + TMP_JAMP(154) ! used 2 times
+      TMP_JAMP(396) = TMP_JAMP(159) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(205) ! used 2 times
+      TMP_JAMP(395) = TMP_JAMP(158) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(207) ! used 2 times
+      TMP_JAMP(394) = TMP_JAMP(158) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(203) ! used 2 times
+      TMP_JAMP(393) = TMP_JAMP(157) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(170) ! used 2 times
+      TMP_JAMP(392) = TMP_JAMP(156) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(249) ! used 2 times
+      TMP_JAMP(391) = TMP_JAMP(156) - TMP_JAMP(153) ! used 2 times
+      TMP_JAMP(390) = TMP_JAMP(155) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(189) ! used 2 times
+      TMP_JAMP(389) = TMP_JAMP(155) + TMP_JAMP(153) ! used 2 times
+      TMP_JAMP(388) = TMP_JAMP(155) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(187) ! used 2 times
+      TMP_JAMP(387) = TMP_JAMP(154) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(223) ! used 2 times
+      TMP_JAMP(386) = TMP_JAMP(153) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(226) ! used 2 times
+      TMP_JAMP(385) = TMP_JAMP(152) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(171) ! used 2 times
+      TMP_JAMP(384) = TMP_JAMP(152) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(276) ! used 2 times
+      TMP_JAMP(383) = TMP_JAMP(151) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(183) ! used 2 times
+      TMP_JAMP(382) = TMP_JAMP(150) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(225) ! used 2 times
+      TMP_JAMP(381) = TMP_JAMP(150) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(221) ! used 2 times
+      TMP_JAMP(380) = TMP_JAMP(149) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(258) ! used 2 times
+      TMP_JAMP(379) = TMP_JAMP(149) - TMP_JAMP(147) ! used 2 times
+      TMP_JAMP(378) = TMP_JAMP(148) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(192) ! used 2 times
+      TMP_JAMP(377) = TMP_JAMP(148) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(190) ! used 2 times
+      TMP_JAMP(376) = TMP_JAMP(147) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(208) ! used 2 times
+      TMP_JAMP(375) = TMP_JAMP(146) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(173) ! used 2 times
+      TMP_JAMP(374) = TMP_JAMP(145) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(201) ! used 2 times
+      TMP_JAMP(373) = TMP_JAMP(145) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(210) ! used 2 times
+      TMP_JAMP(372) = TMP_JAMP(144) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(228) ! used 2 times
+      TMP_JAMP(371) = TMP_JAMP(144) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(219) ! used 2 times
+      TMP_JAMP(370) = TMP_JAMP(143) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(238) ! used 2 times
+      TMP_JAMP(369) = TMP_JAMP(143) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(277) ! used 2 times
+      TMP_JAMP(368) = TMP_JAMP(142) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(127) ! used 2 times
+      TMP_JAMP(367) = TMP_JAMP(142) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(256) ! used 2 times
+      TMP_JAMP(366) = TMP_JAMP(141) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(247) ! used 2 times
+      TMP_JAMP(365) = TMP_JAMP(141) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(268) ! used 2 times
+      TMP_JAMP(364) = TMP_JAMP(137) + AMP(399) ! used 2 times
+      TMP_JAMP(363) = TMP_JAMP(136) - AMP(364) ! used 2 times
+      TMP_JAMP(362) = TMP_JAMP(126) + AMP(267) ! used 2 times
+      TMP_JAMP(361) = TMP_JAMP(123) + AMP(285) ! used 2 times
+      TMP_JAMP(360) = TMP_JAMP(113) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(125) ! used 2 times
+      TMP_JAMP(359) = TMP_JAMP(112) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(113) ! used 2 times
+      TMP_JAMP(358) = TMP_JAMP(111) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(100) ! used 2 times
+      TMP_JAMP(357) = TMP_JAMP(110) - TMP_JAMP(105) ! used 2 times
+      TMP_JAMP(356) = TMP_JAMP(109) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(124) ! used 2 times
+      TMP_JAMP(355) = TMP_JAMP(108) - TMP_JAMP(89) ! used 2 times
+      TMP_JAMP(354) = TMP_JAMP(106) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(65) ! used 2 times
+      TMP_JAMP(353) = TMP_JAMP(104) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(92) ! used 2 times
+      TMP_JAMP(352) = TMP_JAMP(103) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(74) ! used 2 times
+      TMP_JAMP(351) = TMP_JAMP(101) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(76) ! used 2 times
+      TMP_JAMP(350) = TMP_JAMP(100) + TMP_JAMP(87) ! used 2 times
+      TMP_JAMP(349) = TMP_JAMP(98) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(46) ! used 2 times
+      TMP_JAMP(348) = TMP_JAMP(96) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(59) ! used 2 times
+      TMP_JAMP(347) = TMP_JAMP(95) + TMP_JAMP(87) ! used 2 times
+      TMP_JAMP(346) = TMP_JAMP(93) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(146) ! used 2 times
+      TMP_JAMP(345) = TMP_JAMP(90) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(64) ! used 2 times
+      TMP_JAMP(344) = TMP_JAMP(77) - AMP(94) ! used 2 times
+      TMP_JAMP(343) = TMP_JAMP(76) - AMP(95) ! used 2 times
+      TMP_JAMP(342) = TMP_JAMP(64) + AMP(112) ! used 2 times
+      TMP_JAMP(341) = TMP_JAMP(56) - AMP(77) ! used 2 times
+      TMP_JAMP(340) = TMP_JAMP(52) - AMP(58) ! used 2 times
+      TMP_JAMP(339) = TMP_JAMP(30) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(83) ! used 2 times
+      TMP_JAMP(338) = TMP_JAMP(20) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(73) ! used 2 times
+      TMP_JAMP(337) = TMP_JAMP(19) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(55) ! used 2 times
+      TMP_JAMP(336) = AMP(349) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(352) ! used 2 times
+      TMP_JAMP(335) = AMP(352) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(444) ! used 2 times
+      TMP_JAMP(334) = AMP(378) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(390) ! used 2 times
+      TMP_JAMP(333) = AMP(380) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(390) ! used 2 times
+      TMP_JAMP(332) = AMP(56) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(122) ! used 2 times
+      TMP_JAMP(331) = AMP(326) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(390) ! used 2 times
+      TMP_JAMP(330) = AMP(185) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(195) ! used 2 times
+      TMP_JAMP(329) = AMP(195) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(240) ! used 2 times
+      TMP_JAMP(328) = AMP(223) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(229) ! used 2 times
+      TMP_JAMP(327) = AMP(226) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(229) ! used 2 times
+      TMP_JAMP(326) = AMP(229) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(259) ! used 2 times
+      TMP_JAMP(578) = TMP_JAMP(525) - TMP_JAMP(450) ! used 2 times
+      TMP_JAMP(577) = TMP_JAMP(519) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(370) ! used 2 times
+      TMP_JAMP(576) = TMP_JAMP(516) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(369) ! used 2 times
+      TMP_JAMP(575) = TMP_JAMP(512) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(365) ! used 2 times
+      TMP_JAMP(574) = TMP_JAMP(509) - TMP_JAMP(364) ! used 2 times
+      TMP_JAMP(573) = TMP_JAMP(494) - TMP_JAMP(361) ! used 2 times
+      TMP_JAMP(572) = TMP_JAMP(486) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(468) ! used 2 times
+      TMP_JAMP(571) = TMP_JAMP(482) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(458) ! used 2 times
+      TMP_JAMP(570) = TMP_JAMP(481) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(449) ! used 2 times
+      TMP_JAMP(569) = TMP_JAMP(477) + TMP_JAMP(475) ! used 2 times
+      TMP_JAMP(568) = TMP_JAMP(474) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(428) ! used 2 times
+      TMP_JAMP(567) = TMP_JAMP(473) + TMP_JAMP(469) ! used 2 times
+      TMP_JAMP(566) = TMP_JAMP(472) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(444) ! used 2 times
+      TMP_JAMP(565) = TMP_JAMP(471) - TMP_JAMP(459) ! used 2 times
+      TMP_JAMP(564) = TMP_JAMP(465) + TMP_JAMP(343) ! used 2 times
+      TMP_JAMP(563) = TMP_JAMP(464) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(337) ! used 2 times
+      TMP_JAMP(562) = TMP_JAMP(460) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(351) ! used 2 times
+      TMP_JAMP(561) = TMP_JAMP(457) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(355) ! used 2 times
+      TMP_JAMP(560) = TMP_JAMP(456) - TMP_JAMP(447) ! used 2 times
+      TMP_JAMP(559) = TMP_JAMP(455) + TMP_JAMP(340) ! used 2 times
+      TMP_JAMP(558) = TMP_JAMP(446) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(367) ! used 2 times
+      TMP_JAMP(557) = TMP_JAMP(445) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(366) ! used 2 times
+      TMP_JAMP(556) = TMP_JAMP(426) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(297) ! used 2 times
+      TMP_JAMP(555) = TMP_JAMP(418) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(297) ! used 2 times
+      TMP_JAMP(554) = TMP_JAMP(410) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(306) ! used 2 times
+      TMP_JAMP(553) = TMP_JAMP(408) + TMP_JAMP(183) ! used 2 times
+      TMP_JAMP(552) = TMP_JAMP(407) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(297) ! used 2 times
+      TMP_JAMP(551) = TMP_JAMP(400) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(262) ! used 2 times
+      TMP_JAMP(550) = TMP_JAMP(394) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(270) ! used 2 times
+      TMP_JAMP(549) = TMP_JAMP(393) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(270) ! used 2 times
+      TMP_JAMP(548) = TMP_JAMP(384) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(268) ! used 2 times
+      TMP_JAMP(547) = TMP_JAMP(381) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(262) ! used 2 times
+      TMP_JAMP(546) = TMP_JAMP(375) - TMP_JAMP(162) ! used 2 times
+      TMP_JAMP(545) = TMP_JAMP(368) - TMP_JAMP(326) ! used 2 times
+      TMP_JAMP(544) = TMP_JAMP(362) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(164) ! used 2 times
+      TMP_JAMP(543) = TMP_JAMP(357) + TMP_JAMP(339) ! used 2 times
+      TMP_JAMP(542) = TMP_JAMP(354) - TMP_JAMP(260) ! used 2 times
+      TMP_JAMP(541) = TMP_JAMP(353) + TMP_JAMP(252) ! used 2 times
+      TMP_JAMP(540) = TMP_JAMP(352) - TMP_JAMP(247) ! used 2 times
+      TMP_JAMP(539) = TMP_JAMP(350) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(344) ! used 2 times
+      TMP_JAMP(538) = TMP_JAMP(349) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(323) ! used 2 times
+      TMP_JAMP(537) = TMP_JAMP(348) - TMP_JAMP(248) ! used 2 times
+      TMP_JAMP(536) = TMP_JAMP(347) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(342) ! used 2 times
+      TMP_JAMP(535) = TMP_JAMP(346) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(332) ! used 2 times
+      TMP_JAMP(534) = TMP_JAMP(341) + TMP_JAMP(318) ! used 2 times
+      TMP_JAMP(533) = TMP_JAMP(338) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(319) ! used 2 times
+      TMP_JAMP(532) = TMP_JAMP(335) - TMP_JAMP(187) ! used 2 times
+      TMP_JAMP(593) = TMP_JAMP(571) + TMP_JAMP(533) ! used 2 times
+      TMP_JAMP(592) = TMP_JAMP(570) - TMP_JAMP(535) ! used 2 times
+      TMP_JAMP(591) = TMP_JAMP(569) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(543) ! used 2 times
+      TMP_JAMP(590) = TMP_JAMP(568) - TMP_JAMP(524) ! used 2 times
+      TMP_JAMP(589) = TMP_JAMP(567) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(541) ! used 2 times
+      TMP_JAMP(588) = TMP_JAMP(566) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(539) ! used 2 times
+      TMP_JAMP(587) = TMP_JAMP(565) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(540) ! used 2 times
+      TMP_JAMP(586) = TMP_JAMP(564) + TMP_JAMP(527) ! used 2 times
+      TMP_JAMP(585) = TMP_JAMP(563) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * TMP_JAMP(529) ! used 2 times
+      TMP_JAMP(584) = TMP_JAMP(562) + TMP_JAMP(521) ! used 2 times
+      TMP_JAMP(583) = TMP_JAMP(561) + TMP_JAMP(534) ! used 2 times
+      TMP_JAMP(582) = TMP_JAMP(560) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(537) ! used 2 times
+      TMP_JAMP(581) = TMP_JAMP(559) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(528) ! used 2 times
+      TMP_JAMP(580) = TMP_JAMP(542) + ((0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(461) ! used 2 times
+      TMP_JAMP(579) = TMP_JAMP(538) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * TMP_JAMP(448) ! used 2 times
+      TMP_JAMP(638) = AMP(571) + AMP(576) ! used 16 times
+      TMP_JAMP(637) = AMP(556) - AMP(559) ! used 16 times
+      TMP_JAMP(636) = AMP(555) + AMP(557) ! used 16 times
+      TMP_JAMP(635) = AMP(554) - AMP(574) ! used 16 times
+      TMP_JAMP(634) = AMP(551) + AMP(553) ! used 16 times
+      TMP_JAMP(633) = AMP(575) + AMP(578) ! used 16 times
+      TMP_JAMP(632) = AMP(521) + AMP(524) ! used 16 times
+      TMP_JAMP(631) = AMP(520) + AMP(523) ! used 16 times
+      TMP_JAMP(630) = AMP(719) + AMP(721) ! used 16 times
+      TMP_JAMP(629) = AMP(715) - AMP(718) ! used 16 times
+      TMP_JAMP(628) = AMP(714) + AMP(716) ! used 16 times
+      TMP_JAMP(627) = AMP(681) + AMP(684) ! used 16 times
+      TMP_JAMP(626) = AMP(679) + AMP(682) ! used 16 times
+      TMP_JAMP(625) = AMP(720) - AMP(723) ! used 16 times
+      TMP_JAMP(624) = AMP(710) + AMP(712) ! used 16 times
+      TMP_JAMP(623) = AMP(709) + AMP(711) ! used 16 times
+      TMP_JAMP(622) = AMP(730) + AMP(735) ! used 16 times
+      TMP_JAMP(621) = AMP(713) - AMP(733) ! used 16 times
+      TMP_JAMP(620) = AMP(734) + AMP(737) ! used 16 times
+      TMP_JAMP(619) = AMP(680) + AMP(683) ! used 16 times
+      TMP_JAMP(618) = AMP(560) + AMP(562) ! used 16 times
+      TMP_JAMP(617) = AMP(522) + AMP(525) ! used 16 times
+      TMP_JAMP(616) = AMP(561) - AMP(564) ! used 16 times
+      TMP_JAMP(615) = AMP(550) + AMP(552) ! used 16 times
+      TMP_JAMP(614) = AMP(540) + AMP(543) ! used 16 times
+      TMP_JAMP(613) = AMP(539) + AMP(542) ! used 16 times
+      TMP_JAMP(612) = AMP(699) + AMP(702) ! used 16 times
+      TMP_JAMP(611) = AMP(697) + AMP(700) ! used 16 times
+      TMP_JAMP(610) = AMP(698) + AMP(701) ! used 16 times
+      TMP_JAMP(609) = AMP(538) + AMP(541) ! used 16 times
+      TMP_JAMP(608) = AMP(893) + AMP(896) ! used 16 times
+      TMP_JAMP(607) = AMP(889) + AMP(894) ! used 16 times
+      TMP_JAMP(606) = AMP(878) + AMP(880) ! used 16 times
+      TMP_JAMP(605) = AMP(840) + AMP(843) ! used 16 times
+      TMP_JAMP(604) = AMP(839) + AMP(842) ! used 16 times
+      TMP_JAMP(603) = AMP(879) - AMP(882) ! used 16 times
+      TMP_JAMP(602) = AMP(857) + AMP(860) ! used 16 times
+      TMP_JAMP(601) = AMP(856) + AMP(859) ! used 16 times
+      TMP_JAMP(600) = AMP(873) + AMP(875) ! used 16 times
+      TMP_JAMP(599) = AMP(858) + AMP(861) ! used 16 times
+      TMP_JAMP(598) = AMP(874) - AMP(877) ! used 16 times
+      TMP_JAMP(597) = AMP(838) + AMP(841) ! used 16 times
+      TMP_JAMP(596) = AMP(872) - AMP(892) ! used 16 times
+      TMP_JAMP(595) = AMP(868) + AMP(870) ! used 16 times
+      TMP_JAMP(594) = AMP(869) + AMP(871) ! used 16 times
+      TMP_JAMP(680) = TMP_JAMP(638) + AMP(579) ! used 16 times
+      TMP_JAMP(679) = TMP_JAMP(637) - AMP(566) ! used 16 times
+      TMP_JAMP(678) = TMP_JAMP(636) + AMP(573) ! used 16 times
+      TMP_JAMP(677) = TMP_JAMP(635) - AMP(577) ! used 16 times
+      TMP_JAMP(676) = TMP_JAMP(634) + AMP(567) ! used 16 times
+      TMP_JAMP(675) = TMP_JAMP(633) - AMP(634) ! used 16 times
+      TMP_JAMP(674) = TMP_JAMP(632) + AMP(636) ! used 16 times
+      TMP_JAMP(673) = TMP_JAMP(631) + AMP(565) ! used 16 times
+      TMP_JAMP(672) = TMP_JAMP(630) + AMP(731) ! used 16 times
+      TMP_JAMP(671) = TMP_JAMP(629) - AMP(725) ! used 16 times
+      TMP_JAMP(670) = TMP_JAMP(628) + AMP(732) ! used 16 times
+      TMP_JAMP(669) = TMP_JAMP(627) + AMP(722) ! used 16 times
+      TMP_JAMP(668) = TMP_JAMP(626) + AMP(724) ! used 16 times
+      TMP_JAMP(667) = TMP_JAMP(625) - AMP(728) ! used 16 times
+      TMP_JAMP(666) = TMP_JAMP(624) + AMP(726) ! used 16 times
+      TMP_JAMP(665) = TMP_JAMP(623) + AMP(729) ! used 16 times
+      TMP_JAMP(664) = TMP_JAMP(622) + AMP(738) ! used 16 times
+      TMP_JAMP(663) = TMP_JAMP(621) - AMP(736) ! used 16 times
+      TMP_JAMP(662) = TMP_JAMP(620) - AMP(793) ! used 16 times
+      TMP_JAMP(661) = TMP_JAMP(619) + AMP(795) ! used 16 times
+      TMP_JAMP(660) = TMP_JAMP(618) + AMP(572) ! used 16 times
+      TMP_JAMP(659) = TMP_JAMP(617) + AMP(563) ! used 16 times
+      TMP_JAMP(658) = TMP_JAMP(616) - AMP(569) ! used 16 times
+      TMP_JAMP(657) = TMP_JAMP(615) + AMP(570) ! used 16 times
+      TMP_JAMP(656) = TMP_JAMP(614) + AMP(558) ! used 16 times
+      TMP_JAMP(655) = TMP_JAMP(613) + AMP(635) ! used 16 times
+      TMP_JAMP(654) = TMP_JAMP(612) + AMP(717) ! used 16 times
+      TMP_JAMP(653) = TMP_JAMP(611) + AMP(727) ! used 16 times
+      TMP_JAMP(652) = TMP_JAMP(610) + AMP(794) ! used 16 times
+      TMP_JAMP(651) = TMP_JAMP(609) + AMP(568) ! used 16 times
+      TMP_JAMP(650) = TMP_JAMP(607) + AMP(897) ! used 16 times
+      TMP_JAMP(649) = TMP_JAMP(606) + AMP(890) ! used 16 times
+      TMP_JAMP(648) = TMP_JAMP(605) + AMP(881) ! used 16 times
+      TMP_JAMP(647) = TMP_JAMP(603) - AMP(887) ! used 16 times
+      TMP_JAMP(646) = TMP_JAMP(601) + AMP(886) ! used 16 times
+      TMP_JAMP(645) = TMP_JAMP(600) + AMP(891) ! used 16 times
+      TMP_JAMP(644) = TMP_JAMP(599) + AMP(876) ! used 16 times
+      TMP_JAMP(643) = TMP_JAMP(598) - AMP(884) ! used 16 times
+      TMP_JAMP(642) = TMP_JAMP(597) + AMP(883) ! used 16 times
+      TMP_JAMP(641) = TMP_JAMP(596) - AMP(895) ! used 16 times
+      TMP_JAMP(640) = TMP_JAMP(595) + AMP(888) ! used 16 times
+      TMP_JAMP(639) = TMP_JAMP(594) + AMP(885) ! used 16 times
+      TMP_JAMP(835) = TMP_JAMP(680) - TMP_JAMP(678) ! used 8 times
+      TMP_JAMP(834) = TMP_JAMP(680) + TMP_JAMP(660) ! used 8 times
+      TMP_JAMP(833) = TMP_JAMP(680) + TMP_JAMP(675) ! used 8 times
+      TMP_JAMP(832) = TMP_JAMP(679) - TMP_JAMP(676) ! used 8 times
+      TMP_JAMP(831) = TMP_JAMP(679) + TMP_JAMP(678) ! used 8 times
+      TMP_JAMP(830) = TMP_JAMP(679) + TMP_JAMP(671) ! used 8 times
+      TMP_JAMP(829) = TMP_JAMP(679) - TMP_JAMP(673) ! used 8 times
+      TMP_JAMP(828) = TMP_JAMP(678) + TMP_JAMP(670) ! used 8 times
+      TMP_JAMP(827) = TMP_JAMP(678) + TMP_JAMP(660) ! used 8 times
+      TMP_JAMP(826) = TMP_JAMP(677) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(622) ! used 8 times
+      TMP_JAMP(825) = TMP_JAMP(677) + TMP_JAMP(676) ! used 8 times
+      TMP_JAMP(824) = TMP_JAMP(677) - TMP_JAMP(675) ! used 8 times
+      TMP_JAMP(823) = TMP_JAMP(677) - TMP_JAMP(657) ! used 8 times
+      TMP_JAMP(822) = TMP_JAMP(677) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(623) ! used 8 times
+      TMP_JAMP(821) = TMP_JAMP(676) - TMP_JAMP(673) ! used 8 times
+      TMP_JAMP(820) = TMP_JAMP(676) + TMP_JAMP(657) ! used 8 times
+      TMP_JAMP(819) = TMP_JAMP(675) + TMP_JAMP(674) ! used 8 times
+      TMP_JAMP(818) = TMP_JAMP(675) - TMP_JAMP(655) ! used 8 times
+      TMP_JAMP(817) = TMP_JAMP(674) + TMP_JAMP(673) ! used 8 times
+      TMP_JAMP(816) = TMP_JAMP(674) + TMP_JAMP(655) ! used 8 times
+      TMP_JAMP(815) = TMP_JAMP(672) + TMP_JAMP(670) ! used 8 times
+      TMP_JAMP(814) = TMP_JAMP(672) + TMP_JAMP(664) ! used 8 times
+      TMP_JAMP(813) = TMP_JAMP(672) + TMP_JAMP(667) ! used 8 times
+      TMP_JAMP(812) = TMP_JAMP(671) - TMP_JAMP(668) ! used 8 times
+      TMP_JAMP(811) = TMP_JAMP(671) + TMP_JAMP(670) ! used 8 times
+      TMP_JAMP(810) = TMP_JAMP(671) - TMP_JAMP(666) ! used 8 times
+      TMP_JAMP(809) = TMP_JAMP(670) - TMP_JAMP(664) ! used 8 times
+      TMP_JAMP(808) = TMP_JAMP(669) + ((0.000000000000000D+00
      $ ,1.000000000000000D+00)) * AMP(678) ! used 8 times
-      TMP_JAMP(917) = TMP_JAMP(369) + ((-0.000000000000000D+00,
-     $ -1.000000000000000D+00)) * AMP(756) ! used 8 times
-      TMP_JAMP(916) = TMP_JAMP(371) + ((0.000000000000000D+00
-     $ ,1.000000000000000D+00)) * AMP(657) ! used 8 times
-      TMP_JAMP(915) = TMP_JAMP(371) - TMP_JAMP(369) ! used 8 times
-      TMP_JAMP(914) = TMP_JAMP(372) + ((0.000000000000000D+00
-     $ ,1.000000000000000D+00)) * AMP(1150) ! used 8 times
-      TMP_JAMP(913) = TMP_JAMP(373) + ((0.000000000000000D+00
-     $ ,1.000000000000000D+00)) * AMP(1062) ! used 8 times
-      TMP_JAMP(912) = TMP_JAMP(373) + TMP_JAMP(370) ! used 8 times
-      TMP_JAMP(911) = TMP_JAMP(386) + TMP_JAMP(369) ! used 8 times
-      TMP_JAMP(910) = TMP_JAMP(388) + TMP_JAMP(371) ! used 8 times
-      TMP_JAMP(909) = TMP_JAMP(398) - TMP_JAMP(277) ! used 8 times
-      TMP_JAMP(908) = TMP_JAMP(401) - TMP_JAMP(372) ! used 8 times
-      TMP_JAMP(907) = AMP(1462) - AMP(1466) ! used 8 times
-      TMP_JAMP(906) = AMP(771) + AMP(1580) ! used 8 times
-      TMP_JAMP(905) = AMP(768) - AMP(770) ! used 8 times
-      TMP_JAMP(904) = AMP(763) + AMP(771) ! used 8 times
-      TMP_JAMP(903) = AMP(751) + AMP(759) ! used 8 times
-      TMP_JAMP(902) = TMP_JAMP(366) + ((-0.000000000000000D+00,
-     $ -1.000000000000000D+00)) * AMP(763) ! used 8 times
-      TMP_JAMP(901) = TMP_JAMP(367) + TMP_JAMP(365) ! used 8 times
-      TMP_JAMP(900) = TMP_JAMP(368) + TMP_JAMP(365) ! used 8 times
-      TMP_JAMP(899) = TMP_JAMP(372) + TMP_JAMP(367) ! used 8 times
-      TMP_JAMP(898) = TMP_JAMP(374) + TMP_JAMP(364) ! used 8 times
-      TMP_JAMP(897) = TMP_JAMP(375) + TMP_JAMP(366) ! used 8 times
-      TMP_JAMP(896) = AMP(1225) + AMP(1591) ! used 8 times
-      TMP_JAMP(895) = AMP(430) - AMP(433) ! used 8 times
-      TMP_JAMP(894) = AMP(371) + AMP(432) ! used 8 times
-      TMP_JAMP(893) = AMP(359) - AMP(371) ! used 8 times
-      TMP_JAMP(892) = AMP(146) - AMP(1593) ! used 8 times
-      TMP_JAMP(891) = AMP(145) - AMP(158) ! used 8 times
-      TMP_JAMP(890) = TMP_JAMP(264) + ((0.000000000000000D+00,
-     $ -1.000000000000000D+00)) * AMP(359) ! used 8 times
-      TMP_JAMP(889) = TMP_JAMP(362) + TMP_JAMP(360) ! used 8 times
-      TMP_JAMP(888) = TMP_JAMP(363) + TMP_JAMP(361) ! used 8 times
-      TMP_JAMP(887) = TMP_JAMP(374) + TMP_JAMP(314) ! used 8 times
-      TMP_JAMP(886) = TMP_JAMP(374) - TMP_JAMP(360) ! used 8 times
-      TMP_JAMP(885) = TMP_JAMP(376) - TMP_JAMP(362) ! used 8 times
-      TMP_JAMP(884) = TMP_JAMP(384) + TMP_JAMP(264) ! used 8 times
-      TMP_JAMP(883) = TMP_JAMP(397) + TMP_JAMP(361) ! used 8 times
-      TMP_JAMP(882) = TMP_JAMP(402) + TMP_JAMP(363) ! used 8 times
-      TMP_JAMP(881) = TMP_JAMP(403) - TMP_JAMP(383) ! used 8 times
-      TMP_JAMP(880) = TMP_JAMP(404) - TMP_JAMP(384) ! used 8 times
-      TMP_JAMP(879) = AMP(1057) - AMP(1073) ! used 8 times
-      TMP_JAMP(878) = AMP(1055) - AMP(1057) ! used 8 times
-      TMP_JAMP(877) = AMP(781) - AMP(1592) ! used 8 times
-      TMP_JAMP(876) = AMP(672) - AMP(1058) ! used 8 times
-      TMP_JAMP(875) = AMP(661) + AMP(672) ! used 8 times
-      TMP_JAMP(874) = TMP_JAMP(261) + ((0.000000000000000D+00
-     $ ,1.000000000000000D+00)) * AMP(661) ! used 8 times
-      TMP_JAMP(873) = TMP_JAMP(358) + ((-0.000000000000000D+00
-     $ ,1.000000000000000D+00)) * AMP(781) ! used 8 times
-      TMP_JAMP(872) = TMP_JAMP(359) + ((-0.000000000000000D+00,
-     $ -1.000000000000000D+00)) * AMP(1055) ! used 8 times
-      TMP_JAMP(871) = TMP_JAMP(360) + TMP_JAMP(358) ! used 8 times
-      TMP_JAMP(870) = TMP_JAMP(361) + TMP_JAMP(359) ! used 8 times
-      TMP_JAMP(869) = TMP_JAMP(364) - TMP_JAMP(358) ! used 8 times
-      TMP_JAMP(868) = TMP_JAMP(390) - TMP_JAMP(369) ! used 8 times
-      TMP_JAMP(867) = TMP_JAMP(391) + TMP_JAMP(359) ! used 8 times
-      TMP_JAMP(866) = TMP_JAMP(392) - TMP_JAMP(371) ! used 8 times
-      TMP_JAMP(865) = AMP(785) + AMP(790) ! used 8 times
-      TMP_JAMP(864) = AMP(676) - AMP(792) ! used 8 times
-      TMP_JAMP(863) = AMP(429) + AMP(431) ! used 8 times
-      TMP_JAMP(862) = AMP(340) - AMP(456) ! used 8 times
-      TMP_JAMP(861) = AMP(338) + AMP(340) ! used 8 times
-      TMP_JAMP(860) = TMP_JAMP(355) + ((0.000000000000000D+00
-     $ ,1.000000000000000D+00)) * AMP(676) ! used 8 times
-      TMP_JAMP(859) = TMP_JAMP(355) - TMP_JAMP(354) ! used 8 times
-      TMP_JAMP(858) = TMP_JAMP(356) + ((-0.000000000000000D+00
-     $ ,1.000000000000000D+00)) * AMP(338) ! used 8 times
-      TMP_JAMP(857) = TMP_JAMP(357) + TMP_JAMP(354) ! used 8 times
-      TMP_JAMP(856) = TMP_JAMP(357) + TMP_JAMP(356) ! used 8 times
-      TMP_JAMP(855) = TMP_JAMP(362) - TMP_JAMP(358) ! used 8 times
-      TMP_JAMP(854) = TMP_JAMP(376) - TMP_JAMP(364) ! used 8 times
-      TMP_JAMP(853) = TMP_JAMP(399) + TMP_JAMP(277) ! used 8 times
-      TMP_JAMP(852) = TMP_JAMP(400) + TMP_JAMP(356) ! used 8 times
-      TMP_JAMP(851) = AMP(1473) + AMP(1474) ! used 8 times
-      TMP_JAMP(850) = AMP(1134) + AMP(1135) ! used 8 times
-      TMP_JAMP(849) = AMP(1129) + AMP(1137) ! used 8 times
-      TMP_JAMP(848) = AMP(123) - AMP(1476) ! used 8 times
-      TMP_JAMP(847) = AMP(120) + AMP(1527) ! used 8 times
-      TMP_JAMP(846) = AMP(118) + AMP(123) ! used 8 times
-      TMP_JAMP(845) = AMP(114) + AMP(120) ! used 8 times
-      TMP_JAMP(844) = TMP_JAMP(254) + ((-0.000000000000000D+00,
-     $ -1.000000000000000D+00)) * AMP(118) ! used 8 times
-      TMP_JAMP(843) = TMP_JAMP(350) + ((0.000000000000000D+00,
-     $ -1.000000000000000D+00)) * AMP(114) ! used 8 times
-      TMP_JAMP(842) = TMP_JAMP(351) + TMP_JAMP(350) ! used 8 times
-      TMP_JAMP(841) = TMP_JAMP(353) + TMP_JAMP(352) ! used 8 times
-      TMP_JAMP(840) = TMP_JAMP(354) + TMP_JAMP(353) ! used 8 times
-      TMP_JAMP(839) = TMP_JAMP(355) - TMP_JAMP(352) ! used 8 times
-      TMP_JAMP(838) = TMP_JAMP(360) + TMP_JAMP(254) ! used 8 times
-      TMP_JAMP(837) = TMP_JAMP(394) + TMP_JAMP(350) ! used 8 times
-      TMP_JAMP(836) = TMP_JAMP(398) + TMP_JAMP(351) ! used 8 times
-      TMP_JAMP(835) = AMP(1228) + AMP(1720) ! used 8 times
-      TMP_JAMP(834) = AMP(1053) + AMP(1059) ! used 8 times
-      TMP_JAMP(833) = AMP(519) - AMP(1722) ! used 8 times
-      TMP_JAMP(832) = AMP(503) + AMP(1716) ! used 8 times
-      TMP_JAMP(831) = AMP(500) + AMP(503) ! used 8 times
-      TMP_JAMP(830) = TMP_JAMP(346) + ((0.000000000000000D+00
+      TMP_JAMP(807) = TMP_JAMP(669) - TMP_JAMP(668) ! used 8 times
+      TMP_JAMP(806) = TMP_JAMP(669) - TMP_JAMP(667) ! used 8 times
+      TMP_JAMP(805) = TMP_JAMP(669) + TMP_JAMP(661) ! used 8 times
+      TMP_JAMP(804) = TMP_JAMP(669) + ((-0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(677) ! used 8 times
+      TMP_JAMP(803) = TMP_JAMP(668) - TMP_JAMP(666) ! used 8 times
+      TMP_JAMP(802) = TMP_JAMP(668) + TMP_JAMP(661) ! used 8 times
+      TMP_JAMP(801) = TMP_JAMP(667) - TMP_JAMP(665) ! used 8 times
+      TMP_JAMP(800) = TMP_JAMP(667) + TMP_JAMP(647) ! used 8 times
+      TMP_JAMP(799) = TMP_JAMP(667) - TMP_JAMP(653) ! used 8 times
+      TMP_JAMP(798) = TMP_JAMP(666) + TMP_JAMP(665) ! used 8 times
+      TMP_JAMP(797) = TMP_JAMP(665) - TMP_JAMP(653) ! used 8 times
+      TMP_JAMP(796) = TMP_JAMP(664) + TMP_JAMP(662) ! used 8 times
+      TMP_JAMP(795) = TMP_JAMP(663) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(781) ! used 8 times
+      TMP_JAMP(794) = TMP_JAMP(663) - TMP_JAMP(662) ! used 8 times
+      TMP_JAMP(793) = TMP_JAMP(663) + ((-0.000000000000000D+00
+     $ ,1.000000000000000D+00)) * AMP(782) ! used 8 times
+      TMP_JAMP(792) = TMP_JAMP(662) + TMP_JAMP(661) ! used 8 times
+      TMP_JAMP(791) = TMP_JAMP(662) - TMP_JAMP(652) ! used 8 times
+      TMP_JAMP(790) = TMP_JAMP(661) + TMP_JAMP(652) ! used 8 times
+      TMP_JAMP(789) = TMP_JAMP(660) + TMP_JAMP(658) ! used 8 times
+      TMP_JAMP(788) = TMP_JAMP(659) + ((0.000000000000000D+00
      $ ,1.000000000000000D+00)) * AMP(519) ! used 8 times
-      TMP_JAMP(829) = TMP_JAMP(347) + ((0.000000000000000D+00
-     $ ,1.000000000000000D+00)) * AMP(500) ! used 8 times
-      TMP_JAMP(828) = TMP_JAMP(347) + TMP_JAMP(346) ! used 8 times
-      TMP_JAMP(827) = TMP_JAMP(348) + TMP_JAMP(346) ! used 8 times
-      TMP_JAMP(826) = TMP_JAMP(349) - TMP_JAMP(261) ! used 8 times
-      TMP_JAMP(825) = TMP_JAMP(349) + TMP_JAMP(347) ! used 8 times
-      TMP_JAMP(824) = TMP_JAMP(349) + TMP_JAMP(348) ! used 8 times
-      TMP_JAMP(823) = TMP_JAMP(362) - TMP_JAMP(348) ! used 8 times
-      TMP_JAMP(822) = TMP_JAMP(386) + TMP_JAMP(292) ! used 8 times
-      TMP_JAMP(821) = TMP_JAMP(388) + TMP_JAMP(347) ! used 8 times
-      TMP_JAMP(820) = AMP(1471) - AMP(1475) ! used 8 times
-      TMP_JAMP(819) = AMP(612) + AMP(1526) ! used 8 times
-      TMP_JAMP(818) = AMP(609) - AMP(611) ! used 8 times
-      TMP_JAMP(817) = AMP(604) + AMP(612) ! used 8 times
-      TMP_JAMP(816) = TMP_JAMP(344) + ((-0.000000000000000D+00,
-     $ -1.000000000000000D+00)) * AMP(604) ! used 8 times
-      TMP_JAMP(815) = TMP_JAMP(345) + TMP_JAMP(343) ! used 8 times
-      TMP_JAMP(814) = TMP_JAMP(346) + TMP_JAMP(343) ! used 8 times
-      TMP_JAMP(813) = TMP_JAMP(348) + TMP_JAMP(345) ! used 8 times
-      TMP_JAMP(812) = TMP_JAMP(350) + TMP_JAMP(344) ! used 8 times
-      TMP_JAMP(811) = TMP_JAMP(387) + TMP_JAMP(344) ! used 8 times
-      TMP_JAMP(810) = AMP(1305) + AMP(1669) ! used 8 times
-      TMP_JAMP(809) = AMP(536) - AMP(1671) ! used 8 times
-      TMP_JAMP(808) = AMP(535) - AMP(632) ! used 8 times
-      TMP_JAMP(807) = AMP(389) + AMP(423) ! used 8 times
-      TMP_JAMP(806) = AMP(377) - AMP(389) ! used 8 times
-      TMP_JAMP(805) = AMP(374) + AMP(376) ! used 8 times
-      TMP_JAMP(804) = TMP_JAMP(240) + ((-0.000000000000000D+00,
-     $ -1.000000000000000D+00)) * AMP(377) ! used 8 times
-      TMP_JAMP(803) = TMP_JAMP(341) - TMP_JAMP(340) ! used 8 times
-      TMP_JAMP(802) = TMP_JAMP(342) + ((-0.000000000000000D+00,
-     $ -1.000000000000000D+00)) * AMP(1305) ! used 8 times
-      TMP_JAMP(801) = TMP_JAMP(342) + TMP_JAMP(339) ! used 8 times
-      TMP_JAMP(800) = TMP_JAMP(381) + TMP_JAMP(340) ! used 8 times
-      TMP_JAMP(799) = TMP_JAMP(382) + TMP_JAMP(341) ! used 8 times
-      TMP_JAMP(798) = TMP_JAMP(386) + TMP_JAMP(339) ! used 8 times
-      TMP_JAMP(797) = TMP_JAMP(390) + TMP_JAMP(342) ! used 8 times
-      TMP_JAMP(796) = TMP_JAMP(403) - TMP_JAMP(390) ! used 8 times
-      TMP_JAMP(795) = TMP_JAMP(404) + TMP_JAMP(240) ! used 8 times
-      TMP_JAMP(794) = AMP(1291) - AMP(1886) ! used 8 times
-      TMP_JAMP(793) = AMP(1290) + AMP(1291) ! used 8 times
-      TMP_JAMP(792) = AMP(769) - AMP(1292) ! used 8 times
-      TMP_JAMP(791) = AMP(706) + AMP(769) ! used 8 times
-      TMP_JAMP(790) = AMP(695) - AMP(1670) ! used 8 times
-      TMP_JAMP(789) = TMP_JAMP(237) + ((-0.000000000000000D+00,
-     $ -1.000000000000000D+00)) * AMP(706) ! used 8 times
-      TMP_JAMP(788) = TMP_JAMP(338) + ((-0.000000000000000D+00
-     $ ,1.000000000000000D+00)) * AMP(1290) ! used 8 times
-      TMP_JAMP(787) = TMP_JAMP(339) + TMP_JAMP(337) ! used 8 times
-      TMP_JAMP(786) = TMP_JAMP(340) - TMP_JAMP(338) ! used 8 times
-      TMP_JAMP(785) = TMP_JAMP(369) + TMP_JAMP(337) ! used 8 times
-      TMP_JAMP(784) = TMP_JAMP(377) - TMP_JAMP(366) ! used 8 times
-      TMP_JAMP(783) = TMP_JAMP(378) + TMP_JAMP(338) ! used 8 times
-      TMP_JAMP(782) = AMP(694) - AMP(791) ! used 8 times
-      TMP_JAMP(781) = AMP(373) + AMP(375) ! used 8 times
-      TMP_JAMP(780) = TMP_JAMP(342) - TMP_JAMP(337) ! used 8 times
-      TMP_JAMP(779) = TMP_JAMP(355) + TMP_JAMP(335) ! used 8 times
-      TMP_JAMP(778) = TMP_JAMP(356) + TMP_JAMP(336) ! used 8 times
-      TMP_JAMP(777) = AMP(1300) - AMP(1859) ! used 8 times
-      TMP_JAMP(776) = AMP(1299) + AMP(1300) ! used 8 times
-      TMP_JAMP(775) = AMP(610) - AMP(1301) ! used 8 times
-      TMP_JAMP(774) = AMP(547) + AMP(610) ! used 8 times
-      TMP_JAMP(773) = TMP_JAMP(233) + ((-0.000000000000000D+00,
-     $ -1.000000000000000D+00)) * AMP(547) ! used 8 times
-      TMP_JAMP(772) = TMP_JAMP(334) + ((-0.000000000000000D+00
-     $ ,1.000000000000000D+00)) * AMP(1299) ! used 8 times
-      TMP_JAMP(771) = TMP_JAMP(335) - TMP_JAMP(334) ! used 8 times
-      TMP_JAMP(770) = TMP_JAMP(339) - TMP_JAMP(233) ! used 8 times
-      TMP_JAMP(769) = TMP_JAMP(344) - TMP_JAMP(233) ! used 8 times
-      TMP_JAMP(768) = TMP_JAMP(351) - TMP_JAMP(344) ! used 8 times
-      TMP_JAMP(767) = TMP_JAMP(352) + TMP_JAMP(334) ! used 8 times
-      TMP_JAMP(766) = AMP(1304) + AMP(1642) ! used 8 times
-      TMP_JAMP(765) = AMP(1285) + AMP(1293) ! used 8 times
-      TMP_JAMP(764) = AMP(141) - AMP(1644) ! used 8 times
-      TMP_JAMP(763) = AMP(107) + AMP(1635) ! used 8 times
-      TMP_JAMP(762) = AMP(104) + AMP(107) ! used 8 times
-      TMP_JAMP(761) = TMP_JAMP(330) + ((-0.000000000000000D+00,
-     $ -1.000000000000000D+00)) * AMP(104) ! used 8 times
-      TMP_JAMP(760) = TMP_JAMP(331) + ((-0.000000000000000D+00,
-     $ -1.000000000000000D+00)) * AMP(141) ! used 8 times
-      TMP_JAMP(759) = TMP_JAMP(331) + TMP_JAMP(314) ! used 8 times
-      TMP_JAMP(758) = TMP_JAMP(332) - TMP_JAMP(237) ! used 8 times
-      TMP_JAMP(757) = TMP_JAMP(333) + TMP_JAMP(332) ! used 8 times
-      TMP_JAMP(756) = TMP_JAMP(342) - TMP_JAMP(333) ! used 8 times
-      TMP_JAMP(755) = TMP_JAMP(375) + TMP_JAMP(330) ! used 8 times
-      TMP_JAMP(754) = AMP(1302) - AMP(1639) ! used 8 times
-      TMP_JAMP(753) = AMP(1294) + AMP(1302) ! used 8 times
-      TMP_JAMP(752) = AMP(116) + AMP(119) ! used 8 times
-      TMP_JAMP(751) = TMP_JAMP(329) + ((-0.000000000000000D+00,
-     $ -1.000000000000000D+00)) * AMP(1294) ! used 8 times
-      TMP_JAMP(750) = TMP_JAMP(329) - TMP_JAMP(233) ! used 8 times
-      TMP_JAMP(749) = TMP_JAMP(333) - TMP_JAMP(329) ! used 8 times
-      TMP_JAMP(748) = TMP_JAMP(350) + TMP_JAMP(328) ! used 8 times
-      TMP_JAMP(747) = AMP(942) + AMP(946) ! used 8 times
-      TMP_JAMP(746) = AMP(837) - AMP(1775) ! used 8 times
-      TMP_JAMP(745) = AMP(833) - AMP(948) ! used 8 times
-      TMP_JAMP(744) = AMP(831) - AMP(1049) ! used 8 times
-      TMP_JAMP(743) = AMP(821) + AMP(1769) ! used 8 times
-      TMP_JAMP(742) = AMP(820) + AMP(831) ! used 8 times
-      TMP_JAMP(741) = AMP(818) + AMP(821) ! used 8 times
-      TMP_JAMP(740) = TMP_JAMP(223) + ((-0.000000000000000D+00
-     $ ,1.000000000000000D+00)) * AMP(820) ! used 8 times
-      TMP_JAMP(739) = TMP_JAMP(325) + ((-0.000000000000000D+00
-     $ ,1.000000000000000D+00)) * AMP(818) ! used 8 times
-      TMP_JAMP(738) = TMP_JAMP(325) + TMP_JAMP(324) ! used 8 times
-      TMP_JAMP(737) = TMP_JAMP(327) - TMP_JAMP(326) ! used 8 times
-      TMP_JAMP(736) = TMP_JAMP(368) + TMP_JAMP(324) ! used 8 times
-      TMP_JAMP(735) = TMP_JAMP(370) + TMP_JAMP(325) ! used 8 times
-      TMP_JAMP(734) = TMP_JAMP(392) - TMP_JAMP(223) ! used 8 times
-      TMP_JAMP(733) = TMP_JAMP(399) - TMP_JAMP(327) ! used 8 times
-      TMP_JAMP(732) = TMP_JAMP(403) + TMP_JAMP(326) ! used 8 times
-      TMP_JAMP(731) = AMP(929) + AMP(1805) ! used 8 times
-      TMP_JAMP(730) = AMP(928) - AMP(1214) ! used 8 times
-      TMP_JAMP(729) = AMP(927) - AMP(929) ! used 8 times
-      TMP_JAMP(728) = AMP(865) + AMP(928) ! used 8 times
-      TMP_JAMP(727) = AMP(851) - AMP(947) ! used 8 times
-      TMP_JAMP(726) = TMP_JAMP(220) + ((0.000000000000000D+00,
-     $ -1.000000000000000D+00)) * AMP(865) ! used 8 times
-      TMP_JAMP(725) = TMP_JAMP(322) + ((0.000000000000000D+00
-     $ ,1.000000000000000D+00)) * AMP(927) ! used 8 times
-      TMP_JAMP(724) = TMP_JAMP(324) + TMP_JAMP(322) ! used 8 times
-      TMP_JAMP(723) = TMP_JAMP(327) + TMP_JAMP(323) ! used 8 times
-      TMP_JAMP(722) = TMP_JAMP(365) + TMP_JAMP(322) ! used 8 times
-      TMP_JAMP(721) = TMP_JAMP(377) - TMP_JAMP(220) ! used 8 times
-      TMP_JAMP(720) = TMP_JAMP(383) - TMP_JAMP(323) ! used 8 times
-      TMP_JAMP(719) = AMP(855) - AMP(1721) ! used 8 times
-      TMP_JAMP(718) = AMP(816) + AMP(822) ! used 8 times
-      TMP_JAMP(717) = TMP_JAMP(326) + TMP_JAMP(323) ! used 8 times
-      TMP_JAMP(716) = TMP_JAMP(346) + TMP_JAMP(320) ! used 8 times
-      TMP_JAMP(715) = TMP_JAMP(347) + TMP_JAMP(321) ! used 8 times
-      TMP_JAMP(714) = AMP(938) + AMP(1751) ! used 8 times
-      TMP_JAMP(713) = AMP(936) - AMP(938) ! used 8 times
-      TMP_JAMP(712) = AMP(847) + AMP(937) ! used 8 times
-      TMP_JAMP(711) = TMP_JAMP(319) + ((0.000000000000000D+00
-     $ ,1.000000000000000D+00)) * AMP(936) ! used 8 times
-      TMP_JAMP(710) = TMP_JAMP(320) + TMP_JAMP(319) ! used 8 times
-      TMP_JAMP(709) = TMP_JAMP(343) + TMP_JAMP(319) ! used 8 times
-      TMP_JAMP(708) = TMP_JAMP(351) - TMP_JAMP(215) ! used 8 times
-      TMP_JAMP(707) = AMP(941) - AMP(1643) ! used 8 times
-      TMP_JAMP(706) = AMP(930) + AMP(1634) ! used 8 times
-      TMP_JAMP(705) = AMP(922) + AMP(930) !
used 8 times - TMP_JAMP(704) = TMP_JAMP(317) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(922) ! used 8 times - TMP_JAMP(703) = TMP_JAMP(330) + TMP_JAMP(317) ! used 8 times - TMP_JAMP(702) = TMP_JAMP(331) + TMP_JAMP(318) ! used 8 times - TMP_JAMP(701) = AMP(939) + AMP(1640) ! used 8 times - TMP_JAMP(700) = AMP(931) + AMP(939) ! used 8 times - TMP_JAMP(699) = TMP_JAMP(316) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(931) ! used 8 times - TMP_JAMP(698) = TMP_JAMP(318) - TMP_JAMP(316) ! used 8 times - TMP_JAMP(697) = TMP_JAMP(328) + TMP_JAMP(316) ! used 8 times - TMP_JAMP(696) = AMP(1435) + AMP(1852) ! used 8 times - TMP_JAMP(695) = AMP(1434) + AMP(1435) ! used 8 times - TMP_JAMP(694) = AMP(1113) - AMP(1507) ! used 8 times - TMP_JAMP(693) = AMP(1111) - AMP(1854) ! used 8 times - TMP_JAMP(692) = AMP(1110) + AMP(1111) ! used 8 times - TMP_JAMP(691) = AMP(1105) + AMP(1113) ! used 8 times - TMP_JAMP(690) = AMP(81) - AMP(1437) ! used 8 times - TMP_JAMP(689) = AMP(72) + AMP(1509) ! used 8 times - TMP_JAMP(688) = AMP(70) + AMP(81) ! used 8 times - TMP_JAMP(687) = AMP(66) + AMP(72) ! used 8 times - TMP_JAMP(686) = TMP_JAMP(254) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(70) ! used 8 times - TMP_JAMP(685) = TMP_JAMP(350) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(66) ! used 8 times - TMP_JAMP(684) = TMP_JAMP(351) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1105) ! used 8 times - TMP_JAMP(683) = TMP_JAMP(352) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1110) ! used 8 times - TMP_JAMP(682) = TMP_JAMP(353) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1434) ! used 8 times - TMP_JAMP(681) = TMP_JAMP(356) - TMP_JAMP(352) ! used 8 times - TMP_JAMP(680) = TMP_JAMP(357) - TMP_JAMP(353) ! used 8 times - TMP_JAMP(679) = TMP_JAMP(395) + TMP_JAMP(350) ! used 8 times - TMP_JAMP(678) = TMP_JAMP(396) + TMP_JAMP(361) ! used 8 times - TMP_JAMP(677) = TMP_JAMP(401) + TMP_JAMP(351) ! used 8 times - TMP_JAMP(676) = TMP_JAMP(405) - TMP_JAMP(363) ! used 8 times - TMP_JAMP(675) = AMP(1436) - AMP(1744) ! used 8 times - TMP_JAMP(674) = AMP(1432) - AMP(1436) ! used 8 times - TMP_JAMP(673) = AMP(588) + AMP(1508) ! used 8 times - TMP_JAMP(672) = AMP(587) + AMP(1746) ! used 8 times - TMP_JAMP(671) = AMP(585) - AMP(587) ! used 8 times - TMP_JAMP(670) = AMP(580) + AMP(588) ! used 8 times - TMP_JAMP(669) = TMP_JAMP(343) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(585) ! used 8 times - TMP_JAMP(668) = TMP_JAMP(344) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(580) ! used 8 times - TMP_JAMP(667) = TMP_JAMP(345) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1432) ! used 8 times - TMP_JAMP(666) = TMP_JAMP(347) - TMP_JAMP(343) ! used 8 times - TMP_JAMP(665) = TMP_JAMP(349) - TMP_JAMP(345) ! used 8 times - TMP_JAMP(664) = TMP_JAMP(389) + TMP_JAMP(344) ! used 8 times - TMP_JAMP(663) = TMP_JAMP(393) - TMP_JAMP(359) ! used 8 times - TMP_JAMP(662) = AMP(1279) - AMP(1853) ! used 8 times - TMP_JAMP(661) = AMP(1278) + AMP(1279) ! used 8 times - TMP_JAMP(660) = AMP(586) - AMP(1280) ! used 8 times - TMP_JAMP(659) = AMP(544) + AMP(586) ! used 8 times - TMP_JAMP(658) = TMP_JAMP(233) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(544) ! used 8 times - TMP_JAMP(657) = TMP_JAMP(334) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1278) ! used 8 times - TMP_JAMP(656) = TMP_JAMP(336) - TMP_JAMP(334) ! 
used 8 times - TMP_JAMP(655) = TMP_JAMP(380) + TMP_JAMP(340) ! used 8 times - TMP_JAMP(654) = TMP_JAMP(385) - TMP_JAMP(341) ! used 8 times - TMP_JAMP(653) = AMP(1281) - AMP(1624) ! used 8 times - TMP_JAMP(652) = AMP(1273) + AMP(1281) ! used 8 times - TMP_JAMP(651) = AMP(71) + AMP(1626) ! used 8 times - TMP_JAMP(650) = AMP(68) + AMP(71) ! used 8 times - TMP_JAMP(649) = TMP_JAMP(328) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(68) ! used 8 times - TMP_JAMP(648) = TMP_JAMP(329) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1273) ! used 8 times - TMP_JAMP(647) = TMP_JAMP(330) + TMP_JAMP(283) ! used 8 times - TMP_JAMP(646) = TMP_JAMP(330) + TMP_JAMP(328) ! used 8 times - TMP_JAMP(645) = TMP_JAMP(332) + TMP_JAMP(329) ! used 8 times - TMP_JAMP(644) = TMP_JAMP(379) - TMP_JAMP(338) ! used 8 times - TMP_JAMP(643) = AMP(917) + AMP(1745) ! used 8 times - TMP_JAMP(642) = AMP(916) - AMP(1112) ! used 8 times - TMP_JAMP(641) = AMP(915) - AMP(917) ! used 8 times - TMP_JAMP(640) = AMP(844) + AMP(916) ! used 8 times - TMP_JAMP(639) = TMP_JAMP(215) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(844) ! used 8 times - TMP_JAMP(638) = TMP_JAMP(319) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(915) ! used 8 times - TMP_JAMP(637) = TMP_JAMP(321) - TMP_JAMP(319) ! used 8 times - TMP_JAMP(636) = TMP_JAMP(372) - TMP_JAMP(324) ! used 8 times - TMP_JAMP(635) = TMP_JAMP(373) - TMP_JAMP(325) ! used 8 times - TMP_JAMP(634) = AMP(918) + AMP(1625) ! used 8 times - TMP_JAMP(633) = AMP(910) + AMP(918) ! used 8 times - TMP_JAMP(632) = TMP_JAMP(316) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(910) ! used 8 times - TMP_JAMP(631) = TMP_JAMP(317) + TMP_JAMP(316) ! used 8 times - TMP_JAMP(630) = TMP_JAMP(367) - TMP_JAMP(322) ! used 8 times - TMP_JAMP(629) = AMP(1303) + AMP(1645) ! used 8 times - TMP_JAMP(628) = AMP(140) - AMP(1647) ! used 8 times - TMP_JAMP(627) = TMP_JAMP(328) + TMP_JAMP(254) ! used 8 times - TMP_JAMP(626) = TMP_JAMP(331) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(140) ! used 8 times - TMP_JAMP(625) = TMP_JAMP(333) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1303) ! used 8 times - TMP_JAMP(624) = TMP_JAMP(353) - TMP_JAMP(334) ! used 8 times - TMP_JAMP(623) = TMP_JAMP(357) - TMP_JAMP(336) ! used 8 times - TMP_JAMP(622) = TMP_JAMP(397) + TMP_JAMP(331) ! used 8 times - TMP_JAMP(621) = TMP_JAMP(402) + TMP_JAMP(240) ! used 8 times - TMP_JAMP(620) = AMP(940) - AMP(1646) ! used 8 times - TMP_JAMP(619) = TMP_JAMP(318) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(940) ! used 8 times - TMP_JAMP(618) = TMP_JAMP(345) - TMP_JAMP(319) ! used 8 times - TMP_JAMP(617) = TMP_JAMP(349) - TMP_JAMP(321) ! used 8 times - TMP_JAMP(616) = TMP_JAMP(391) - TMP_JAMP(223) ! used 8 times - TMP_JAMP(615) = AMP(944) + AMP(949) ! used 8 times - TMP_JAMP(614) = AMP(835) - AMP(951) ! used 8 times - TMP_JAMP(613) = TMP_JAMP(326) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(944) ! used 8 times - TMP_JAMP(612) = TMP_JAMP(327) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(835) ! used 8 times - TMP_JAMP(611) = TMP_JAMP(329) - TMP_JAMP(316) ! used 8 times - TMP_JAMP(610) = TMP_JAMP(333) - TMP_JAMP(318) ! used 8 times - TMP_JAMP(609) = TMP_JAMP(352) + TMP_JAMP(215) ! used 8 times - TMP_JAMP(608) = TMP_JAMP(400) - TMP_JAMP(327) ! used 8 times - TMP_JAMP(607) = TMP_JAMP(404) + TMP_JAMP(326) ! used 8 times - TMP_JAMP(606) = AMP(1483) + AMP(1831) ! 
used 8 times - TMP_JAMP(605) = AMP(1146) - AMP(1531) ! used 8 times - TMP_JAMP(604) = AMP(1144) - AMP(1833) ! used 8 times - TMP_JAMP(603) = AMP(135) - AMP(1485) ! used 8 times - TMP_JAMP(602) = AMP(132) + AMP(1533) ! used 8 times - TMP_JAMP(601) = TMP_JAMP(314) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(130) ! used 8 times - TMP_JAMP(600) = TMP_JAMP(394) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(126) ! used 8 times - TMP_JAMP(599) = TMP_JAMP(398) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1138) ! used 8 times - TMP_JAMP(598) = TMP_JAMP(399) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1143) ! used 8 times - TMP_JAMP(597) = TMP_JAMP(403) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1482) ! used 8 times - TMP_JAMP(596) = AMP(1306) + AMP(1666) ! used 8 times - TMP_JAMP(595) = AMP(537) - AMP(1668) ! used 8 times - TMP_JAMP(594) = TMP_JAMP(339) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(537) ! used 8 times - TMP_JAMP(593) = TMP_JAMP(342) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1306) ! used 8 times - TMP_JAMP(592) = TMP_JAMP(343) + TMP_JAMP(233) ! used 8 times - TMP_JAMP(591) = TMP_JAMP(388) + TMP_JAMP(339) ! used 8 times - TMP_JAMP(590) = TMP_JAMP(392) + TMP_JAMP(342) ! used 8 times - TMP_JAMP(589) = AMP(1484) - AMP(1696) ! used 8 times - TMP_JAMP(588) = AMP(621) + AMP(1532) ! used 8 times - TMP_JAMP(587) = AMP(620) + AMP(1698) ! used 8 times - TMP_JAMP(586) = TMP_JAMP(386) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(618) ! used 8 times - TMP_JAMP(585) = TMP_JAMP(387) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(613) ! used 8 times - TMP_JAMP(584) = TMP_JAMP(390) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1480) ! used 8 times - TMP_JAMP(583) = AMP(1227) + AMP(1723) ! used 8 times - TMP_JAMP(582) = AMP(518) - AMP(1725) ! used 8 times - TMP_JAMP(581) = TMP_JAMP(346) + ((-0.000000000000000D+00, + TMP_JAMP(787) = TMP_JAMP(659) - TMP_JAMP(658) ! used 8 times + TMP_JAMP(786) = TMP_JAMP(659) + ((-0.000000000000000D+00, $ -1.000000000000000D+00)) * AMP(518) ! used 8 times - TMP_JAMP(580) = TMP_JAMP(348) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1227) ! used 8 times - TMP_JAMP(579) = TMP_JAMP(353) - TMP_JAMP(345) ! used 8 times - TMP_JAMP(578) = TMP_JAMP(381) + TMP_JAMP(346) ! used 8 times - TMP_JAMP(577) = TMP_JAMP(382) - TMP_JAMP(264) ! used 8 times - TMP_JAMP(576) = AMP(854) - AMP(1724) ! used 8 times - TMP_JAMP(575) = TMP_JAMP(320) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(854) ! used 8 times - TMP_JAMP(574) = TMP_JAMP(332) - TMP_JAMP(317) ! used 8 times - TMP_JAMP(573) = TMP_JAMP(378) + TMP_JAMP(220) ! used 8 times - TMP_JAMP(572) = AMP(853) - AMP(950) ! used 8 times - TMP_JAMP(571) = TMP_JAMP(323) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(853) ! used 8 times - TMP_JAMP(570) = TMP_JAMP(348) - TMP_JAMP(320) ! used 8 times - TMP_JAMP(569) = TMP_JAMP(384) - TMP_JAMP(323) ! used 8 times - TMP_JAMP(568) = AMP(1222) - AMP(1832) ! used 8 times - TMP_JAMP(567) = AMP(619) - AMP(1223) ! used 8 times - TMP_JAMP(566) = TMP_JAMP(292) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(529) ! used 8 times - TMP_JAMP(565) = TMP_JAMP(383) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1221) ! used 8 times - TMP_JAMP(564) = AMP(1226) + AMP(1588) ! used 8 times - TMP_JAMP(563) = AMP(147) - AMP(1590) ! 
used 8 times - TMP_JAMP(562) = TMP_JAMP(360) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(147) ! used 8 times - TMP_JAMP(561) = TMP_JAMP(362) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1226) ! used 8 times - TMP_JAMP(560) = TMP_JAMP(375) + TMP_JAMP(360) ! used 8 times - TMP_JAMP(559) = TMP_JAMP(377) + TMP_JAMP(362) ! used 8 times - TMP_JAMP(558) = AMP(1224) - AMP(1585) ! used 8 times - TMP_JAMP(557) = AMP(131) + AMP(1587) ! used 8 times - TMP_JAMP(556) = TMP_JAMP(374) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(128) ! used 8 times - TMP_JAMP(555) = TMP_JAMP(376) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1216) ! used 8 times - TMP_JAMP(554) = AMP(783) + AMP(787) ! used 8 times - TMP_JAMP(553) = AMP(674) - AMP(789) ! used 8 times - TMP_JAMP(552) = TMP_JAMP(354) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(783) ! used 8 times - TMP_JAMP(551) = TMP_JAMP(355) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(674) ! used 8 times - TMP_JAMP(550) = TMP_JAMP(368) + TMP_JAMP(355) ! used 8 times - TMP_JAMP(549) = TMP_JAMP(370) + TMP_JAMP(261) ! used 8 times - TMP_JAMP(548) = AMP(692) - AMP(788) ! used 8 times - TMP_JAMP(547) = TMP_JAMP(335) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(692) ! used 8 times - TMP_JAMP(546) = TMP_JAMP(365) + TMP_JAMP(237) ! used 8 times - TMP_JAMP(545) = AMP(696) - AMP(1667) ! used 8 times - TMP_JAMP(544) = TMP_JAMP(337) + ((-0.000000000000000D+00 + TMP_JAMP(785) = TMP_JAMP(658) - TMP_JAMP(657) ! used 8 times + TMP_JAMP(784) = TMP_JAMP(658) + TMP_JAMP(643) ! used 8 times + TMP_JAMP(783) = TMP_JAMP(658) - TMP_JAMP(651) ! used 8 times + TMP_JAMP(782) = TMP_JAMP(657) - TMP_JAMP(651) ! used 8 times + TMP_JAMP(781) = TMP_JAMP(656) + TMP_JAMP(655) ! used 8 times + TMP_JAMP(780) = TMP_JAMP(656) - TMP_JAMP(651) ! used 8 times + TMP_JAMP(779) = TMP_JAMP(656) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(537) ! used 8 times + TMP_JAMP(778) = TMP_JAMP(655) + TMP_JAMP(651) ! used 8 times + TMP_JAMP(777) = TMP_JAMP(654) - TMP_JAMP(653) ! used 8 times + TMP_JAMP(776) = TMP_JAMP(654) + TMP_JAMP(652) ! used 8 times + TMP_JAMP(775) = TMP_JAMP(654) + ((-0.000000000000000D+00 $ ,1.000000000000000D+00)) * AMP(696) ! used 8 times - TMP_JAMP(543) = TMP_JAMP(354) + TMP_JAMP(335) ! used 8 times - TMP_JAMP(542) = TMP_JAMP(371) + TMP_JAMP(337) ! used 8 times - TMP_JAMP(541) = AMP(779) + AMP(1697) ! used 8 times - TMP_JAMP(540) = AMP(778) - AMP(1145) ! used 8 times - TMP_JAMP(539) = TMP_JAMP(277) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(688) ! used 8 times - TMP_JAMP(538) = TMP_JAMP(369) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(777) ! used 8 times - TMP_JAMP(537) = AMP(782) - AMP(1589) ! used 8 times - TMP_JAMP(536) = TMP_JAMP(358) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(782) ! used 8 times - TMP_JAMP(535) = TMP_JAMP(366) + TMP_JAMP(358) ! used 8 times - TMP_JAMP(534) = AMP(780) + AMP(1586) ! used 8 times - TMP_JAMP(533) = TMP_JAMP(364) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(772) ! used 8 times - TMP_JAMP(532) = AMP(1414) + AMP(1879) ! used 8 times - TMP_JAMP(531) = AMP(1413) + AMP(1414) ! used 8 times - TMP_JAMP(530) = AMP(1191) - AMP(1561) ! used 8 times - TMP_JAMP(529) = AMP(1189) - AMP(1881) ! used 8 times - TMP_JAMP(528) = AMP(1188) + AMP(1189) ! used 8 times - TMP_JAMP(527) = AMP(63) - AMP(1416) ! used 8 times - TMP_JAMP(526) = AMP(54) + AMP(1563) ! 
used 8 times - TMP_JAMP(525) = AMP(52) + AMP(63) ! used 8 times - TMP_JAMP(524) = TMP_JAMP(283) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(52) ! used 8 times - TMP_JAMP(523) = TMP_JAMP(375) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(48) ! used 8 times - TMP_JAMP(522) = TMP_JAMP(377) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1183) ! used 8 times - TMP_JAMP(521) = TMP_JAMP(378) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1188) ! used 8 times - TMP_JAMP(520) = TMP_JAMP(379) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1413) ! used 8 times - TMP_JAMP(519) = TMP_JAMP(382) - TMP_JAMP(378) ! used 8 times - TMP_JAMP(518) = TMP_JAMP(396) - TMP_JAMP(283) ! used 8 times - TMP_JAMP(517) = TMP_JAMP(405) + TMP_JAMP(385) ! used 8 times - TMP_JAMP(516) = AMP(1415) - AMP(1798) ! used 8 times - TMP_JAMP(515) = AMP(1411) - AMP(1415) ! used 8 times - TMP_JAMP(514) = AMP(747) + AMP(1562) ! used 8 times - TMP_JAMP(513) = AMP(746) + AMP(1800) ! used 8 times - TMP_JAMP(512) = AMP(744) - AMP(746) ! used 8 times - TMP_JAMP(511) = TMP_JAMP(365) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(744) ! used 8 times - TMP_JAMP(510) = TMP_JAMP(366) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(739) ! used 8 times - TMP_JAMP(509) = TMP_JAMP(367) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1411) ! used 8 times - TMP_JAMP(508) = TMP_JAMP(370) - TMP_JAMP(365) ! used 8 times - TMP_JAMP(507) = TMP_JAMP(393) + TMP_JAMP(373) ! used 8 times - TMP_JAMP(506) = AMP(1267) - AMP(1880) ! used 8 times - TMP_JAMP(505) = AMP(1266) + AMP(1267) ! used 8 times - TMP_JAMP(504) = AMP(745) - AMP(1268) ! used 8 times - TMP_JAMP(503) = TMP_JAMP(237) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(703) ! used 8 times - TMP_JAMP(502) = TMP_JAMP(338) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1266) ! used 8 times - TMP_JAMP(501) = TMP_JAMP(341) - TMP_JAMP(338) ! used 8 times - TMP_JAMP(500) = AMP(1269) - AMP(1615) ! used 8 times - TMP_JAMP(499) = AMP(53) + AMP(1617) ! used 8 times - TMP_JAMP(498) = TMP_JAMP(330) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(50) ! used 8 times - TMP_JAMP(497) = TMP_JAMP(332) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1261) ! used 8 times - TMP_JAMP(496) = AMP(905) + AMP(1799) ! used 8 times - TMP_JAMP(495) = AMP(904) - AMP(1190) ! used 8 times - TMP_JAMP(494) = AMP(903) - AMP(905) ! used 8 times - TMP_JAMP(493) = TMP_JAMP(220) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(862) ! used 8 times - TMP_JAMP(492) = TMP_JAMP(322) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(903) ! used 8 times - TMP_JAMP(491) = TMP_JAMP(325) - TMP_JAMP(322) ! used 8 times - TMP_JAMP(490) = AMP(906) + AMP(1616) ! used 8 times - TMP_JAMP(489) = TMP_JAMP(317) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(898) ! used 8 times - TMP_JAMP(488) = AMP(1149) + AMP(1777) ! used 8 times - TMP_JAMP(487) = TMP_JAMP(368) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(677) ! used 8 times - TMP_JAMP(486) = TMP_JAMP(372) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1149) ! used 8 times - TMP_JAMP(485) = TMP_JAMP(372) + TMP_JAMP(306) ! used 8 times - TMP_JAMP(484) = TMP_JAMP(379) - TMP_JAMP(367) ! used 8 times - TMP_JAMP(483) = TMP_JAMP(324) + ((0.000000000000000D+00, + TMP_JAMP(774) = TMP_JAMP(653) + TMP_JAMP(652) ! 
used 8 times + TMP_JAMP(773) = TMP_JAMP(650) + TMP_JAMP(649) ! used 8 times + TMP_JAMP(772) = TMP_JAMP(650) - TMP_JAMP(645) ! used 8 times + TMP_JAMP(771) = TMP_JAMP(649) + TMP_JAMP(645) ! used 8 times + TMP_JAMP(770) = TMP_JAMP(649) + TMP_JAMP(647) ! used 8 times + TMP_JAMP(769) = TMP_JAMP(648) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(837) ! used 8 times + TMP_JAMP(768) = TMP_JAMP(648) - TMP_JAMP(642) ! used 8 times + TMP_JAMP(767) = TMP_JAMP(648) + ((0.000000000000000D+00, $ -1.000000000000000D+00)) * AMP(836) ! used 8 times - TMP_JAMP(482) = TMP_JAMP(395) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(153) ! used 8 times - TMP_JAMP(481) = TMP_JAMP(401) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1148) ! used 8 times - TMP_JAMP(480) = AMP(624) + AMP(628) ! used 8 times - TMP_JAMP(479) = TMP_JAMP(380) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(624) ! used 8 times - TMP_JAMP(478) = TMP_JAMP(380) - TMP_JAMP(297) ! used 8 times - TMP_JAMP(477) = TMP_JAMP(381) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(515) ! used 8 times - TMP_JAMP(476) = TMP_JAMP(340) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(533) ! used 8 times - TMP_JAMP(475) = TMP_JAMP(389) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(623) ! used 8 times - TMP_JAMP(474) = AMP(1006) + AMP(1012) ! used 8 times - TMP_JAMP(473) = AMP(1004) - AMP(1006) ! used 8 times - TMP_JAMP(472) = AMP(990) - AMP(1651) ! used 8 times - TMP_JAMP(471) = AMP(988) - AMP(1014) ! used 8 times - TMP_JAMP(470) = AMP(986) - AMP(988) ! used 8 times - TMP_JAMP(469) = AMP(495) - AMP(1008) ! used 8 times - TMP_JAMP(468) = AMP(486) + AMP(1653) ! used 8 times - TMP_JAMP(467) = TMP_JAMP(297) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(484) ! used 8 times - TMP_JAMP(466) = TMP_JAMP(388) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(480) ! used 8 times - TMP_JAMP(465) = TMP_JAMP(391) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(986) ! used 8 times - TMP_JAMP(464) = TMP_JAMP(392) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(984) ! used 8 times - TMP_JAMP(463) = TMP_JAMP(393) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1004) ! used 8 times - TMP_JAMP(462) = TMP_JAMP(402) - TMP_JAMP(391) ! used 8 times - TMP_JAMP(461) = TMP_JAMP(405) - TMP_JAMP(393) ! used 8 times - TMP_JAMP(460) = AMP(1007) - AMP(1759) ! used 8 times - TMP_JAMP(459) = AMP(645) + AMP(1652) ! used 8 times - TMP_JAMP(458) = AMP(644) + AMP(1761) ! used 8 times - TMP_JAMP(457) = TMP_JAMP(370) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(641) ! used 8 times - TMP_JAMP(456) = TMP_JAMP(371) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(639) ! used 8 times - TMP_JAMP(455) = TMP_JAMP(373) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1002) ! used 8 times - TMP_JAMP(454) = AMP(997) - AMP(1013) ! used 8 times - TMP_JAMP(453) = AMP(995) - AMP(997) ! used 8 times - TMP_JAMP(452) = AMP(654) - AMP(998) ! used 8 times - TMP_JAMP(451) = TMP_JAMP(261) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(643) ! used 8 times - TMP_JAMP(450) = TMP_JAMP(359) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(995) ! used 8 times - TMP_JAMP(449) = TMP_JAMP(363) - TMP_JAMP(359) ! used 8 times - TMP_JAMP(448) = AMP(999) - AMP(1705) ! used 8 times - TMP_JAMP(447) = AMP(485) + AMP(1707) ! 
used 8 times - TMP_JAMP(446) = TMP_JAMP(347) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(482) ! used 8 times - TMP_JAMP(445) = TMP_JAMP(349) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(993) ! used 8 times - TMP_JAMP(444) = AMP(813) - AMP(989) ! used 8 times - TMP_JAMP(443) = AMP(803) + AMP(1760) ! used 8 times - TMP_JAMP(442) = TMP_JAMP(223) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(802) ! used 8 times - TMP_JAMP(441) = TMP_JAMP(325) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(800) ! used 8 times - TMP_JAMP(440) = AMP(804) + AMP(1706) ! used 8 times - TMP_JAMP(439) = TMP_JAMP(321) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(798) ! used 8 times - TMP_JAMP(438) = TMP_JAMP(396) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(149) ! used 8 times - TMP_JAMP(437) = TMP_JAMP(397) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(137) ! used 8 times - TMP_JAMP(436) = TMP_JAMP(361) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(143) ! used 8 times - TMP_JAMP(435) = AMP(283) - AMP(316) ! used 8 times - TMP_JAMP(434) = AMP(265) + AMP(318) ! used 8 times - TMP_JAMP(433) = AMP(263) + AMP(286) ! used 8 times - TMP_JAMP(432) = AMP(194) + AMP(281) ! used 8 times - TMP_JAMP(431) = AMP(180) - AMP(288) ! used 8 times - TMP_JAMP(430) = TMP_JAMP(306) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(182) ! used 8 times - TMP_JAMP(429) = TMP_JAMP(400) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(178) ! used 8 times - TMP_JAMP(428) = TMP_JAMP(402) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(262) ! used 8 times - TMP_JAMP(427) = TMP_JAMP(404) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(261) ! used 8 times - TMP_JAMP(426) = TMP_JAMP(405) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(280) ! used 8 times - TMP_JAMP(425) = AMP(282) + AMP(304) ! used 8 times - TMP_JAMP(424) = AMP(199) - AMP(306) ! used 8 times - TMP_JAMP(423) = AMP(198) - AMP(287) ! used 8 times - TMP_JAMP(422) = TMP_JAMP(382) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(197) ! used 8 times - TMP_JAMP(421) = TMP_JAMP(384) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(196) ! used 8 times - TMP_JAMP(420) = TMP_JAMP(385) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(279) ! used 8 times - TMP_JAMP(419) = AMP(274) + AMP(317) ! used 8 times - TMP_JAMP(418) = AMP(212) + AMP(273) ! used 8 times - TMP_JAMP(417) = TMP_JAMP(264) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(200) ! used 8 times - TMP_JAMP(416) = TMP_JAMP(363) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(271) ! used 8 times - TMP_JAMP(415) = AMP(272) + AMP(295) ! used 8 times - TMP_JAMP(414) = AMP(181) - AMP(297) ! used 8 times - TMP_JAMP(413) = TMP_JAMP(356) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(179) ! used 8 times - TMP_JAMP(412) = TMP_JAMP(357) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(270) ! used 8 times - TMP_JAMP(411) = AMP(230) + AMP(264) ! used 8 times - TMP_JAMP(410) = AMP(217) - AMP(305) ! used 8 times - TMP_JAMP(409) = TMP_JAMP(240) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(218) ! used 8 times - TMP_JAMP(408) = TMP_JAMP(341) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(215) ! used 8 times - TMP_JAMP(407) = AMP(216) - AMP(296) ! 
used 8 times - TMP_JAMP(406) = TMP_JAMP(336) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(214) ! used 8 times - TMP_JAMP(1123) = TMP_JAMP(1023) + AMP(445) ! used 8 times - TMP_JAMP(1122) = TMP_JAMP(1024) - AMP(477) ! used 8 times - TMP_JAMP(1121) = TMP_JAMP(1025) + AMP(475) ! used 8 times - TMP_JAMP(1120) = TMP_JAMP(1027) - AMP(1827) ! used 8 times - TMP_JAMP(1119) = TMP_JAMP(1030) + AMP(1825) ! used 8 times - TMP_JAMP(1118) = TMP_JAMP(992) + AMP(1517) ! used 8 times - TMP_JAMP(1117) = TMP_JAMP(993) - AMP(1692) ! used 8 times - TMP_JAMP(1116) = TMP_JAMP(995) - AMP(1660) ! used 8 times - TMP_JAMP(1115) = TMP_JAMP(996) + AMP(1074) ! used 8 times - TMP_JAMP(1114) = TMP_JAMP(997) - AMP(1072) ! used 8 times - TMP_JAMP(1113) = TMP_JAMP(965) - AMP(446) ! used 8 times - TMP_JAMP(1112) = TMP_JAMP(966) - AMP(465) ! used 8 times - TMP_JAMP(1111) = TMP_JAMP(949) - AMP(1579) ! used 8 times - TMP_JAMP(1110) = TMP_JAMP(950) - AMP(1887) ! used 8 times - TMP_JAMP(1109) = TMP_JAMP(951) + AMP(1885) ! used 8 times - TMP_JAMP(1108) = TMP_JAMP(921) + AMP(1770) ! used 8 times - TMP_JAMP(1107) = TMP_JAMP(903) + AMP(1571) ! used 8 times - TMP_JAMP(1106) = TMP_JAMP(905) - AMP(1806) ! used 8 times - TMP_JAMP(1105) = TMP_JAMP(907) + AMP(1804) ! used 8 times - TMP_JAMP(1104) = TMP_JAMP(895) - AMP(476) ! used 8 times - TMP_JAMP(1103) = TMP_JAMP(863) + AMP(454) ! used 8 times - TMP_JAMP(1102) = TMP_JAMP(849) - AMP(1525) ! used 8 times - TMP_JAMP(1101) = TMP_JAMP(850) - AMP(1860) ! used 8 times - TMP_JAMP(1100) = TMP_JAMP(851) + AMP(1858) ! used 8 times - TMP_JAMP(1099) = TMP_JAMP(834) - AMP(1714) ! used 8 times - TMP_JAMP(1098) = TMP_JAMP(818) - AMP(1752) ! used 8 times - TMP_JAMP(1097) = TMP_JAMP(820) + AMP(1750) ! used 8 times - TMP_JAMP(1096) = TMP_JAMP(805) - AMP(464) ! used 8 times - TMP_JAMP(1095) = TMP_JAMP(781) - AMP(455) ! used 8 times - TMP_JAMP(1094) = TMP_JAMP(765) - AMP(1633) ! used 8 times - TMP_JAMP(1093) = TMP_JAMP(752) + AMP(1641) ! used 8 times - TMP_JAMP(1092) = TMP_JAMP(718) + AMP(1715) ! used 8 times - TMP_JAMP(1091) = TMP_JAMP(712) - AMP(1136) ! used 8 times - TMP_JAMP(1090) = TMP_JAMP(602) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(600) ! used 8 times - TMP_JAMP(1089) = TMP_JAMP(603) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(601) ! used 8 times - TMP_JAMP(1088) = TMP_JAMP(604) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(598) ! used 8 times - TMP_JAMP(1087) = TMP_JAMP(605) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(599) ! used 8 times - TMP_JAMP(1086) = TMP_JAMP(606) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(597) ! used 8 times - TMP_JAMP(1085) = TMP_JAMP(587) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(586) ! used 8 times - TMP_JAMP(1084) = TMP_JAMP(588) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(585) ! used 8 times - TMP_JAMP(1083) = TMP_JAMP(589) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(584) ! used 8 times - TMP_JAMP(1082) = TMP_JAMP(567) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(566) ! used 8 times - TMP_JAMP(1081) = TMP_JAMP(568) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(565) ! used 8 times - TMP_JAMP(1080) = TMP_JAMP(557) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(556) ! used 8 times - TMP_JAMP(1079) = TMP_JAMP(558) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(555) ! 
used 8 times - TMP_JAMP(1078) = TMP_JAMP(540) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(539) ! used 8 times - TMP_JAMP(1077) = TMP_JAMP(541) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(538) ! used 8 times - TMP_JAMP(1076) = TMP_JAMP(534) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(533) ! used 8 times - TMP_JAMP(1075) = TMP_JAMP(526) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(523) ! used 8 times - TMP_JAMP(1074) = TMP_JAMP(530) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(522) ! used 8 times - TMP_JAMP(1073) = TMP_JAMP(514) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(510) ! used 8 times - TMP_JAMP(1072) = TMP_JAMP(504) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(503) ! used 8 times - TMP_JAMP(1071) = TMP_JAMP(499) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(498) ! used 8 times - TMP_JAMP(1070) = TMP_JAMP(500) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(497) ! used 8 times - TMP_JAMP(1069) = TMP_JAMP(495) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(493) ! used 8 times - TMP_JAMP(1068) = TMP_JAMP(490) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(489) ! used 8 times - TMP_JAMP(1067) = TMP_JAMP(487) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1779) ! used 8 times - TMP_JAMP(1066) = TMP_JAMP(483) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1778) ! used 8 times - TMP_JAMP(1065) = TMP_JAMP(481) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1534) ! used 8 times - TMP_JAMP(1064) = TMP_JAMP(482) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1536) ! used 8 times - TMP_JAMP(1063) = TMP_JAMP(477) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(630) ! used 8 times - TMP_JAMP(1062) = TMP_JAMP(476) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(629) ! used 8 times - TMP_JAMP(1061) = TMP_JAMP(475) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1535) ! used 8 times - TMP_JAMP(1060) = TMP_JAMP(468) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(466) ! used 8 times - TMP_JAMP(1059) = TMP_JAMP(469) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(467) ! used 8 times - TMP_JAMP(1058) = TMP_JAMP(472) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(464) ! used 8 times - TMP_JAMP(1057) = TMP_JAMP(458) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(457) ! used 8 times - TMP_JAMP(1056) = TMP_JAMP(459) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(456) ! used 8 times - TMP_JAMP(1055) = TMP_JAMP(460) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(455) ! used 8 times - TMP_JAMP(1054) = TMP_JAMP(452) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(451) ! used 8 times - TMP_JAMP(1053) = TMP_JAMP(447) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(446) ! used 8 times - TMP_JAMP(1052) = TMP_JAMP(448) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(445) ! used 8 times - TMP_JAMP(1051) = TMP_JAMP(443) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(441) ! used 8 times - TMP_JAMP(1050) = TMP_JAMP(444) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(442) ! 
used 8 times - TMP_JAMP(1049) = TMP_JAMP(440) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(439) ! used 8 times - TMP_JAMP(1048) = TMP_JAMP(437) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(156) ! used 8 times - TMP_JAMP(1047) = TMP_JAMP(438) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(154) ! used 8 times - TMP_JAMP(1046) = TMP_JAMP(436) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(155) ! used 8 times - TMP_JAMP(1045) = TMP_JAMP(431) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(429) ! used 8 times - TMP_JAMP(1044) = TMP_JAMP(432) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(430) ! used 8 times - TMP_JAMP(1043) = TMP_JAMP(433) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(427) ! used 8 times - TMP_JAMP(1042) = TMP_JAMP(434) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(428) ! used 8 times - TMP_JAMP(1041) = TMP_JAMP(435) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(426) ! used 8 times - TMP_JAMP(1040) = TMP_JAMP(423) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(421) ! used 8 times - TMP_JAMP(1039) = TMP_JAMP(424) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(422) ! used 8 times - TMP_JAMP(1038) = TMP_JAMP(425) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(420) ! used 8 times - TMP_JAMP(1037) = TMP_JAMP(418) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(417) ! used 8 times - TMP_JAMP(1036) = TMP_JAMP(419) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(416) ! used 8 times - TMP_JAMP(1035) = TMP_JAMP(414) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(413) ! used 8 times - TMP_JAMP(1034) = TMP_JAMP(415) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(412) ! used 8 times - TMP_JAMP(1033) = TMP_JAMP(410) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(408) ! used 8 times - TMP_JAMP(1032) = TMP_JAMP(411) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(409) ! used 8 times - TMP_JAMP(1031) = TMP_JAMP(407) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(406) ! used 8 times - TMP_JAMP(1140) = TMP_JAMP(1012) + AMP(1518) ! used 7 times - TMP_JAMP(1139) = TMP_JAMP(1019) - AMP(447) ! used 7 times - TMP_JAMP(1138) = TMP_JAMP(988) + AMP(1662) ! used 7 times - TMP_JAMP(1137) = TMP_JAMP(998) + AMP(1690) ! used 7 times - TMP_JAMP(1136) = TMP_JAMP(947) - AMP(1570) ! used 7 times - TMP_JAMP(1135) = TMP_JAMP(920) + AMP(1661) ! used 7 times - TMP_JAMP(1134) = TMP_JAMP(925) - AMP(1691) ! used 7 times - TMP_JAMP(1133) = TMP_JAMP(861) - AMP(456) ! used 7 times - TMP_JAMP(1132) = TMP_JAMP(845) + AMP(1527) ! used 7 times - TMP_JAMP(1131) = TMP_JAMP(831) + AMP(1716) ! used 7 times - TMP_JAMP(1130) = TMP_JAMP(774) - AMP(1301) ! used 7 times - TMP_JAMP(1129) = TMP_JAMP(753) - AMP(1639) ! used 7 times - TMP_JAMP(1128) = TMP_JAMP(741) + AMP(1769) ! used 7 times - TMP_JAMP(1127) = TMP_JAMP(705) + AMP(1634) ! used 7 times - TMP_JAMP(1126) = TMP_JAMP(700) + AMP(1640) ! used 7 times - TMP_JAMP(1125) = TMP_JAMP(695) + AMP(1852) ! used 7 times - TMP_JAMP(1124) = TMP_JAMP(674) + AMP(1744) ! used 7 times - TMP_JAMP(1158) = TMP_JAMP(1026) - AMP(1516) ! used 6 times - TMP_JAMP(1157) = TMP_JAMP(967) + AMP(463) ! used 6 times - TMP_JAMP(1156) = TMP_JAMP(973) - AMP(1826) ! used 6 times - TMP_JAMP(1155) = TMP_JAMP(941) + AMP(1572) ! 
used 6 times - TMP_JAMP(1154) = TMP_JAMP(928) - AMP(1768) ! used 6 times - TMP_JAMP(1153) = TMP_JAMP(904) + AMP(1580) ! used 6 times - TMP_JAMP(1152) = TMP_JAMP(943) + AMP(1581) ! used 6 times - TMP_JAMP(1151) = TMP_JAMP(878) + AMP(1073) ! used 6 times - TMP_JAMP(1150) = TMP_JAMP(846) - AMP(1476) ! used 6 times - TMP_JAMP(1149) = TMP_JAMP(817) + AMP(1526) ! used 6 times - TMP_JAMP(1148) = TMP_JAMP(762) + AMP(1635) ! used 6 times - TMP_JAMP(1147) = TMP_JAMP(742) - AMP(1049) ! used 6 times - TMP_JAMP(1146) = TMP_JAMP(728) - AMP(1214) ! used 6 times - TMP_JAMP(1145) = TMP_JAMP(692) - AMP(1854) ! used 6 times - TMP_JAMP(1144) = TMP_JAMP(671) - AMP(1746) ! used 6 times - TMP_JAMP(1143) = TMP_JAMP(650) + AMP(1626) ! used 6 times - TMP_JAMP(1142) = TMP_JAMP(652) - AMP(1624) ! used 6 times - TMP_JAMP(1141) = TMP_JAMP(633) + AMP(1625) ! used 6 times - TMP_JAMP(1168) = TMP_JAMP(1015) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1010) ! used 5 times - TMP_JAMP(1167) = TMP_JAMP(971) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(964) ! used 5 times - TMP_JAMP(1166) = TMP_JAMP(924) - AMP(1124) ! used 5 times - TMP_JAMP(1165) = TMP_JAMP(894) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(890) ! used 5 times - TMP_JAMP(1164) = TMP_JAMP(876) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(874) ! used 5 times - TMP_JAMP(1163) = TMP_JAMP(793) - AMP(1886) ! used 5 times - TMP_JAMP(1162) = TMP_JAMP(776) - AMP(1859) ! used 5 times - TMP_JAMP(1161) = TMP_JAMP(791) - AMP(1292) ! used 5 times - TMP_JAMP(1160) = TMP_JAMP(729) - AMP(1805) ! used 5 times - TMP_JAMP(1159) = TMP_JAMP(713) - AMP(1751) ! used 5 times - TMP_JAMP(1837) = AMP(437) + AMP(472) ! used 4 times - TMP_JAMP(1836) = AMP(419) - AMP(474) ! used 4 times - TMP_JAMP(1835) = AMP(416) + AMP(451) ! used 4 times - TMP_JAMP(1834) = AMP(350) - AMP(453) ! used 4 times - TMP_JAMP(1833) = AMP(85) - AMP(1515) ! used 4 times - TMP_JAMP(1832) = TMP_JAMP(1011) + AMP(352) ! used 4 times - TMP_JAMP(1831) = TMP_JAMP(1018) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(85) ! used 4 times - TMP_JAMP(1830) = TMP_JAMP(1022) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1011) ! used 4 times - TMP_JAMP(1829) = TMP_JAMP(1029) + TMP_JAMP(1022) ! used 4 times - TMP_JAMP(1828) = TMP_JAMP(1119) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(416) ! used 4 times - TMP_JAMP(1827) = TMP_JAMP(1120) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(350) ! used 4 times - TMP_JAMP(1826) = TMP_JAMP(1120) + TMP_JAMP(1119) ! used 4 times - TMP_JAMP(1825) = TMP_JAMP(1121) + TMP_JAMP(1017) ! used 4 times - TMP_JAMP(1824) = TMP_JAMP(1122) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(419) ! used 4 times - TMP_JAMP(1823) = TMP_JAMP(1123) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(416) ! used 4 times - TMP_JAMP(1822) = TMP_JAMP(1123) + TMP_JAMP(1122) ! used 4 times - TMP_JAMP(1821) = TMP_JAMP(1158) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1118) ! used 4 times - TMP_JAMP(1820) = TMP_JAMP(1168) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(98) ! used 4 times - TMP_JAMP(1819) = TMP_JAMP(1168) + TMP_JAMP(1016) ! used 4 times - TMP_JAMP(1818) = AMP(1061) + AMP(1069) ! used 4 times - TMP_JAMP(1817) = AMP(1043) - AMP(1071) ! used 4 times - TMP_JAMP(1816) = AMP(1041) + AMP(1663) ! used 4 times - TMP_JAMP(1815) = AMP(593) - AMP(1514) ! used 4 times - TMP_JAMP(1814) = AMP(510) - AMP(1665) ! 
used 4 times - TMP_JAMP(1813) = TMP_JAMP(987) + AMP(511) ! used 4 times - TMP_JAMP(1812) = TMP_JAMP(991) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(987) ! used 4 times - TMP_JAMP(1811) = TMP_JAMP(994) - TMP_JAMP(991) ! used 4 times - TMP_JAMP(1810) = TMP_JAMP(1016) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1043) ! used 4 times - TMP_JAMP(1809) = TMP_JAMP(1114) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1061) ! used 4 times - TMP_JAMP(1808) = TMP_JAMP(1114) + TMP_JAMP(1017) ! used 4 times - TMP_JAMP(1807) = TMP_JAMP(1115) + TMP_JAMP(1016) ! used 4 times - TMP_JAMP(1806) = TMP_JAMP(1116) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1041) ! used 4 times - TMP_JAMP(1805) = TMP_JAMP(1116) + TMP_JAMP(1115) ! used 4 times - TMP_JAMP(1804) = TMP_JAMP(1118) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(593) ! used 4 times - TMP_JAMP(1803) = TMP_JAMP(1137) + TMP_JAMP(1116) ! used 4 times - TMP_JAMP(1802) = TMP_JAMP(1137) + TMP_JAMP(1117) ! used 4 times - TMP_JAMP(1801) = TMP_JAMP(1140) + TMP_JAMP(1118) ! used 4 times - TMP_JAMP(1800) = AMP(443) + AMP(466) ! used 4 times - TMP_JAMP(1799) = AMP(368) - AMP(452) ! used 4 times - TMP_JAMP(1798) = AMP(361) - AMP(468) ! used 4 times - TMP_JAMP(1797) = TMP_JAMP(972) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(627) ! used 4 times - TMP_JAMP(1796) = TMP_JAMP(1029) - TMP_JAMP(994) ! used 4 times - TMP_JAMP(1795) = TMP_JAMP(1112) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(361) ! used 4 times - TMP_JAMP(1794) = TMP_JAMP(1112) - TMP_JAMP(969) ! used 4 times - TMP_JAMP(1793) = TMP_JAMP(1139) + TMP_JAMP(1113) ! used 4 times - TMP_JAMP(1792) = TMP_JAMP(1157) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(443) ! used 4 times - TMP_JAMP(1791) = TMP_JAMP(1157) - TMP_JAMP(972) ! used 4 times - TMP_JAMP(1790) = AMP(1459) + AMP(1888) ! used 4 times - TMP_JAMP(1789) = AMP(1210) - AMP(1890) ! used 4 times - TMP_JAMP(1788) = AMP(1204) + AMP(1573) ! used 4 times - TMP_JAMP(1787) = TMP_JAMP(944) - AMP(1467) ! used 4 times - TMP_JAMP(1786) = TMP_JAMP(969) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1210) ! used 4 times - TMP_JAMP(1785) = TMP_JAMP(1018) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(110) ! used 4 times - TMP_JAMP(1784) = TMP_JAMP(1109) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1459) ! used 4 times - TMP_JAMP(1783) = TMP_JAMP(1111) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1204) ! used 4 times - TMP_JAMP(1782) = TMP_JAMP(1111) - TMP_JAMP(1110) ! used 4 times - TMP_JAMP(1781) = TMP_JAMP(1155) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(93) ! used 4 times - TMP_JAMP(1780) = AMP(1063) + AMP(1765) ! used 4 times - TMP_JAMP(1779) = AMP(669) - AMP(1664) ! used 4 times - TMP_JAMP(1778) = AMP(660) - AMP(1767) ! used 4 times - TMP_JAMP(1777) = TMP_JAMP(930) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1152) ! used 4 times - TMP_JAMP(1776) = TMP_JAMP(1108) + ((0.000000000000000D+00, + TMP_JAMP(766) = TMP_JAMP(647) - TMP_JAMP(646) ! used 8 times + TMP_JAMP(765) = TMP_JAMP(647) - TMP_JAMP(640) ! used 8 times + TMP_JAMP(764) = TMP_JAMP(646) - TMP_JAMP(640) ! used 8 times + TMP_JAMP(763) = TMP_JAMP(645) + TMP_JAMP(643) ! used 8 times + TMP_JAMP(762) = TMP_JAMP(644) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(855) ! used 8 times + TMP_JAMP(761) = TMP_JAMP(644) - TMP_JAMP(643) ! 
used 8 times + TMP_JAMP(760) = TMP_JAMP(644) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(854) ! used 8 times + TMP_JAMP(759) = TMP_JAMP(643) - TMP_JAMP(642) ! used 8 times + TMP_JAMP(758) = TMP_JAMP(643) - TMP_JAMP(639) ! used 8 times + TMP_JAMP(757) = TMP_JAMP(642) - TMP_JAMP(639) ! used 8 times + TMP_JAMP(756) = TMP_JAMP(641) - TMP_JAMP(640) ! used 8 times + TMP_JAMP(755) = TMP_JAMP(641) + TMP_JAMP(639) ! used 8 times + TMP_JAMP(754) = TMP_JAMP(641) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(940) ! used 8 times + TMP_JAMP(753) = TMP_JAMP(640) + TMP_JAMP(639) ! used 8 times + TMP_JAMP(752) = TMP_JAMP(608) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(942) ! used 8 times + TMP_JAMP(751) = TMP_JAMP(608) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(944) ! used 8 times + TMP_JAMP(750) = TMP_JAMP(604) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(833) ! used 8 times + TMP_JAMP(749) = TMP_JAMP(604) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(835) ! used 8 times + TMP_JAMP(748) = TMP_JAMP(602) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(851) ! used 8 times + TMP_JAMP(747) = TMP_JAMP(602) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(853) ! used 8 times + TMP_JAMP(746) = AMP(597) - AMP(599) ! used 8 times + TMP_JAMP(745) = AMP(592) + AMP(600) ! used 8 times + TMP_JAMP(744) = AMP(502) + AMP(513) ! used 8 times + TMP_JAMP(743) = AMP(498) + AMP(504) ! used 8 times + TMP_JAMP(742) = AMP(626) + AMP(631) ! used 8 times + TMP_JAMP(741) = AMP(526) + AMP(598) ! used 8 times + TMP_JAMP(740) = AMP(517) - AMP(633) ! used 8 times + TMP_JAMP(739) = AMP(756) - AMP(758) ! used 8 times + TMP_JAMP(738) = AMP(685) + AMP(757) ! used 8 times + TMP_JAMP(737) = AMP(659) + AMP(662) ! used 8 times + TMP_JAMP(736) = AMP(657) + AMP(663) ! used 8 times + TMP_JAMP(735) = AMP(768) - AMP(770) ! used 8 times + TMP_JAMP(734) = AMP(763) + AMP(771) ! used 8 times + TMP_JAMP(733) = AMP(751) + AMP(759) ! used 8 times + TMP_JAMP(732) = AMP(661) + AMP(672) ! used 8 times + TMP_JAMP(731) = AMP(785) + AMP(790) ! used 8 times + TMP_JAMP(730) = AMP(676) - AMP(792) ! used 8 times + TMP_JAMP(729) = AMP(500) + AMP(503) ! used 8 times + TMP_JAMP(728) = AMP(609) - AMP(611) ! used 8 times + TMP_JAMP(727) = AMP(604) + AMP(612) ! used 8 times + TMP_JAMP(726) = AMP(535) - AMP(632) ! used 8 times + TMP_JAMP(725) = AMP(706) + AMP(769) ! used 8 times + TMP_JAMP(724) = AMP(694) - AMP(791) ! used 8 times + TMP_JAMP(723) = AMP(547) + AMP(610) ! used 8 times + TMP_JAMP(722) = AMP(820) + AMP(831) ! used 8 times + TMP_JAMP(721) = AMP(818) + AMP(821) ! used 8 times + TMP_JAMP(720) = AMP(927) - AMP(929) ! used 8 times + TMP_JAMP(719) = AMP(865) + AMP(928) ! used 8 times + TMP_JAMP(718) = AMP(816) + AMP(822) ! used 8 times + TMP_JAMP(717) = AMP(936) - AMP(938) ! used 8 times + TMP_JAMP(716) = AMP(847) + AMP(937) ! used 8 times + TMP_JAMP(715) = AMP(922) + AMP(930) ! used 8 times + TMP_JAMP(714) = AMP(931) + AMP(939) ! used 8 times + TMP_JAMP(713) = AMP(585) - AMP(587) ! used 8 times + TMP_JAMP(712) = AMP(580) + AMP(588) ! used 8 times + TMP_JAMP(711) = AMP(544) + AMP(586) ! used 8 times + TMP_JAMP(710) = AMP(915) - AMP(917) ! used 8 times + TMP_JAMP(709) = AMP(844) + AMP(916) ! used 8 times + TMP_JAMP(708) = AMP(910) + AMP(918) ! used 8 times + TMP_JAMP(707) = AMP(618) - AMP(620) ! used 8 times + TMP_JAMP(706) = AMP(613) + AMP(621) ! used 8 times + TMP_JAMP(705) = AMP(529) + AMP(619) ! 
used 8 times + TMP_JAMP(704) = AMP(783) + AMP(787) ! used 8 times + TMP_JAMP(703) = AMP(674) - AMP(789) ! used 8 times + TMP_JAMP(702) = AMP(692) - AMP(788) ! used 8 times + TMP_JAMP(701) = AMP(777) - AMP(779) ! used 8 times + TMP_JAMP(700) = AMP(688) + AMP(778) ! used 8 times + TMP_JAMP(699) = AMP(772) + AMP(780) ! used 8 times + TMP_JAMP(698) = AMP(744) - AMP(746) ! used 8 times + TMP_JAMP(697) = AMP(739) + AMP(747) ! used 8 times + TMP_JAMP(696) = AMP(703) + AMP(745) ! used 8 times + TMP_JAMP(695) = AMP(903) - AMP(905) ! used 8 times + TMP_JAMP(694) = AMP(862) + AMP(904) ! used 8 times + TMP_JAMP(693) = AMP(898) + AMP(906) ! used 8 times + TMP_JAMP(692) = AMP(624) + AMP(628) ! used 8 times + TMP_JAMP(691) = AMP(515) - AMP(630) ! used 8 times + TMP_JAMP(690) = AMP(533) - AMP(629) ! used 8 times + TMP_JAMP(689) = AMP(484) + AMP(495) ! used 8 times + TMP_JAMP(688) = AMP(480) + AMP(486) ! used 8 times + TMP_JAMP(687) = AMP(641) + AMP(644) ! used 8 times + TMP_JAMP(686) = AMP(639) + AMP(645) ! used 8 times + TMP_JAMP(685) = AMP(643) + AMP(654) ! used 8 times + TMP_JAMP(684) = AMP(482) + AMP(485) ! used 8 times + TMP_JAMP(683) = AMP(802) + AMP(813) ! used 8 times + TMP_JAMP(682) = AMP(800) + AMP(803) ! used 8 times + TMP_JAMP(681) = AMP(798) + AMP(804) ! used 8 times + TMP_JAMP(1043) = TMP_JAMP(834) + TMP_JAMP(785) ! used 4 times + TMP_JAMP(1042) = TMP_JAMP(834) + TMP_JAMP(819) ! used 4 times + TMP_JAMP(1041) = TMP_JAMP(831) - TMP_JAMP(680) ! used 4 times + TMP_JAMP(1040) = TMP_JAMP(829) + TMP_JAMP(827) ! used 4 times + TMP_JAMP(1039) = TMP_JAMP(829) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(707) ! used 4 times + TMP_JAMP(1038) = TMP_JAMP(827) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(729) ! used 4 times + TMP_JAMP(1037) = TMP_JAMP(822) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(692) ! used 4 times + TMP_JAMP(1036) = TMP_JAMP(821) - TMP_JAMP(785) ! used 4 times + TMP_JAMP(1035) = TMP_JAMP(817) - TMP_JAMP(676) ! used 4 times + TMP_JAMP(1034) = TMP_JAMP(817) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(741) ! used 4 times + TMP_JAMP(1033) = TMP_JAMP(814) + TMP_JAMP(771) ! used 4 times + TMP_JAMP(1032) = TMP_JAMP(814) + TMP_JAMP(792) ! used 4 times + TMP_JAMP(1031) = TMP_JAMP(814) + TMP_JAMP(801) ! used 4 times + TMP_JAMP(1030) = TMP_JAMP(811) + TMP_JAMP(672) ! used 4 times + TMP_JAMP(1029) = TMP_JAMP(810) + TMP_JAMP(809) ! used 4 times + TMP_JAMP(1028) = TMP_JAMP(810) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(701) ! used 4 times + TMP_JAMP(1027) = TMP_JAMP(809) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(732) ! used 4 times + TMP_JAMP(1026) = TMP_JAMP(804) + TMP_JAMP(767) ! used 4 times + TMP_JAMP(1025) = TMP_JAMP(803) + TMP_JAMP(792) ! used 4 times + TMP_JAMP(1024) = TMP_JAMP(798) - TMP_JAMP(668) ! used 4 times + TMP_JAMP(1023) = TMP_JAMP(798) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(734) ! used 4 times + TMP_JAMP(1022) = TMP_JAMP(792) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(731) ! used 4 times + TMP_JAMP(1021) = TMP_JAMP(792) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(703) ! used 4 times + TMP_JAMP(1020) = TMP_JAMP(786) + TMP_JAMP(760) ! used 4 times + TMP_JAMP(1019) = TMP_JAMP(785) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(728) ! used 4 times + TMP_JAMP(1018) = TMP_JAMP(785) + TMP_JAMP(759) ! 
[Auto-generated Fortran hunk (matrix-element colour-flow code): the patch regenerates the TMP_JAMP temporaries, removing the old definitions (roughly TMP_JAMP(1169) through TMP_JAMP(1989)) and adding a renumbered block (roughly TMP_JAMP(863) through TMP_JAMP(1017)). Every statement has the same shape: a temporary built from AMP(...) amplitudes, earlier TMP_JAMP(...) values, and ±i complex phase factors, annotated with its reuse count, e.g.

+      TMP_JAMP(1016) = TMP_JAMP(779) - AMP(617)  ! used 4 times

with reuse counts of 4 or 3 throughout this hunk.]
used 3 times - TMP_JAMP(1971) = TMP_JAMP(1708) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1695) ! used 3 times - TMP_JAMP(1970) = TMP_JAMP(864) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(779) ! used 3 times - TMP_JAMP(1969) = TMP_JAMP(1690) - TMP_JAMP(1013) ! used 3 times - TMP_JAMP(1968) = TMP_JAMP(1669) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1156) ! used 3 times - TMP_JAMP(1967) = TMP_JAMP(833) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(716) ! used 3 times - TMP_JAMP(1966) = TMP_JAMP(1571) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1566) ! used 3 times - TMP_JAMP(1965) = TMP_JAMP(1581) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(610) ! used 3 times - TMP_JAMP(1964) = TMP_JAMP(1553) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1549) ! used 3 times - TMP_JAMP(1963) = TMP_JAMP(1532) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1530) ! used 3 times - TMP_JAMP(1962) = TMP_JAMP(1576) + AMP(1112) ! used 3 times - TMP_JAMP(1961) = TMP_JAMP(1522) - TMP_JAMP(1519) ! used 3 times - TMP_JAMP(1960) = TMP_JAMP(1535) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1496) ! used 3 times - TMP_JAMP(1959) = TMP_JAMP(1489) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1486) ! used 3 times - TMP_JAMP(1958) = TMP_JAMP(1554) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1548) ! used 3 times - TMP_JAMP(1957) = TMP_JAMP(1469) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1560) ! used 3 times - TMP_JAMP(1956) = TMP_JAMP(1446) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(705) ! used 3 times - TMP_JAMP(1955) = TMP_JAMP(1575) + TMP_JAMP(1569) ! used 3 times - TMP_JAMP(1954) = TMP_JAMP(864) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(675) ! used 3 times - TMP_JAMP(1953) = TMP_JAMP(1405) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1401) ! used 3 times - TMP_JAMP(1952) = TMP_JAMP(1379) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(61) ! used 3 times - TMP_JAMP(1951) = TMP_JAMP(833) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(514) ! used 3 times - TMP_JAMP(1950) = TMP_JAMP(1362) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1360) ! used 3 times - TMP_JAMP(1949) = TMP_JAMP(1368) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1365) ! used 3 times - TMP_JAMP(1948) = TMP_JAMP(1348) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(628) ! used 3 times - TMP_JAMP(1947) = TMP_JAMP(1382) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1132) ! used 3 times - TMP_JAMP(1946) = TMP_JAMP(1336) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1650) ! used 3 times - TMP_JAMP(1945) = TMP_JAMP(794) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(788) ! used 3 times - TMP_JAMP(1944) = TMP_JAMP(1326) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1649) ! used 3 times - TMP_JAMP(1943) = TMP_JAMP(1313) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(653) ! used 3 times - TMP_JAMP(1942) = TMP_JAMP(1335) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1648) ! used 3 times - TMP_JAMP(1941) = TMP_JAMP(777) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(772) ! 
used 3 times - TMP_JAMP(1940) = TMP_JAMP(731) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(725) ! used 3 times - TMP_JAMP(1939) = TMP_JAMP(1319) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1764) ! used 3 times - TMP_JAMP(1938) = TMP_JAMP(714) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(711) ! used 3 times - TMP_JAMP(1937) = TMP_JAMP(1377) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1522) ! used 3 times - TMP_JAMP(1936) = TMP_JAMP(1373) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1528) ! used 3 times - TMP_JAMP(1935) = TMP_JAMP(760) + AMP(136) ! used 3 times - TMP_JAMP(1934) = TMP_JAMP(1259) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1257) ! used 3 times - TMP_JAMP(1933) = TMP_JAMP(1231) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(290) ! used 3 times - TMP_JAMP(1932) = TMP_JAMP(1236) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(177) ! used 3 times - TMP_JAMP(1931) = TMP_JAMP(1243) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(289) ! used 3 times - TMP_JAMP(1930) = TMP_JAMP(1226) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(312) ! used 3 times - TMP_JAMP(1929) = TMP_JAMP(1238) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(175) ! used 3 times - TMP_JAMP(1991) = TMP_JAMP(1969) + AMP(1458) ! used 3 times - TMP_JAMP(1990) = TMP_JAMP(1935) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1644) ! used 3 times - TMP_JAMP(2641) = AMP(150) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(436) ! used 2 times - TMP_JAMP(2640) = TMP_JAMP(1831) - AMP(83) ! used 2 times - TMP_JAMP(2639) = TMP_JAMP(1836) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(415) ! used 2 times - TMP_JAMP(2638) = TMP_JAMP(1837) + AMP(150) ! used 2 times - TMP_JAMP(2637) = TMP_JAMP(1905) + AMP(349) ! used 2 times - TMP_JAMP(2636) = AMP(150) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1060) ! used 2 times - TMP_JAMP(2635) = TMP_JAMP(1814) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1802) ! used 2 times - TMP_JAMP(2634) = TMP_JAMP(1815) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1801) ! used 2 times - TMP_JAMP(2633) = TMP_JAMP(1820) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1816) ! used 2 times - TMP_JAMP(2632) = TMP_JAMP(1796) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1514) ! used 2 times - TMP_JAMP(2631) = TMP_JAMP(1798) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(369) ! used 2 times - TMP_JAMP(2630) = TMP_JAMP(1800) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1791) ! used 2 times - TMP_JAMP(2629) = TMP_JAMP(1804) + TMP_JAMP(1796) ! used 2 times - TMP_JAMP(2628) = TMP_JAMP(1905) - TMP_JAMP(1804) ! used 2 times - TMP_JAMP(2627) = TMP_JAMP(1986) + AMP(444) ! used 2 times - TMP_JAMP(2626) = TMP_JAMP(1987) + TMP_JAMP(1793) ! used 2 times - TMP_JAMP(2625) = TMP_JAMP(1989) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1988) ! used 2 times - TMP_JAMP(2624) = TMP_JAMP(931) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(92) ! used 2 times - TMP_JAMP(2623) = TMP_JAMP(945) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(938) ! used 2 times - TMP_JAMP(2622) = TMP_JAMP(1782) + AMP(1206) ! used 2 times - TMP_JAMP(2621) = TMP_JAMP(1784) - AMP(1460) ! 
used 2 times - TMP_JAMP(2620) = TMP_JAMP(1785) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(931) ! used 2 times - TMP_JAMP(2619) = TMP_JAMP(1787) - TMP_JAMP(1785) ! used 2 times - TMP_JAMP(2618) = TMP_JAMP(1788) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1786) ! used 2 times - TMP_JAMP(2617) = TMP_JAMP(1833) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1787) ! used 2 times - TMP_JAMP(2616) = TMP_JAMP(1904) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1833) ! used 2 times - TMP_JAMP(2615) = AMP(1065) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1765) ! used 2 times - TMP_JAMP(2614) = TMP_JAMP(1772) - AMP(1065) ! used 2 times - TMP_JAMP(2613) = TMP_JAMP(1774) + TMP_JAMP(1773) ! used 2 times - TMP_JAMP(2612) = TMP_JAMP(1776) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1767) ! used 2 times - TMP_JAMP(2611) = TMP_JAMP(1777) - AMP(1121) ! used 2 times - TMP_JAMP(2610) = TMP_JAMP(1777) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(686) ! used 2 times - TMP_JAMP(2609) = TMP_JAMP(1814) + TMP_JAMP(1779) ! used 2 times - TMP_JAMP(2608) = TMP_JAMP(1983) - TMP_JAMP(1772) ! used 2 times - TMP_JAMP(2607) = TMP_JAMP(1984) - TMP_JAMP(1905) ! used 2 times - TMP_JAMP(2606) = AMP(686) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(761) ! used 2 times - TMP_JAMP(2605) = TMP_JAMP(1152) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(898) ! used 2 times - TMP_JAMP(2604) = TMP_JAMP(1904) - TMP_JAMP(1152) ! used 2 times - TMP_JAMP(2603) = AMP(82) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(144) ! used 2 times - TMP_JAMP(2602) = TMP_JAMP(886) - TMP_JAMP(883) ! used 2 times - TMP_JAMP(2601) = TMP_JAMP(1763) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(367) ! used 2 times - TMP_JAMP(2600) = TMP_JAMP(1820) + TMP_JAMP(1155) ! used 2 times - TMP_JAMP(2599) = TMP_JAMP(1836) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1756) ! used 2 times - TMP_JAMP(2598) = TMP_JAMP(1982) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1762) ! used 2 times - TMP_JAMP(2597) = TMP_JAMP(1988) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1755) ! used 2 times - TMP_JAMP(2596) = TMP_JAMP(1746) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(144) ! used 2 times - TMP_JAMP(2595) = TMP_JAMP(1752) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1746) ! used 2 times - TMP_JAMP(2594) = TMP_JAMP(1761) - TMP_JAMP(869) ! used 2 times - TMP_JAMP(2593) = TMP_JAMP(1779) - TMP_JAMP(1751) ! used 2 times - TMP_JAMP(2592) = TMP_JAMP(1816) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1803) ! used 2 times - TMP_JAMP(2591) = TMP_JAMP(1981) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1752) ! used 2 times - TMP_JAMP(2590) = TMP_JAMP(853) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(351) ! used 2 times - TMP_JAMP(2589) = TMP_JAMP(855) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(435) ! used 2 times - TMP_JAMP(2588) = TMP_JAMP(860) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(792) ! used 2 times - TMP_JAMP(2587) = TMP_JAMP(974) + ((-0.000000000000000D+00 + TMP_JAMP(862) = TMP_JAMP(682) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(807) ! used 4 times + TMP_JAMP(861) = TMP_JAMP(682) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(801) ! 
used 4 times + TMP_JAMP(860) = TMP_JAMP(681) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(810) ! used 4 times + TMP_JAMP(859) = TMP_JAMP(681) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(799) ! used 4 times + TMP_JAMP(858) = TMP_JAMP(676) - AMP(614) ! used 4 times + TMP_JAMP(857) = TMP_JAMP(675) + AMP(625) ! used 4 times + TMP_JAMP(856) = TMP_JAMP(668) - AMP(690) ! used 4 times + TMP_JAMP(855) = AMP(475) - AMP(477) ! used 4 times + TMP_JAMP(854) = AMP(474) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(477) ! used 4 times + TMP_JAMP(853) = AMP(472) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(475) ! used 4 times + TMP_JAMP(852) = AMP(593) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(622) ! used 4 times + TMP_JAMP(851) = AMP(622) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(627) ! used 4 times + TMP_JAMP(850) = AMP(678) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(686) ! used 4 times + TMP_JAMP(849) = AMP(678) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(767) ! used 4 times + TMP_JAMP(848) = AMP(476) + AMP(477) ! used 4 times + TMP_JAMP(847) = AMP(473) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(476) ! used 4 times + TMP_JAMP(846) = AMP(534) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(536) ! used 4 times + TMP_JAMP(845) = AMP(695) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(708) ! used 4 times + TMP_JAMP(844) = AMP(536) + AMP(695) ! used 4 times + TMP_JAMP(843) = AMP(693) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(695) ! used 4 times + TMP_JAMP(842) = AMP(536) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(549) ! used 4 times + TMP_JAMP(841) = AMP(923) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(941) ! used 4 times + TMP_JAMP(840) = AMP(932) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(941) ! used 4 times + TMP_JAMP(839) = AMP(475) + AMP(476) ! used 4 times + TMP_JAMP(838) = AMP(481) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(536) ! used 4 times + TMP_JAMP(837) = AMP(640) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(695) ! used 4 times + TMP_JAMP(836) = AMP(811) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(941) ! used 4 times + TMP_JAMP(1062) = TMP_JAMP(1037) - TMP_JAMP(857) ! used 4 times + TMP_JAMP(1061) = TMP_JAMP(993) + TMP_JAMP(762) ! used 4 times + TMP_JAMP(1060) = TMP_JAMP(979) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(788) ! used 4 times + TMP_JAMP(1059) = TMP_JAMP(959) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(795) ! used 4 times + TMP_JAMP(1058) = TMP_JAMP(958) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(795) ! used 4 times + TMP_JAMP(1057) = TMP_JAMP(952) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(788) ! used 4 times + TMP_JAMP(1056) = TMP_JAMP(940) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(752) ! used 4 times + TMP_JAMP(1055) = TMP_JAMP(932) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(762) ! used 4 times + TMP_JAMP(1054) = TMP_JAMP(929) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(750) ! used 4 times + TMP_JAMP(1053) = TMP_JAMP(914) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(753) ! 
used 4 times + TMP_JAMP(1052) = TMP_JAMP(913) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(754) ! used 4 times + TMP_JAMP(1051) = TMP_JAMP(904) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(793) ! used 4 times + TMP_JAMP(1050) = TMP_JAMP(895) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(793) ! used 4 times + TMP_JAMP(1049) = TMP_JAMP(883) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(754) ! used 4 times + TMP_JAMP(1048) = TMP_JAMP(882) - TMP_JAMP(744) ! used 4 times + TMP_JAMP(1047) = TMP_JAMP(875) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(822) ! used 4 times + TMP_JAMP(1046) = TMP_JAMP(867) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(793) ! used 4 times + TMP_JAMP(1045) = TMP_JAMP(864) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(836) ! used 4 times + TMP_JAMP(1044) = TMP_JAMP(863) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(751) ! used 4 times + TMP_JAMP(1065) = TMP_JAMP(944) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(781) ! used 3 times + TMP_JAMP(1064) = TMP_JAMP(943) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(777) ! used 3 times + TMP_JAMP(1063) = TMP_JAMP(893) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(667) ! used 3 times + TMP_JAMP(1304) = TMP_JAMP(1065) + TMP_JAMP(1064) ! used 2 times + TMP_JAMP(1303) = TMP_JAMP(1065) - TMP_JAMP(981) ! used 2 times + TMP_JAMP(1302) = TMP_JAMP(1063) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(804) ! used 2 times + TMP_JAMP(1301) = TMP_JAMP(1063) - TMP_JAMP(896) ! used 2 times + TMP_JAMP(1300) = TMP_JAMP(1062) + TMP_JAMP(1012) ! used 2 times + TMP_JAMP(1299) = TMP_JAMP(1062) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(907) ! used 2 times + TMP_JAMP(1298) = TMP_JAMP(1061) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1054) ! used 2 times + TMP_JAMP(1297) = TMP_JAMP(1059) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1025) ! used 2 times + TMP_JAMP(1296) = TMP_JAMP(1058) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1031) ! used 2 times + TMP_JAMP(1295) = TMP_JAMP(1057) + TMP_JAMP(1055) ! used 2 times + TMP_JAMP(1294) = TMP_JAMP(1056) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(996) ! used 2 times + TMP_JAMP(1293) = TMP_JAMP(1055) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(993) ! used 2 times + TMP_JAMP(1292) = TMP_JAMP(1054) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(999) ! used 2 times + TMP_JAMP(1291) = TMP_JAMP(1053) + TMP_JAMP(933) ! used 2 times + TMP_JAMP(1290) = TMP_JAMP(1052) + TMP_JAMP(939) ! used 2 times + TMP_JAMP(1289) = TMP_JAMP(1050) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1021) ! used 2 times + TMP_JAMP(1288) = TMP_JAMP(1049) - TMP_JAMP(939) ! used 2 times + TMP_JAMP(1287) = TMP_JAMP(1048) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1042) ! used 2 times + TMP_JAMP(1286) = TMP_JAMP(1047) - TMP_JAMP(945) ! used 2 times + TMP_JAMP(1285) = TMP_JAMP(1047) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1041) ! used 2 times + TMP_JAMP(1284) = TMP_JAMP(1046) + TMP_JAMP(895) ! used 2 times + TMP_JAMP(1283) = TMP_JAMP(1045) + TMP_JAMP(925) ! 
used 2 times + TMP_JAMP(1282) = TMP_JAMP(1044) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(992) ! used 2 times + TMP_JAMP(1281) = TMP_JAMP(1044) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(994) ! used 2 times + TMP_JAMP(1280) = TMP_JAMP(1043) + TMP_JAMP(1006) ! used 2 times + TMP_JAMP(1279) = TMP_JAMP(1041) + TMP_JAMP(1030) ! used 2 times + TMP_JAMP(1278) = TMP_JAMP(1040) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(984) ! used 2 times + TMP_JAMP(1277) = TMP_JAMP(1040) + TMP_JAMP(1029) ! used 2 times + TMP_JAMP(1276) = TMP_JAMP(1035) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(988) ! used 2 times + TMP_JAMP(1275) = TMP_JAMP(1031) + TMP_JAMP(1005) ! used 2 times + TMP_JAMP(1274) = TMP_JAMP(1031) + ((0.000000000000000D+00 $ ,1.000000000000000D+00)) * TMP_JAMP(961) ! used 2 times - TMP_JAMP(2586) = TMP_JAMP(1744) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1740) ! used 2 times - TMP_JAMP(2585) = TMP_JAMP(1745) + TMP_JAMP(1744) ! used 2 times - TMP_JAMP(2584) = TMP_JAMP(1903) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1763) ! used 2 times - TMP_JAMP(2583) = TMP_JAMP(1987) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(853) ! used 2 times - TMP_JAMP(2582) = TMP_JAMP(1987) - TMP_JAMP(1979) ! used 2 times - TMP_JAMP(2581) = AMP(122) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1469) ! used 2 times - TMP_JAMP(2580) = AMP(95) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(122) ! used 2 times - TMP_JAMP(2579) = TMP_JAMP(939) - TMP_JAMP(838) ! used 2 times - TMP_JAMP(2578) = TMP_JAMP(1158) - AMP(1128) ! used 2 times - TMP_JAMP(2577) = TMP_JAMP(1730) + TMP_JAMP(942) ! used 2 times - TMP_JAMP(2576) = TMP_JAMP(1736) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1732) ! used 2 times - TMP_JAMP(2575) = TMP_JAMP(1976) - TMP_JAMP(1761) ! used 2 times - TMP_JAMP(2574) = AMP(1056) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1230) ! used 2 times - TMP_JAMP(2573) = TMP_JAMP(822) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(509) ! used 2 times - TMP_JAMP(2572) = TMP_JAMP(830) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1722) ! used 2 times - TMP_JAMP(2571) = TMP_JAMP(1728) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1724) ! used 2 times - TMP_JAMP(2570) = TMP_JAMP(1729) + TMP_JAMP(1728) ! used 2 times - TMP_JAMP(2569) = TMP_JAMP(1985) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1814) ! used 2 times - TMP_JAMP(2568) = AMP(122) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1472) ! used 2 times - TMP_JAMP(2567) = TMP_JAMP(811) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(602) ! used 2 times - TMP_JAMP(2566) = TMP_JAMP(1901) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1167) ! used 2 times - TMP_JAMP(2565) = AMP(378) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(390) ! used 2 times - TMP_JAMP(2564) = TMP_JAMP(795) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(417) ! used 2 times - TMP_JAMP(2563) = TMP_JAMP(795) - AMP(390) ! used 2 times - TMP_JAMP(2562) = TMP_JAMP(798) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(594) ! used 2 times - TMP_JAMP(2561) = TMP_JAMP(798) - AMP(534) ! used 2 times - TMP_JAMP(2560) = TMP_JAMP(1706) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(796) ! 
used 2 times - TMP_JAMP(2559) = TMP_JAMP(1715) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1712) ! used 2 times - TMP_JAMP(2558) = TMP_JAMP(1798) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1709) ! used 2 times - TMP_JAMP(2557) = TMP_JAMP(1798) + TMP_JAMP(1713) ! used 2 times - TMP_JAMP(2556) = TMP_JAMP(1989) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(800) ! used 2 times - TMP_JAMP(2555) = AMP(534) - AMP(708) ! used 2 times - TMP_JAMP(2554) = TMP_JAMP(1163) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(783) ! used 2 times - TMP_JAMP(2553) = TMP_JAMP(1903) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1769) ! used 2 times - TMP_JAMP(2552) = TMP_JAMP(1972) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(762) ! used 2 times - TMP_JAMP(2551) = TMP_JAMP(1972) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1134) ! used 2 times - TMP_JAMP(2550) = TMP_JAMP(1972) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1971) ! used 2 times - TMP_JAMP(2549) = TMP_JAMP(1973) - TMP_JAMP(1163) ! used 2 times - TMP_JAMP(2548) = AMP(380) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(390) ! used 2 times - TMP_JAMP(2547) = TMP_JAMP(1700) - TMP_JAMP(780) ! used 2 times - TMP_JAMP(2546) = TMP_JAMP(1741) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(778) ! used 2 times - TMP_JAMP(2545) = TMP_JAMP(1979) + TMP_JAMP(1970) ! used 2 times - TMP_JAMP(2544) = AMP(549) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1296) ! used 2 times - TMP_JAMP(2543) = TMP_JAMP(768) + TMP_JAMP(767) ! used 2 times - TMP_JAMP(2542) = TMP_JAMP(1737) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1162) ! used 2 times - TMP_JAMP(2541) = TMP_JAMP(1971) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1901) ! used 2 times - TMP_JAMP(2540) = TMP_JAMP(759) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1454) ! used 2 times - TMP_JAMP(2539) = TMP_JAMP(1689) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(759) ! used 2 times - TMP_JAMP(2538) = TMP_JAMP(1691) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1289) ! used 2 times - TMP_JAMP(2537) = TMP_JAMP(1715) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1689) ! used 2 times - TMP_JAMP(2536) = TMP_JAMP(1769) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(906) ! used 2 times - TMP_JAMP(2535) = TMP_JAMP(1900) - AMP(91) ! used 2 times - TMP_JAMP(2534) = TMP_JAMP(1991) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1715) ! used 2 times - TMP_JAMP(2533) = TMP_JAMP(1991) - TMP_JAMP(1904) ! used 2 times - TMP_JAMP(2532) = TMP_JAMP(749) + AMP(1636) ! used 2 times - TMP_JAMP(2531) = TMP_JAMP(1680) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(748) ! used 2 times - TMP_JAMP(2530) = TMP_JAMP(1682) - AMP(94) ! used 2 times - TMP_JAMP(2529) = AMP(1119) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1830) ! used 2 times - TMP_JAMP(2528) = AMP(830) + AMP(832) ! used 2 times - TMP_JAMP(2527) = AMP(814) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(830) ! used 2 times - TMP_JAMP(2526) = AMP(686) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1119) ! used 2 times - TMP_JAMP(2525) = AMP(686) - AMP(832) ! 
used 2 times - TMP_JAMP(2524) = TMP_JAMP(1673) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(736) ! used 2 times - TMP_JAMP(2523) = TMP_JAMP(1678) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(814) ! used 2 times - TMP_JAMP(2522) = TMP_JAMP(1778) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1672) ! used 2 times - TMP_JAMP(2521) = TMP_JAMP(1778) + TMP_JAMP(1678) ! used 2 times - TMP_JAMP(2520) = TMP_JAMP(1899) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(734) ! used 2 times - TMP_JAMP(2519) = TMP_JAMP(1978) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1677) ! used 2 times - TMP_JAMP(2518) = AMP(832) - AMP(866) ! used 2 times - TMP_JAMP(2517) = TMP_JAMP(721) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1205) ! used 2 times - TMP_JAMP(2516) = TMP_JAMP(1160) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(722) ! used 2 times - TMP_JAMP(2515) = TMP_JAMP(1770) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1160) ! used 2 times - TMP_JAMP(2514) = TMP_JAMP(1968) + TMP_JAMP(720) ! used 2 times - TMP_JAMP(2513) = TMP_JAMP(1968) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1783) ! used 2 times - TMP_JAMP(2512) = AMP(815) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(850) ! used 2 times - TMP_JAMP(2511) = TMP_JAMP(1664) + TMP_JAMP(717) ! used 2 times - TMP_JAMP(2510) = TMP_JAMP(1725) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(715) ! used 2 times - TMP_JAMP(2509) = TMP_JAMP(1967) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(527) ! used 2 times - TMP_JAMP(2508) = TMP_JAMP(1028) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1006) ! used 2 times - TMP_JAMP(2507) = TMP_JAMP(1159) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(709) ! used 2 times - TMP_JAMP(2506) = TMP_JAMP(1674) - TMP_JAMP(1656) ! used 2 times - TMP_JAMP(2505) = TMP_JAMP(1736) + TMP_JAMP(1679) ! used 2 times - TMP_JAMP(2504) = AMP(925) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(943) ! used 2 times - TMP_JAMP(2503) = TMP_JAMP(1653) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(925) ! used 2 times - TMP_JAMP(2502) = TMP_JAMP(1899) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(702) ! used 2 times - TMP_JAMP(2501) = TMP_JAMP(1900) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1653) ! used 2 times - TMP_JAMP(2500) = AMP(346) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(462) ! used 2 times - TMP_JAMP(2499) = AMP(65) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(80) ! used 2 times - TMP_JAMP(2498) = TMP_JAMP(681) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(346) ! used 2 times - TMP_JAMP(2497) = TMP_JAMP(1133) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(681) ! used 2 times - TMP_JAMP(2496) = TMP_JAMP(1631) + TMP_JAMP(1133) ! used 2 times - TMP_JAMP(2495) = TMP_JAMP(1638) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(678) ! used 2 times - TMP_JAMP(2494) = TMP_JAMP(1837) - TMP_JAMP(1762) ! used 2 times - TMP_JAMP(2493) = TMP_JAMP(1928) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1640) ! used 2 times - TMP_JAMP(2492) = TMP_JAMP(1986) - TMP_JAMP(1631) ! used 2 times - TMP_JAMP(2491) = TMP_JAMP(1986) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1927) ! 
used 2 times - TMP_JAMP(2490) = TMP_JAMP(666) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(505) ! used 2 times - TMP_JAMP(2489) = TMP_JAMP(1622) + TMP_JAMP(1131) ! used 2 times - TMP_JAMP(2488) = TMP_JAMP(1747) + AMP(1030) ! used 2 times - TMP_JAMP(2487) = TMP_JAMP(1983) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1926) ! used 2 times - TMP_JAMP(2486) = AMP(583) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(627) ! used 2 times - TMP_JAMP(2485) = AMP(546) + ((0.000000000000000D+00, + TMP_JAMP(1273) = TMP_JAMP(1030) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(899) ! used 2 times + TMP_JAMP(1272) = TMP_JAMP(1029) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(968) ! used 2 times + TMP_JAMP(1271) = TMP_JAMP(1022) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(953) ! used 2 times + TMP_JAMP(1270) = TMP_JAMP(1020) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(976) ! used 2 times + TMP_JAMP(1269) = TMP_JAMP(1020) + TMP_JAMP(992) ! used 2 times + TMP_JAMP(1268) = TMP_JAMP(1013) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(976) ! used 2 times + TMP_JAMP(1267) = TMP_JAMP(1013) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(948) ! used 2 times + TMP_JAMP(1266) = TMP_JAMP(1010) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(961) ! used 2 times + TMP_JAMP(1265) = TMP_JAMP(1010) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(956) ! used 2 times + TMP_JAMP(1264) = TMP_JAMP(1006) + TMP_JAMP(1003) ! used 2 times + TMP_JAMP(1263) = TMP_JAMP(1005) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(888) ! used 2 times + TMP_JAMP(1262) = TMP_JAMP(1004) + TMP_JAMP(995) ! used 2 times + TMP_JAMP(1261) = TMP_JAMP(1003) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(937) ! used 2 times + TMP_JAMP(1260) = TMP_JAMP(1000) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(930) ! used 2 times + TMP_JAMP(1259) = TMP_JAMP(998) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(915) ! used 2 times + TMP_JAMP(1258) = TMP_JAMP(997) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(885) ! used 2 times + TMP_JAMP(1257) = TMP_JAMP(994) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(887) ! used 2 times + TMP_JAMP(1256) = TMP_JAMP(994) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(861) ! used 2 times + TMP_JAMP(1255) = TMP_JAMP(992) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(859) ! used 2 times + TMP_JAMP(1254) = TMP_JAMP(991) - TMP_JAMP(986) ! used 2 times + TMP_JAMP(1253) = TMP_JAMP(989) - AMP(594) ! used 2 times + TMP_JAMP(1252) = TMP_JAMP(989) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(679) ! used 2 times + TMP_JAMP(1251) = TMP_JAMP(987) - TMP_JAMP(946) ! used 2 times + TMP_JAMP(1250) = TMP_JAMP(985) + TMP_JAMP(923) ! used 2 times + TMP_JAMP(1249) = TMP_JAMP(984) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(852) ! used 2 times + TMP_JAMP(1248) = TMP_JAMP(984) + TMP_JAMP(968) ! used 2 times + TMP_JAMP(1247) = TMP_JAMP(983) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(779) ! used 2 times + TMP_JAMP(1246) = TMP_JAMP(983) + TMP_JAMP(967) ! used 2 times + TMP_JAMP(1245) = TMP_JAMP(982) + TMP_JAMP(921) ! used 2 times + TMP_JAMP(1244) = TMP_JAMP(981) - TMP_JAMP(876) ! 
used 2 times + TMP_JAMP(1243) = TMP_JAMP(979) + AMP(509) ! used 2 times + TMP_JAMP(1242) = TMP_JAMP(978) + TMP_JAMP(851) ! used 2 times + TMP_JAMP(1241) = TMP_JAMP(976) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(786) ! used 2 times + TMP_JAMP(1240) = TMP_JAMP(976) - TMP_JAMP(877) ! used 2 times + TMP_JAMP(1239) = TMP_JAMP(975) + TMP_JAMP(971) ! used 2 times + TMP_JAMP(1238) = TMP_JAMP(974) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(830) ! used 2 times + TMP_JAMP(1237) = TMP_JAMP(974) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(671) ! used 2 times + TMP_JAMP(1236) = TMP_JAMP(973) - TMP_JAMP(966) ! used 2 times + TMP_JAMP(1235) = TMP_JAMP(972) + TMP_JAMP(954) ! used 2 times + TMP_JAMP(1234) = TMP_JAMP(969) - TMP_JAMP(905) ! used 2 times + TMP_JAMP(1233) = TMP_JAMP(968) + TMP_JAMP(850) ! used 2 times + TMP_JAMP(1232) = TMP_JAMP(967) + TMP_JAMP(905) ! used 2 times + TMP_JAMP(1231) = TMP_JAMP(967) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(775) ! used 2 times + TMP_JAMP(1230) = TMP_JAMP(963) + TMP_JAMP(871) ! used 2 times + TMP_JAMP(1229) = TMP_JAMP(960) + TMP_JAMP(849) ! used 2 times + TMP_JAMP(1228) = TMP_JAMP(960) - AMP(762) ! used 2 times + TMP_JAMP(1227) = TMP_JAMP(959) - AMP(667) ! used 2 times + TMP_JAMP(1226) = TMP_JAMP(957) + TMP_JAMP(955) ! used 2 times + TMP_JAMP(1225) = TMP_JAMP(953) + TMP_JAMP(890) ! used 2 times + TMP_JAMP(1224) = TMP_JAMP(953) + TMP_JAMP(928) ! used 2 times + TMP_JAMP(1223) = TMP_JAMP(950) - TMP_JAMP(947) ! used 2 times + TMP_JAMP(1222) = TMP_JAMP(945) - TMP_JAMP(879) ! used 2 times + TMP_JAMP(1221) = TMP_JAMP(942) + TMP_JAMP(868) ! used 2 times + TMP_JAMP(1220) = TMP_JAMP(941) - TMP_JAMP(866) ! used 2 times + TMP_JAMP(1219) = TMP_JAMP(939) - TMP_JAMP(918) ! used 2 times + TMP_JAMP(1218) = TMP_JAMP(939) + TMP_JAMP(888) ! used 2 times + TMP_JAMP(1217) = TMP_JAMP(938) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(773) ! used 2 times + TMP_JAMP(1216) = TMP_JAMP(937) + TMP_JAMP(918) ! used 2 times + TMP_JAMP(1215) = TMP_JAMP(936) - TMP_JAMP(934) ! used 2 times + TMP_JAMP(1214) = TMP_JAMP(935) - TMP_JAMP(916) ! used 2 times + TMP_JAMP(1213) = TMP_JAMP(934) - TMP_JAMP(926) ! used 2 times + TMP_JAMP(1212) = TMP_JAMP(933) - TMP_JAMP(926) ! used 2 times + TMP_JAMP(1211) = TMP_JAMP(933) - TMP_JAMP(862) ! used 2 times + TMP_JAMP(1210) = TMP_JAMP(931) - AMP(933) ! used 2 times + TMP_JAMP(1209) = TMP_JAMP(928) - TMP_JAMP(924) ! used 2 times + TMP_JAMP(1208) = TMP_JAMP(927) - TMP_JAMP(862) ! used 2 times + TMP_JAMP(1207) = TMP_JAMP(926) + AMP(921) ! used 2 times + TMP_JAMP(1206) = TMP_JAMP(924) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(840) ! used 2 times + TMP_JAMP(1205) = TMP_JAMP(924) - TMP_JAMP(884) ! used 2 times + TMP_JAMP(1204) = TMP_JAMP(923) + TMP_JAMP(918) ! used 2 times + TMP_JAMP(1203) = TMP_JAMP(923) - TMP_JAMP(919) ! used 2 times + TMP_JAMP(1202) = TMP_JAMP(922) - TMP_JAMP(920) ! used 2 times + TMP_JAMP(1201) = TMP_JAMP(922) + TMP_JAMP(917) ! used 2 times + TMP_JAMP(1200) = TMP_JAMP(922) + TMP_JAMP(908) ! used 2 times + TMP_JAMP(1199) = TMP_JAMP(921) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(826) ! used 2 times + TMP_JAMP(1198) = TMP_JAMP(920) - TMP_JAMP(851) ! used 2 times + TMP_JAMP(1197) = TMP_JAMP(919) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(779) ! used 2 times + TMP_JAMP(1196) = TMP_JAMP(919) + TMP_JAMP(901) ! 
used 2 times + TMP_JAMP(1195) = TMP_JAMP(917) + AMP(921) ! used 2 times + TMP_JAMP(1194) = TMP_JAMP(917) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(760) ! used 2 times + TMP_JAMP(1193) = TMP_JAMP(916) + AMP(827) ! used 2 times + TMP_JAMP(1192) = TMP_JAMP(915) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(759) ! used 2 times + TMP_JAMP(1191) = TMP_JAMP(912) - TMP_JAMP(908) ! used 2 times + TMP_JAMP(1190) = TMP_JAMP(911) + TMP_JAMP(873) ! used 2 times + TMP_JAMP(1189) = TMP_JAMP(909) - TMP_JAMP(880) ! used 2 times + TMP_JAMP(1188) = TMP_JAMP(908) - TMP_JAMP(906) ! used 2 times + TMP_JAMP(1187) = TMP_JAMP(906) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(786) ! used 2 times + TMP_JAMP(1186) = TMP_JAMP(906) - TMP_JAMP(873) ! used 2 times + TMP_JAMP(1185) = TMP_JAMP(905) - TMP_JAMP(901) ! used 2 times + TMP_JAMP(1184) = TMP_JAMP(904) + TMP_JAMP(897) ! used 2 times + TMP_JAMP(1183) = TMP_JAMP(903) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(808) ! used 2 times + TMP_JAMP(1182) = TMP_JAMP(903) + TMP_JAMP(902) ! used 2 times + TMP_JAMP(1181) = TMP_JAMP(902) + TMP_JAMP(725) ! used 2 times + TMP_JAMP(1180) = TMP_JAMP(901) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(775) ! used 2 times + TMP_JAMP(1179) = TMP_JAMP(900) - TMP_JAMP(897) ! used 2 times + TMP_JAMP(1178) = TMP_JAMP(898) + TMP_JAMP(891) ! used 2 times + TMP_JAMP(1177) = TMP_JAMP(896) + TMP_JAMP(891) ! used 2 times + TMP_JAMP(1176) = TMP_JAMP(894) + TMP_JAMP(737) ! used 2 times + TMP_JAMP(1175) = TMP_JAMP(894) + TMP_JAMP(888) ! used 2 times + TMP_JAMP(1174) = TMP_JAMP(892) - AMP(664) ! used 2 times + TMP_JAMP(1173) = TMP_JAMP(892) - TMP_JAMP(890) ! used 2 times + TMP_JAMP(1172) = TMP_JAMP(891) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(856) ! used 2 times + TMP_JAMP(1171) = TMP_JAMP(889) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(815) ! used 2 times + TMP_JAMP(1170) = TMP_JAMP(889) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(810) ! used 2 times + TMP_JAMP(1169) = TMP_JAMP(887) - TMP_JAMP(884) ! used 2 times + TMP_JAMP(1168) = TMP_JAMP(887) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(767) ! used 2 times + TMP_JAMP(1167) = TMP_JAMP(886) + AMP(824) ! used 2 times + TMP_JAMP(1166) = TMP_JAMP(886) - TMP_JAMP(884) ! used 2 times + TMP_JAMP(1165) = TMP_JAMP(885) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(766) ! used 2 times + TMP_JAMP(1164) = TMP_JAMP(881) - AMP(496) ! used 2 times + TMP_JAMP(1163) = TMP_JAMP(881) + TMP_JAMP(879) ! used 2 times + TMP_JAMP(1162) = TMP_JAMP(880) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(858) ! used 2 times + TMP_JAMP(1161) = TMP_JAMP(878) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(835) ! used 2 times + TMP_JAMP(1160) = TMP_JAMP(878) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(829) ! used 2 times + TMP_JAMP(1159) = TMP_JAMP(877) + TMP_JAMP(865) ! used 2 times + TMP_JAMP(1158) = TMP_JAMP(876) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(838) ! used 2 times + TMP_JAMP(1157) = TMP_JAMP(872) + AMP(646) ! used 2 times + TMP_JAMP(1156) = TMP_JAMP(870) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(804) ! used 2 times + TMP_JAMP(1155) = TMP_JAMP(870) + TMP_JAMP(861) ! used 2 times + TMP_JAMP(1154) = TMP_JAMP(870) + TMP_JAMP(869) ! used 2 times + TMP_JAMP(1153) = TMP_JAMP(869) + AMP(649) ! 
used 2 times + TMP_JAMP(1152) = TMP_JAMP(868) + AMP(637) ! used 2 times + TMP_JAMP(1151) = TMP_JAMP(866) + AMP(487) ! used 2 times + TMP_JAMP(1150) = TMP_JAMP(865) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(786) ! used 2 times + TMP_JAMP(1149) = TMP_JAMP(865) - AMP(491) ! used 2 times + TMP_JAMP(1148) = TMP_JAMP(862) - AMP(806) ! used 2 times + TMP_JAMP(1147) = TMP_JAMP(861) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(767) ! used 2 times + TMP_JAMP(1146) = TMP_JAMP(860) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(763) ! used 2 times + TMP_JAMP(1145) = TMP_JAMP(860) - AMP(809) ! used 2 times + TMP_JAMP(1144) = TMP_JAMP(859) - AMP(806) ! used 2 times + TMP_JAMP(1143) = TMP_JAMP(859) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(760) ! used 2 times + TMP_JAMP(1142) = TMP_JAMP(858) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(490) ! used 2 times + TMP_JAMP(1141) = TMP_JAMP(856) - TMP_JAMP(804) ! used 2 times + TMP_JAMP(1140) = TMP_JAMP(856) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(650) ! used 2 times + TMP_JAMP(1139) = TMP_JAMP(854) - TMP_JAMP(853) ! used 2 times + TMP_JAMP(1138) = TMP_JAMP(854) + TMP_JAMP(847) ! used 2 times + TMP_JAMP(1137) = TMP_JAMP(849) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(805) ! used 2 times + TMP_JAMP(1136) = TMP_JAMP(843) - TMP_JAMP(802) ! used 2 times + TMP_JAMP(1135) = TMP_JAMP(842) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(820) ! used 2 times + TMP_JAMP(1134) = TMP_JAMP(841) + TMP_JAMP(756) ! used 2 times + TMP_JAMP(1133) = TMP_JAMP(838) + TMP_JAMP(678) ! used 2 times + TMP_JAMP(1132) = TMP_JAMP(838) + TMP_JAMP(789) ! used 2 times + TMP_JAMP(1131) = TMP_JAMP(837) + TMP_JAMP(828) ! used 2 times + TMP_JAMP(1130) = TMP_JAMP(837) + TMP_JAMP(670) ! used 2 times + TMP_JAMP(1129) = TMP_JAMP(833) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(479) ! used 2 times + TMP_JAMP(1128) = TMP_JAMP(832) - TMP_JAMP(782) ! used 2 times + TMP_JAMP(1127) = TMP_JAMP(827) - TMP_JAMP(772) ! used 2 times + TMP_JAMP(1126) = TMP_JAMP(825) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(508) ! used 2 times + TMP_JAMP(1125) = TMP_JAMP(824) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(742) ! used 2 times + TMP_JAMP(1124) = TMP_JAMP(823) + ((0.000000000000000D+00, $ -1.000000000000000D+00)) * AMP(583) ! used 2 times - TMP_JAMP(2484) = TMP_JAMP(654) - AMP(466) ! used 2 times - TMP_JAMP(2483) = TMP_JAMP(1619) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(656) ! used 2 times - TMP_JAMP(2482) = TMP_JAMP(1620) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(546) ! used 2 times - TMP_JAMP(2481) = TMP_JAMP(1713) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(387) ! used 2 times - TMP_JAMP(2480) = TMP_JAMP(1713) - TMP_JAMP(1621) ! used 2 times - TMP_JAMP(2479) = TMP_JAMP(1792) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(654) ! used 2 times - TMP_JAMP(2478) = TMP_JAMP(1895) - TMP_JAMP(1792) ! used 2 times - TMP_JAMP(2477) = TMP_JAMP(1926) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1620) ! used 2 times - TMP_JAMP(2476) = TMP_JAMP(1927) + TMP_JAMP(1926) ! used 2 times - TMP_JAMP(2475) = AMP(110) + AMP(1629) ! used 2 times - TMP_JAMP(2474) = TMP_JAMP(646) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(74) ! 
used 2 times - TMP_JAMP(2473) = TMP_JAMP(1163) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(644) ! used 2 times - TMP_JAMP(2472) = TMP_JAMP(1610) - TMP_JAMP(1148) ! used 2 times - TMP_JAMP(2471) = TMP_JAMP(1611) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(645) ! used 2 times - TMP_JAMP(2470) = TMP_JAMP(1614) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1611) ! used 2 times - TMP_JAMP(2469) = TMP_JAMP(1705) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1163) ! used 2 times - TMP_JAMP(2468) = TMP_JAMP(1895) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1705) ! used 2 times - TMP_JAMP(2467) = TMP_JAMP(1928) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1612) ! used 2 times - TMP_JAMP(2466) = AMP(1109) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1152) ! used 2 times - TMP_JAMP(2465) = AMP(845) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1109) ! used 2 times - TMP_JAMP(2464) = TMP_JAMP(1607) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(845) ! used 2 times - TMP_JAMP(2463) = TMP_JAMP(1608) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(637) ! used 2 times - TMP_JAMP(2462) = TMP_JAMP(1678) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(827) ! used 2 times - TMP_JAMP(2461) = TMP_JAMP(1678) - TMP_JAMP(1609) ! used 2 times - TMP_JAMP(2460) = TMP_JAMP(1927) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1607) ! used 2 times - TMP_JAMP(2459) = TMP_JAMP(1927) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1894) ! used 2 times - TMP_JAMP(2458) = TMP_JAMP(1160) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(630) ! used 2 times - TMP_JAMP(2457) = TMP_JAMP(1601) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(631) ! used 2 times - TMP_JAMP(2456) = TMP_JAMP(1602) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1601) ! used 2 times - TMP_JAMP(2455) = TMP_JAMP(1668) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1160) ! used 2 times - TMP_JAMP(2454) = TMP_JAMP(1894) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1668) ! used 2 times - TMP_JAMP(2453) = TMP_JAMP(621) + AMP(388) ! used 2 times - TMP_JAMP(2452) = TMP_JAMP(653) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(648) ! used 2 times - TMP_JAMP(2451) = TMP_JAMP(1619) - AMP(385) ! used 2 times - TMP_JAMP(2450) = TMP_JAMP(1619) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1596) ! used 2 times - TMP_JAMP(2449) = TMP_JAMP(1619) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(1600) ! used 2 times - TMP_JAMP(2448) = TMP_JAMP(1639) + AMP(64) ! used 2 times - TMP_JAMP(2447) = TMP_JAMP(1639) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1599) ! used 2 times - TMP_JAMP(2446) = TMP_JAMP(1712) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(621) ! used 2 times - TMP_JAMP(2445) = TMP_JAMP(1712) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1597) ! used 2 times - TMP_JAMP(2444) = TMP_JAMP(1762) - TMP_JAMP(888) ! used 2 times - TMP_JAMP(2443) = TMP_JAMP(1892) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1621) ! used 2 times - TMP_JAMP(2442) = TMP_JAMP(1893) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1891) ! 
[Auto-generated Fortran hunk, condensed: this part of the patch regenerates
the common-subexpression temporaries in the colour-flow code. In this
excerpt, the old assignments TMP_JAMP(1992) through TMP_JAMP(2744) are
removed and a renumbered, more compact set, TMP_JAMP(1066) through
TMP_JAMP(1708), is added in their place. Each assignment combines previously
computed TMP_JAMP and AMP terms with unit real or imaginary coefficients,
and a trailing comment records how often the temporary is reused (2, 8, or
16 times in this excerpt). Representative lines:

-      TMP_JAMP(2441) = TMP_JAMP(616) + ((-0.000000000000000D+00
-     $ ,1.000000000000000D+00)) * AMP(1042)  ! used 2 times
+      TMP_JAMP(1123) = TMP_JAMP(821) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(591)  ! used 2 times
+      TMP_JAMP(1479) = AMP(1371) - AMP(1381)  ! used 16 times]
used 8 times + TMP_JAMP(1650) = TMP_JAMP(1502) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1150) ! used 8 times + TMP_JAMP(1649) = TMP_JAMP(1502) - TMP_JAMP(1501) ! used 8 times + TMP_JAMP(1648) = TMP_JAMP(1502) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1149) ! used 8 times + TMP_JAMP(1647) = TMP_JAMP(1501) - AMP(1374) ! used 8 times + TMP_JAMP(1646) = TMP_JAMP(1499) + TMP_JAMP(1496) ! used 8 times + TMP_JAMP(1645) = TMP_JAMP(1498) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1225) ! used 8 times + TMP_JAMP(1644) = TMP_JAMP(1498) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1226) ! used 8 times + TMP_JAMP(1643) = TMP_JAMP(1496) + TMP_JAMP(1495) ! used 8 times + TMP_JAMP(1642) = TMP_JAMP(1496) + TMP_JAMP(1494) ! used 8 times + TMP_JAMP(1641) = TMP_JAMP(1496) - TMP_JAMP(1484) ! used 8 times + TMP_JAMP(1640) = TMP_JAMP(1495) + AMP(1094) ! used 8 times + TMP_JAMP(1639) = TMP_JAMP(1495) + TMP_JAMP(1484) ! used 8 times + TMP_JAMP(1638) = TMP_JAMP(1495) + TMP_JAMP(1493) ! used 8 times + TMP_JAMP(1637) = TMP_JAMP(1494) - AMP(1350) ! used 8 times + TMP_JAMP(1636) = TMP_JAMP(1494) + TMP_JAMP(1493) ! used 8 times + TMP_JAMP(1635) = TMP_JAMP(1494) - TMP_JAMP(1483) ! used 8 times + TMP_JAMP(1634) = TMP_JAMP(1494) - TMP_JAMP(1489) ! used 8 times + TMP_JAMP(1633) = TMP_JAMP(1493) - AMP(1094) ! used 8 times + TMP_JAMP(1632) = TMP_JAMP(1493) - TMP_JAMP(1492) ! used 8 times + TMP_JAMP(1631) = TMP_JAMP(1493) + TMP_JAMP(1483) ! used 8 times + TMP_JAMP(1630) = TMP_JAMP(1491) + TMP_JAMP(1489) ! used 8 times + TMP_JAMP(1629) = TMP_JAMP(1490) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1228) ! used 8 times + TMP_JAMP(1628) = TMP_JAMP(1490) - TMP_JAMP(1489) ! used 8 times + TMP_JAMP(1627) = TMP_JAMP(1490) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1227) ! used 8 times + TMP_JAMP(1626) = TMP_JAMP(1489) - AMP(1350) ! used 8 times + TMP_JAMP(1625) = TMP_JAMP(1488) - TMP_JAMP(1486) ! used 8 times + TMP_JAMP(1624) = TMP_JAMP(1488) + TMP_JAMP(1484) ! used 8 times + TMP_JAMP(1623) = TMP_JAMP(1487) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1305) ! used 8 times + TMP_JAMP(1622) = TMP_JAMP(1487) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1306) ! used 8 times + TMP_JAMP(1621) = TMP_JAMP(1486) + AMP(1250) ! used 8 times + TMP_JAMP(1620) = TMP_JAMP(1486) + TMP_JAMP(1485) ! used 8 times + TMP_JAMP(1619) = TMP_JAMP(1485) - AMP(1250) ! used 8 times + TMP_JAMP(1618) = TMP_JAMP(1485) - TMP_JAMP(1481) ! used 8 times + TMP_JAMP(1617) = TMP_JAMP(1484) + TMP_JAMP(1483) ! used 8 times + TMP_JAMP(1616) = TMP_JAMP(1483) - TMP_JAMP(1480) ! used 8 times + TMP_JAMP(1615) = TMP_JAMP(1482) - TMP_JAMP(1481) ! used 8 times + TMP_JAMP(1614) = TMP_JAMP(1482) + TMP_JAMP(1480) ! used 8 times + TMP_JAMP(1613) = TMP_JAMP(1482) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1303) ! used 8 times + TMP_JAMP(1612) = TMP_JAMP(1481) + TMP_JAMP(1480) ! used 8 times + TMP_JAMP(1611) = TMP_JAMP(1479) + TMP_JAMP(1476) ! used 8 times + TMP_JAMP(1610) = TMP_JAMP(1479) - TMP_JAMP(1448) ! used 8 times + TMP_JAMP(1609) = TMP_JAMP(1479) - AMP(1374) ! used 8 times + TMP_JAMP(1608) = TMP_JAMP(1476) + AMP(1326) ! used 8 times + TMP_JAMP(1607) = TMP_JAMP(1476) + TMP_JAMP(1448) ! used 8 times + TMP_JAMP(1606) = TMP_JAMP(1448) + AMP(1350) ! used 8 times + TMP_JAMP(1605) = TMP_JAMP(1422) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(948) ! 
used 8 times + TMP_JAMP(1604) = TMP_JAMP(1422) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(951) ! used 8 times + TMP_JAMP(1603) = TMP_JAMP(1421) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(946) ! used 8 times + TMP_JAMP(1602) = TMP_JAMP(1421) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(949) ! used 8 times + TMP_JAMP(1601) = TMP_JAMP(1420) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(947) ! used 8 times + TMP_JAMP(1600) = TMP_JAMP(1420) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(950) ! used 8 times + TMP_JAMP(1599) = AMP(1122) + AMP(1123) ! used 8 times + TMP_JAMP(1598) = AMP(1117) + AMP(1125) ! used 8 times + TMP_JAMP(1597) = AMP(1068) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1380) ! used 8 times + TMP_JAMP(1596) = AMP(1064) - AMP(1066) ! used 8 times + TMP_JAMP(1595) = AMP(1046) - AMP(1048) ! used 8 times + TMP_JAMP(1594) = AMP(1044) + AMP(1050) ! used 8 times + TMP_JAMP(1593) = AMP(1200) + AMP(1201) ! used 8 times + TMP_JAMP(1592) = AMP(1172) + AMP(1376) ! used 8 times + TMP_JAMP(1591) = AMP(1166) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1202) ! used 8 times + TMP_JAMP(1590) = AMP(1166) + AMP(1172) ! used 8 times + TMP_JAMP(1589) = AMP(1212) + AMP(1213) ! used 8 times + TMP_JAMP(1588) = AMP(1207) + AMP(1215) ! used 8 times + TMP_JAMP(1587) = AMP(1195) + AMP(1203) ! used 8 times + TMP_JAMP(1586) = AMP(1088) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1124) ! used 8 times + TMP_JAMP(1585) = AMP(1062) + AMP(1067) ! used 8 times + TMP_JAMP(1584) = AMP(1058) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1355) ! used 8 times + TMP_JAMP(1583) = AMP(1055) - AMP(1057) ! used 8 times + TMP_JAMP(1582) = AMP(1094) + AMP(1352) ! used 8 times + TMP_JAMP(1581) = AMP(1088) + AMP(1094) ! used 8 times + TMP_JAMP(1580) = AMP(1134) + AMP(1135) ! used 8 times + TMP_JAMP(1579) = AMP(1129) + AMP(1137) ! used 8 times + TMP_JAMP(1578) = AMP(1053) + AMP(1059) ! used 8 times + TMP_JAMP(1577) = AMP(1172) + AMP(1250) ! used 8 times + TMP_JAMP(1576) = AMP(1290) + AMP(1291) ! used 8 times + TMP_JAMP(1575) = AMP(1247) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1292) ! used 8 times + TMP_JAMP(1574) = AMP(1094) + AMP(1253) ! used 8 times + TMP_JAMP(1573) = AMP(1299) + AMP(1300) ! used 8 times + TMP_JAMP(1572) = AMP(1244) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1301) ! used 8 times + TMP_JAMP(1571) = AMP(1285) + AMP(1293) ! used 8 times + TMP_JAMP(1570) = AMP(1294) + AMP(1302) ! used 8 times + TMP_JAMP(1569) = AMP(1049) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1331) ! used 8 times + TMP_JAMP(1568) = AMP(1169) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1214) ! used 8 times + TMP_JAMP(1567) = AMP(1091) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1136) ! used 8 times + TMP_JAMP(1566) = AMP(1110) + AMP(1111) ! used 8 times + TMP_JAMP(1565) = AMP(1105) + AMP(1113) ! used 8 times + TMP_JAMP(1564) = AMP(1278) + AMP(1279) ! used 8 times + TMP_JAMP(1563) = AMP(1250) - AMP(1376) ! used 8 times + TMP_JAMP(1562) = AMP(1244) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1280) ! used 8 times + TMP_JAMP(1561) = AMP(1244) + AMP(1250) ! used 8 times + TMP_JAMP(1560) = AMP(1273) + AMP(1281) ! used 8 times + TMP_JAMP(1559) = AMP(1091) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1112) ! used 8 times + TMP_JAMP(1558) = AMP(1143) + AMP(1144) ! 
used 8 times + TMP_JAMP(1557) = AMP(1138) + AMP(1146) ! used 8 times + TMP_JAMP(1556) = AMP(1221) + AMP(1222) ! used 8 times + TMP_JAMP(1555) = AMP(1166) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1223) ! used 8 times + TMP_JAMP(1554) = AMP(1216) + AMP(1224) ! used 8 times + TMP_JAMP(1553) = AMP(1352) + AMP(1355) ! used 8 times + TMP_JAMP(1552) = AMP(1247) + AMP(1253) ! used 8 times + TMP_JAMP(1551) = AMP(1253) - AMP(1352) ! used 8 times + TMP_JAMP(1550) = AMP(1088) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1145) ! used 8 times + TMP_JAMP(1549) = AMP(1413) + AMP(1414) ! used 8 times + TMP_JAMP(1548) = AMP(1374) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1416) ! used 8 times + TMP_JAMP(1547) = AMP(1188) + AMP(1189) ! used 8 times + TMP_JAMP(1546) = AMP(1183) + AMP(1191) ! used 8 times + TMP_JAMP(1545) = AMP(1411) - AMP(1415) ! used 8 times + TMP_JAMP(1544) = AMP(1266) + AMP(1267) ! used 8 times + TMP_JAMP(1543) = AMP(1247) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1268) ! used 8 times + TMP_JAMP(1542) = AMP(1261) + AMP(1269) ! used 8 times + TMP_JAMP(1541) = AMP(1169) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1190) ! used 8 times + TMP_JAMP(1540) = AMP(1376) + AMP(1380) ! used 8 times + TMP_JAMP(1539) = AMP(1008) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1380) ! used 8 times + TMP_JAMP(1538) = AMP(1004) - AMP(1006) ! used 8 times + TMP_JAMP(1537) = AMP(986) - AMP(988) ! used 8 times + TMP_JAMP(1536) = AMP(984) + AMP(990) ! used 8 times + TMP_JAMP(1535) = AMP(1002) + AMP(1007) ! used 8 times + TMP_JAMP(1534) = AMP(998) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1355) ! used 8 times + TMP_JAMP(1533) = AMP(995) - AMP(997) ! used 8 times + TMP_JAMP(1532) = AMP(993) + AMP(999) ! used 8 times + TMP_JAMP(1531) = AMP(989) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1331) ! used 8 times + TMP_JAMP(1711) = TMP_JAMP(1698) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1538) ! used 8 times + TMP_JAMP(1710) = TMP_JAMP(1694) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1537) ! used 8 times + TMP_JAMP(1709) = TMP_JAMP(1689) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1533) ! used 8 times + TMP_JAMP(1712) = TMP_JAMP(1691) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1583) ! used 6 times + TMP_JAMP(1714) = TMP_JAMP(1697) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1595) ! used 5 times + TMP_JAMP(1713) = TMP_JAMP(1557) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1516) ! used 5 times + TMP_JAMP(1843) = TMP_JAMP(1711) + TMP_JAMP(1710) ! used 4 times + TMP_JAMP(1842) = TMP_JAMP(1711) - TMP_JAMP(1709) ! used 4 times + TMP_JAMP(1841) = TMP_JAMP(1710) + TMP_JAMP(1709) ! used 4 times + TMP_JAMP(1840) = TMP_JAMP(1710) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1531) ! used 4 times + TMP_JAMP(1839) = TMP_JAMP(1709) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1534) ! used 4 times + TMP_JAMP(1838) = TMP_JAMP(1703) - TMP_JAMP(1659) ! used 4 times + TMP_JAMP(1837) = TMP_JAMP(1701) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1596) ! used 4 times + TMP_JAMP(1836) = TMP_JAMP(1685) + TMP_JAMP(1521) ! used 4 times + TMP_JAMP(1835) = TMP_JAMP(1679) - TMP_JAMP(1632) ! 
used 4 times + TMP_JAMP(1834) = TMP_JAMP(1673) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1598) ! used 4 times + TMP_JAMP(1833) = TMP_JAMP(1671) - AMP(1139) ! used 4 times + TMP_JAMP(1832) = TMP_JAMP(1668) - AMP(1041) ! used 4 times + TMP_JAMP(1831) = TMP_JAMP(1664) + TMP_JAMP(1659) ! used 4 times + TMP_JAMP(1830) = TMP_JAMP(1661) + TMP_JAMP(1654) ! used 4 times + TMP_JAMP(1829) = TMP_JAMP(1651) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1585) ! used 4 times + TMP_JAMP(1828) = TMP_JAMP(1648) + AMP(1151) ! used 4 times + TMP_JAMP(1827) = TMP_JAMP(1648) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1535) ! used 4 times + TMP_JAMP(1826) = TMP_JAMP(1645) + AMP(1184) ! used 4 times + TMP_JAMP(1825) = TMP_JAMP(1642) - TMP_JAMP(1606) ! used 4 times + TMP_JAMP(1824) = TMP_JAMP(1635) + TMP_JAMP(1551) ! used 4 times + TMP_JAMP(1823) = TMP_JAMP(1630) - TMP_JAMP(1606) ! used 4 times + TMP_JAMP(1822) = TMP_JAMP(1629) + AMP(1230) ! used 4 times + TMP_JAMP(1821) = TMP_JAMP(1627) + AMP(1229) ! used 4 times + TMP_JAMP(1820) = TMP_JAMP(1627) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1532) ! used 4 times + TMP_JAMP(1819) = TMP_JAMP(1623) + AMP(1307) ! used 4 times + TMP_JAMP(1818) = TMP_JAMP(1623) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1536) ! used 4 times + TMP_JAMP(1817) = TMP_JAMP(1620) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1576) ! used 4 times + TMP_JAMP(1816) = TMP_JAMP(1617) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1573) ! used 4 times + TMP_JAMP(1815) = TMP_JAMP(1615) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1571) ! used 4 times + TMP_JAMP(1814) = TMP_JAMP(1614) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1570) ! used 4 times + TMP_JAMP(1813) = TMP_JAMP(1613) + AMP(1308) ! used 4 times + TMP_JAMP(1812) = TMP_JAMP(1612) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1571) ! used 4 times + TMP_JAMP(1811) = TMP_JAMP(1612) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1570) ! used 4 times + TMP_JAMP(1810) = TMP_JAMP(1607) + AMP(1326) ! used 4 times + TMP_JAMP(1809) = TMP_JAMP(1605) - TMP_JAMP(1603) ! used 4 times + TMP_JAMP(1808) = TMP_JAMP(1605) + TMP_JAMP(1601) ! used 4 times + TMP_JAMP(1807) = TMP_JAMP(1604) - TMP_JAMP(1559) ! used 4 times + TMP_JAMP(1806) = TMP_JAMP(1604) - AMP(1141) ! used 4 times + TMP_JAMP(1805) = TMP_JAMP(1604) + TMP_JAMP(1600) ! used 4 times + TMP_JAMP(1804) = TMP_JAMP(1603) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1569) ! used 4 times + TMP_JAMP(1803) = TMP_JAMP(1603) + TMP_JAMP(1601) ! used 4 times + TMP_JAMP(1802) = TMP_JAMP(1602) - AMP(945) ! used 4 times + TMP_JAMP(1801) = TMP_JAMP(1602) + TMP_JAMP(1600) ! used 4 times + TMP_JAMP(1800) = TMP_JAMP(1602) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1531) ! used 4 times + TMP_JAMP(1799) = TMP_JAMP(1600) - AMP(1219) ! used 4 times + TMP_JAMP(1798) = TMP_JAMP(1600) - TMP_JAMP(1541) ! used 4 times + TMP_JAMP(1797) = TMP_JAMP(1599) - TMP_JAMP(1598) ! used 4 times + TMP_JAMP(1796) = TMP_JAMP(1599) + TMP_JAMP(1593) ! used 4 times + TMP_JAMP(1795) = TMP_JAMP(1599) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1120) ! used 4 times + TMP_JAMP(1794) = TMP_JAMP(1598) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1118) ! 
used 4 times + TMP_JAMP(1793) = TMP_JAMP(1598) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1126) ! used 4 times + TMP_JAMP(1792) = TMP_JAMP(1596) - AMP(1072) ! used 4 times + TMP_JAMP(1791) = TMP_JAMP(1596) - TMP_JAMP(1585) ! used 4 times + TMP_JAMP(1790) = TMP_JAMP(1594) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1041) ! used 4 times + TMP_JAMP(1789) = TMP_JAMP(1594) + TMP_JAMP(1569) ! used 4 times + TMP_JAMP(1788) = TMP_JAMP(1594) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1045) ! used 4 times + TMP_JAMP(1787) = TMP_JAMP(1593) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1198) ! used 4 times + TMP_JAMP(1786) = TMP_JAMP(1589) - TMP_JAMP(1588) ! used 4 times + TMP_JAMP(1785) = TMP_JAMP(1589) + TMP_JAMP(1576) ! used 4 times + TMP_JAMP(1784) = TMP_JAMP(1589) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1568) ! used 4 times + TMP_JAMP(1783) = TMP_JAMP(1588) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1568) ! used 4 times + TMP_JAMP(1782) = TMP_JAMP(1588) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1208) ! used 4 times + TMP_JAMP(1781) = TMP_JAMP(1587) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1204) ! used 4 times + TMP_JAMP(1780) = TMP_JAMP(1587) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1196) ! used 4 times + TMP_JAMP(1779) = TMP_JAMP(1585) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1063) ! used 4 times + TMP_JAMP(1778) = TMP_JAMP(1580) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1132) ! used 4 times + TMP_JAMP(1777) = TMP_JAMP(1580) + TMP_JAMP(1573) ! used 4 times + TMP_JAMP(1776) = TMP_JAMP(1580) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1567) ! used 4 times + TMP_JAMP(1775) = TMP_JAMP(1579) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1126) ! used 4 times + TMP_JAMP(1774) = TMP_JAMP(1579) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1567) ! used 4 times + TMP_JAMP(1773) = TMP_JAMP(1579) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1130) ! used 4 times + TMP_JAMP(1772) = TMP_JAMP(1578) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1054) ! used 4 times + TMP_JAMP(1771) = TMP_JAMP(1578) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1032) ! used 4 times + TMP_JAMP(1770) = TMP_JAMP(1576) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1288) ! used 4 times + TMP_JAMP(1769) = TMP_JAMP(1576) - TMP_JAMP(1571) ! used 4 times + TMP_JAMP(1768) = TMP_JAMP(1573) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1297) ! used 4 times + TMP_JAMP(1767) = TMP_JAMP(1573) - TMP_JAMP(1570) ! used 4 times + TMP_JAMP(1766) = TMP_JAMP(1566) + TMP_JAMP(1564) ! used 4 times + TMP_JAMP(1765) = TMP_JAMP(1566) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1559) ! used 4 times + TMP_JAMP(1764) = TMP_JAMP(1566) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1108) ! used 4 times + TMP_JAMP(1763) = TMP_JAMP(1565) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1106) ! used 4 times + TMP_JAMP(1762) = TMP_JAMP(1565) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1559) ! used 4 times + TMP_JAMP(1761) = TMP_JAMP(1565) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1114) ! used 4 times + TMP_JAMP(1760) = TMP_JAMP(1564) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1276) ! 
used 4 times + TMP_JAMP(1759) = TMP_JAMP(1561) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1280) ! used 4 times + TMP_JAMP(1758) = TMP_JAMP(1561) + AMP(1172) ! used 4 times + TMP_JAMP(1757) = TMP_JAMP(1561) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1301) ! used 4 times + TMP_JAMP(1756) = TMP_JAMP(1560) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1282) ! used 4 times + TMP_JAMP(1755) = TMP_JAMP(1560) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1274) ! used 4 times + TMP_JAMP(1754) = TMP_JAMP(1558) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1141) ! used 4 times + TMP_JAMP(1753) = TMP_JAMP(1558) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1550) ! used 4 times + TMP_JAMP(1752) = TMP_JAMP(1556) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1555) ! used 4 times + TMP_JAMP(1751) = TMP_JAMP(1556) - TMP_JAMP(1554) ! used 4 times + TMP_JAMP(1750) = TMP_JAMP(1555) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1554) ! used 4 times + TMP_JAMP(1749) = TMP_JAMP(1554) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1217) ! used 4 times + TMP_JAMP(1748) = TMP_JAMP(1549) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1548) ! used 4 times + TMP_JAMP(1747) = TMP_JAMP(1549) - TMP_JAMP(1544) ! used 4 times + TMP_JAMP(1746) = TMP_JAMP(1549) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1408) ! used 4 times + TMP_JAMP(1745) = TMP_JAMP(1547) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1541) ! used 4 times + TMP_JAMP(1744) = TMP_JAMP(1547) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1186) ! used 4 times + TMP_JAMP(1743) = TMP_JAMP(1546) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1184) ! used 4 times + TMP_JAMP(1742) = TMP_JAMP(1546) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1541) ! used 4 times + TMP_JAMP(1741) = TMP_JAMP(1546) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1192) ! used 4 times + TMP_JAMP(1740) = TMP_JAMP(1545) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1023) ! used 4 times + TMP_JAMP(1739) = TMP_JAMP(1545) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1410) ! used 4 times + TMP_JAMP(1738) = TMP_JAMP(1544) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1543) ! used 4 times + TMP_JAMP(1737) = TMP_JAMP(1544) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1264) ! used 4 times + TMP_JAMP(1736) = TMP_JAMP(1542) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1270) ! used 4 times + TMP_JAMP(1735) = TMP_JAMP(1542) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1262) ! used 4 times + TMP_JAMP(1734) = TMP_JAMP(1540) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1068) ! used 4 times + TMP_JAMP(1733) = TMP_JAMP(1539) + TMP_JAMP(1535) ! used 4 times + TMP_JAMP(1732) = TMP_JAMP(1536) + TMP_JAMP(1531) ! used 4 times + TMP_JAMP(1731) = TMP_JAMP(1536) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(981) ! used 4 times + TMP_JAMP(1730) = TMP_JAMP(1535) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(963) ! used 4 times + TMP_JAMP(1729) = TMP_JAMP(1534) + TMP_JAMP(1532) ! used 4 times + TMP_JAMP(1728) = TMP_JAMP(1532) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(972) ! used 4 times + TMP_JAMP(1727) = TMP_JAMP(1512) + AMP(985) ! 
used 4 times + TMP_JAMP(1726) = TMP_JAMP(1503) + AMP(1003) ! used 4 times + TMP_JAMP(1725) = TMP_JAMP(1491) + AMP(994) ! used 4 times + TMP_JAMP(1724) = AMP(1118) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1147) ! used 4 times + TMP_JAMP(1723) = AMP(1061) + AMP(1069) ! used 4 times + TMP_JAMP(1722) = AMP(1043) - AMP(1071) ! used 4 times + TMP_JAMP(1721) = AMP(1147) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1152) ! used 4 times + TMP_JAMP(1720) = AMP(1052) - AMP(1070) ! used 4 times + TMP_JAMP(1719) = AMP(1286) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1304) ! used 4 times + TMP_JAMP(1718) = AMP(1295) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1304) ! used 4 times + TMP_JAMP(1717) = AMP(1001) + AMP(1009) ! used 4 times + TMP_JAMP(1716) = AMP(983) - AMP(1011) ! used 4 times + TMP_JAMP(1715) = AMP(992) - AMP(1010) ! used 4 times + TMP_JAMP(1857) = TMP_JAMP(1833) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1713) ! used 4 times + TMP_JAMP(1856) = TMP_JAMP(1813) + TMP_JAMP(1622) ! used 4 times + TMP_JAMP(1855) = TMP_JAMP(1788) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1622) ! used 4 times + TMP_JAMP(1854) = TMP_JAMP(1782) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1644) ! used 4 times + TMP_JAMP(1853) = TMP_JAMP(1781) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1652) ! used 4 times + TMP_JAMP(1852) = TMP_JAMP(1780) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1645) ! used 4 times + TMP_JAMP(1851) = TMP_JAMP(1779) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1650) ! used 4 times + TMP_JAMP(1850) = TMP_JAMP(1773) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1671) ! used 4 times + TMP_JAMP(1849) = TMP_JAMP(1749) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1644) ! used 4 times + TMP_JAMP(1848) = TMP_JAMP(1741) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1652) ! used 4 times + TMP_JAMP(1847) = TMP_JAMP(1731) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1668) ! used 4 times + TMP_JAMP(1846) = TMP_JAMP(1730) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1651) ! used 4 times + TMP_JAMP(1845) = TMP_JAMP(1721) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1650) ! used 4 times + TMP_JAMP(1844) = TMP_JAMP(1720) - TMP_JAMP(1712) ! used 4 times + TMP_JAMP(1862) = TMP_JAMP(1722) - TMP_JAMP(1714) ! used 3 times + TMP_JAMP(1861) = TMP_JAMP(1670) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1557) ! used 3 times + TMP_JAMP(1860) = TMP_JAMP(1631) - TMP_JAMP(1574) ! used 3 times + TMP_JAMP(1859) = TMP_JAMP(1608) + TMP_JAMP(1479) ! used 3 times + TMP_JAMP(1858) = TMP_JAMP(1595) + AMP(1074) ! used 3 times + TMP_JAMP(1863) = TMP_JAMP(1861) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1761) ! used 3 times + TMP_JAMP(2077) = TMP_JAMP(1863) - TMP_JAMP(1807) ! used 2 times + TMP_JAMP(2076) = TMP_JAMP(1862) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1790) ! used 2 times + TMP_JAMP(2075) = TMP_JAMP(1857) + TMP_JAMP(1828) ! used 2 times + TMP_JAMP(2074) = TMP_JAMP(1857) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1558) ! used 2 times + TMP_JAMP(2073) = TMP_JAMP(1856) - TMP_JAMP(1669) ! 
used 2 times + TMP_JAMP(2072) = TMP_JAMP(1855) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1695) ! used 2 times + TMP_JAMP(2071) = TMP_JAMP(1854) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1568) ! used 2 times + TMP_JAMP(2070) = TMP_JAMP(1853) + TMP_JAMP(1786) ! used 2 times + TMP_JAMP(2069) = TMP_JAMP(1852) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1831) ! used 2 times + TMP_JAMP(2068) = TMP_JAMP(1852) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1591) ! used 2 times + TMP_JAMP(2067) = TMP_JAMP(1850) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1567) ! used 2 times + TMP_JAMP(2066) = TMP_JAMP(1849) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1555) ! used 2 times + TMP_JAMP(2065) = TMP_JAMP(1848) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1750) ! used 2 times + TMP_JAMP(2064) = TMP_JAMP(1846) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1711) ! used 2 times + TMP_JAMP(2063) = TMP_JAMP(1845) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1647) ! used 2 times + TMP_JAMP(2062) = TMP_JAMP(1844) - TMP_JAMP(1823) ! used 2 times + TMP_JAMP(2061) = TMP_JAMP(1840) - TMP_JAMP(1716) ! used 2 times + TMP_JAMP(2060) = TMP_JAMP(1839) + TMP_JAMP(1710) ! used 2 times + TMP_JAMP(2059) = TMP_JAMP(1839) - TMP_JAMP(1711) ! used 2 times + TMP_JAMP(2058) = TMP_JAMP(1838) + TMP_JAMP(1521) ! used 2 times + TMP_JAMP(2057) = TMP_JAMP(1837) + TMP_JAMP(1734) ! used 2 times + TMP_JAMP(2056) = TMP_JAMP(1836) + TMP_JAMP(1623) ! used 2 times + TMP_JAMP(2055) = TMP_JAMP(1836) + TMP_JAMP(1622) ! used 2 times + TMP_JAMP(2054) = TMP_JAMP(1835) - TMP_JAMP(1825) ! used 2 times + TMP_JAMP(2053) = TMP_JAMP(1835) + TMP_JAMP(1666) ! used 2 times + TMP_JAMP(2052) = TMP_JAMP(1831) + TMP_JAMP(1810) ! used 2 times + TMP_JAMP(2051) = TMP_JAMP(1831) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1556) ! used 2 times + TMP_JAMP(2050) = TMP_JAMP(1830) + TMP_JAMP(1665) ! used 2 times + TMP_JAMP(2049) = TMP_JAMP(1830) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1743) ! used 2 times + TMP_JAMP(2048) = TMP_JAMP(1830) - TMP_JAMP(1641) ! used 2 times + TMP_JAMP(2047) = TMP_JAMP(1828) - TMP_JAMP(1567) ! used 2 times + TMP_JAMP(2046) = TMP_JAMP(1828) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1733) ! used 2 times + TMP_JAMP(2045) = TMP_JAMP(1827) + TMP_JAMP(1726) ! used 2 times + TMP_JAMP(2044) = TMP_JAMP(1825) - TMP_JAMP(1476) ! used 2 times + TMP_JAMP(2043) = TMP_JAMP(1824) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1760) ! used 2 times + TMP_JAMP(2042) = TMP_JAMP(1824) - AMP(1350) ! used 2 times + TMP_JAMP(2041) = TMP_JAMP(1823) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1728) ! used 2 times + TMP_JAMP(2040) = TMP_JAMP(1822) + TMP_JAMP(1591) ! used 2 times + TMP_JAMP(2039) = TMP_JAMP(1822) - TMP_JAMP(1626) ! used 2 times + TMP_JAMP(2038) = TMP_JAMP(1822) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1742) ! used 2 times + TMP_JAMP(2037) = TMP_JAMP(1821) - TMP_JAMP(1626) ! used 2 times + TMP_JAMP(2036) = TMP_JAMP(1821) + TMP_JAMP(1555) ! used 2 times + TMP_JAMP(2035) = TMP_JAMP(1821) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1729) ! used 2 times + TMP_JAMP(2034) = TMP_JAMP(1820) + TMP_JAMP(1725) ! used 2 times + TMP_JAMP(2033) = TMP_JAMP(1819) - TMP_JAMP(1669) ! 
used 2 times + TMP_JAMP(2032) = TMP_JAMP(1819) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1732) ! used 2 times + TMP_JAMP(2031) = TMP_JAMP(1818) + TMP_JAMP(1727) ! used 2 times + TMP_JAMP(2030) = TMP_JAMP(1815) + TMP_JAMP(1719) ! used 2 times + TMP_JAMP(2029) = TMP_JAMP(1814) - TMP_JAMP(1718) ! used 2 times + TMP_JAMP(2028) = TMP_JAMP(1813) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1569) ! used 2 times + TMP_JAMP(2027) = TMP_JAMP(1812) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1760) ! used 2 times + TMP_JAMP(2026) = TMP_JAMP(1811) + TMP_JAMP(1757) ! used 2 times + TMP_JAMP(2025) = TMP_JAMP(1809) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1795) ! used 2 times + TMP_JAMP(2024) = TMP_JAMP(1808) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1795) ! used 2 times + TMP_JAMP(2023) = TMP_JAMP(1805) - TMP_JAMP(1676) ! used 2 times + TMP_JAMP(2022) = TMP_JAMP(1803) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1787) ! used 2 times + TMP_JAMP(2021) = TMP_JAMP(1800) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(980) ! used 2 times + TMP_JAMP(2020) = TMP_JAMP(1799) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1218) ! used 2 times + TMP_JAMP(2019) = TMP_JAMP(1798) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1751) ! used 2 times + TMP_JAMP(2018) = TMP_JAMP(1795) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1688) ! used 2 times + TMP_JAMP(2017) = TMP_JAMP(1794) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1673) ! used 2 times + TMP_JAMP(2016) = TMP_JAMP(1793) - TMP_JAMP(1774) ! used 2 times + TMP_JAMP(2015) = TMP_JAMP(1792) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1699) ! used 2 times + TMP_JAMP(2014) = TMP_JAMP(1789) - AMP(1040) ! used 2 times + TMP_JAMP(2013) = TMP_JAMP(1787) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1677) ! used 2 times + TMP_JAMP(2012) = TMP_JAMP(1787) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1686) ! used 2 times + TMP_JAMP(2011) = TMP_JAMP(1784) + TMP_JAMP(1769) ! used 2 times + TMP_JAMP(2010) = TMP_JAMP(1784) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1658) ! used 2 times + TMP_JAMP(2009) = TMP_JAMP(1783) - AMP(1205) ! used 2 times + TMP_JAMP(2008) = TMP_JAMP(1779) + TMP_JAMP(1597) ! used 2 times + TMP_JAMP(2007) = TMP_JAMP(1778) - TMP_JAMP(1775) ! used 2 times + TMP_JAMP(2006) = TMP_JAMP(1776) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1636) ! used 2 times + TMP_JAMP(2005) = TMP_JAMP(1776) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1518) ! used 2 times + TMP_JAMP(2004) = TMP_JAMP(1772) + TMP_JAMP(1584) ! used 2 times + TMP_JAMP(2003) = TMP_JAMP(1771) + AMP(1030) ! used 2 times + TMP_JAMP(2002) = TMP_JAMP(1769) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1719) ! used 2 times + TMP_JAMP(2001) = TMP_JAMP(1767) + TMP_JAMP(1736) ! used 2 times + TMP_JAMP(2000) = TMP_JAMP(1767) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1718) ! used 2 times + TMP_JAMP(1999) = TMP_JAMP(1765) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1638) ! used 2 times + TMP_JAMP(1998) = TMP_JAMP(1764) + TMP_JAMP(1760) ! used 2 times + TMP_JAMP(1997) = TMP_JAMP(1763) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1674) ! 
used 2 times + TMP_JAMP(1996) = TMP_JAMP(1762) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1672) ! used 2 times + TMP_JAMP(1995) = TMP_JAMP(1758) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1280) ! used 2 times + TMP_JAMP(1994) = TMP_JAMP(1757) + AMP(1172) ! used 2 times + TMP_JAMP(1993) = TMP_JAMP(1756) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1612) ! used 2 times + TMP_JAMP(1992) = TMP_JAMP(1756) + AMP(1283) ! used 2 times + TMP_JAMP(1991) = TMP_JAMP(1755) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1562) ! used 2 times + TMP_JAMP(1990) = TMP_JAMP(1754) - AMP(1116) ! used 2 times + TMP_JAMP(1989) = TMP_JAMP(1753) + AMP(1140) ! used 2 times + TMP_JAMP(1988) = TMP_JAMP(1752) + AMP(1218) ! used 2 times + TMP_JAMP(1987) = TMP_JAMP(1752) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1677) ! used 2 times + TMP_JAMP(1986) = TMP_JAMP(1746) + TMP_JAMP(1744) ! used 2 times + TMP_JAMP(1985) = TMP_JAMP(1746) - TMP_JAMP(1737) ! used 2 times + TMP_JAMP(1984) = TMP_JAMP(1745) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1660) ! used 2 times + TMP_JAMP(1983) = TMP_JAMP(1744) - AMP(1193) ! used 2 times + TMP_JAMP(1982) = TMP_JAMP(1740) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1723) ! used 2 times + TMP_JAMP(1981) = TMP_JAMP(1740) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1734) ! used 2 times + TMP_JAMP(1980) = TMP_JAMP(1739) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1548) ! used 2 times + TMP_JAMP(1979) = TMP_JAMP(1738) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1620) ! used 2 times + TMP_JAMP(1978) = TMP_JAMP(1737) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1653) ! used 2 times + TMP_JAMP(1977) = TMP_JAMP(1736) + AMP(1271) ! used 2 times + TMP_JAMP(1976) = TMP_JAMP(1735) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1625) ! used 2 times + TMP_JAMP(1975) = TMP_JAMP(1735) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1543) ! used 2 times + TMP_JAMP(1974) = TMP_JAMP(1734) - TMP_JAMP(1705) ! used 2 times + TMP_JAMP(1973) = TMP_JAMP(1728) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1630) ! used 2 times + TMP_JAMP(1972) = TMP_JAMP(1727) - TMP_JAMP(1529) ! used 2 times + TMP_JAMP(1971) = TMP_JAMP(1727) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(987) ! used 2 times + TMP_JAMP(1970) = TMP_JAMP(1726) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1005) ! used 2 times + TMP_JAMP(1969) = TMP_JAMP(1725) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(996) ! used 2 times + TMP_JAMP(1968) = TMP_JAMP(1723) - TMP_JAMP(1700) ! used 2 times + TMP_JAMP(1967) = TMP_JAMP(1717) - TMP_JAMP(1711) ! used 2 times + TMP_JAMP(1966) = TMP_JAMP(1717) + TMP_JAMP(1716) ! used 2 times + TMP_JAMP(1965) = TMP_JAMP(1716) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(979) ! used 2 times + TMP_JAMP(1964) = TMP_JAMP(1715) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(970) ! used 2 times + TMP_JAMP(1963) = TMP_JAMP(1715) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(991) ! used 2 times + TMP_JAMP(1962) = TMP_JAMP(1712) - TMP_JAMP(1692) ! used 2 times + TMP_JAMP(1961) = TMP_JAMP(1707) + TMP_JAMP(1687) ! used 2 times + TMP_JAMP(1960) = TMP_JAMP(1706) - TMP_JAMP(1479) ! used 2 times + TMP_JAMP(1959) = TMP_JAMP(1705) - TMP_JAMP(1526) ! 
used 2 times + TMP_JAMP(1958) = TMP_JAMP(1703) + TMP_JAMP(1687) ! used 2 times + TMP_JAMP(1957) = TMP_JAMP(1702) - TMP_JAMP(1476) ! used 2 times + TMP_JAMP(1956) = TMP_JAMP(1701) - TMP_JAMP(1651) ! used 2 times + TMP_JAMP(1955) = TMP_JAMP(1692) - TMP_JAMP(1624) ! used 2 times + TMP_JAMP(1954) = TMP_JAMP(1690) + TMP_JAMP(1629) ! used 2 times + TMP_JAMP(1953) = TMP_JAMP(1688) - AMP(1326) ! used 2 times + TMP_JAMP(1952) = TMP_JAMP(1686) - AMP(1326) ! used 2 times + TMP_JAMP(1951) = TMP_JAMP(1681) + TMP_JAMP(1530) ! used 2 times + TMP_JAMP(1950) = TMP_JAMP(1680) + TMP_JAMP(1656) ! used 2 times + TMP_JAMP(1949) = TMP_JAMP(1678) + TMP_JAMP(1520) ! used 2 times + TMP_JAMP(1948) = TMP_JAMP(1678) + TMP_JAMP(1643) ! used 2 times + TMP_JAMP(1947) = TMP_JAMP(1677) + TMP_JAMP(1663) ! used 2 times + TMP_JAMP(1946) = TMP_JAMP(1675) + TMP_JAMP(1624) ! used 2 times + TMP_JAMP(1945) = TMP_JAMP(1670) + ((-0.000000000000000D+00 $ ,1.000000000000000D+00)) * AMP(1127) ! used 2 times - TMP_JAMP(2719) = TMP_JAMP(2508) - TMP_JAMP(2506) ! used 2 times - TMP_JAMP(2718) = TMP_JAMP(2502) - AMP(1451) ! used 2 times - TMP_JAMP(2717) = TMP_JAMP(2499) + TMP_JAMP(2495) ! used 2 times - TMP_JAMP(2716) = TMP_JAMP(2487) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1719) ! used 2 times - TMP_JAMP(2715) = TMP_JAMP(2488) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1070) ! used 2 times - TMP_JAMP(2714) = TMP_JAMP(2490) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(2489) ! used 2 times - TMP_JAMP(2713) = TMP_JAMP(2472) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(2467) ! used 2 times - TMP_JAMP(2712) = TMP_JAMP(2475) - TMP_JAMP(2474) ! used 2 times - TMP_JAMP(2711) = TMP_JAMP(2413) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(2412) ! used 2 times - TMP_JAMP(2710) = TMP_JAMP(2395) - AMP(582) ! used 2 times - TMP_JAMP(2709) = TMP_JAMP(2375) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(912) ! used 2 times - TMP_JAMP(2708) = TMP_JAMP(2373) - TMP_JAMP(2371) ! used 2 times - TMP_JAMP(2707) = TMP_JAMP(2353) - AMP(1511) ! used 2 times - TMP_JAMP(2706) = TMP_JAMP(2345) + AMP(1107) ! used 2 times - TMP_JAMP(2705) = TMP_JAMP(2330) + AMP(1275) ! used 2 times - TMP_JAMP(2704) = TMP_JAMP(2326) - TMP_JAMP(2325) ! used 2 times - TMP_JAMP(2703) = TMP_JAMP(2329) + TMP_JAMP(2327) ! used 2 times - TMP_JAMP(2702) = TMP_JAMP(2308) + AMP(364) ! used 2 times - TMP_JAMP(2701) = TMP_JAMP(2307) + AMP(1021) ! used 2 times - TMP_JAMP(2700) = TMP_JAMP(2287) - TMP_JAMP(2285) ! used 2 times - TMP_JAMP(2699) = TMP_JAMP(2269) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(46) ! used 2 times - TMP_JAMP(2698) = TMP_JAMP(2261) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(2257) ! used 2 times - TMP_JAMP(2697) = TMP_JAMP(2256) - TMP_JAMP(2255) ! used 2 times - TMP_JAMP(2696) = TMP_JAMP(2244) - TMP_JAMP(1407) ! used 2 times - TMP_JAMP(2695) = TMP_JAMP(2243) - TMP_JAMP(2242) ! used 2 times - TMP_JAMP(2694) = TMP_JAMP(2236) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(2235) ! used 2 times - TMP_JAMP(2693) = TMP_JAMP(2231) - TMP_JAMP(2230) ! used 2 times - TMP_JAMP(2692) = TMP_JAMP(2225) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1140) ! used 2 times - TMP_JAMP(2691) = TMP_JAMP(2227) - TMP_JAMP(2226) ! used 2 times - TMP_JAMP(2690) = TMP_JAMP(2211) - AMP(1185) ! used 2 times - TMP_JAMP(2689) = TMP_JAMP(2213) + TMP_JAMP(2212) ! 
used 2 times - TMP_JAMP(2688) = TMP_JAMP(2205) - TMP_JAMP(2202) ! used 2 times - TMP_JAMP(2687) = TMP_JAMP(2200) + AMP(615) ! used 2 times - TMP_JAMP(2686) = TMP_JAMP(2192) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(2191) ! used 2 times - TMP_JAMP(2685) = TMP_JAMP(2182) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(2181) ! used 2 times - TMP_JAMP(2684) = TMP_JAMP(2175) + TMP_JAMP(2170) ! used 2 times - TMP_JAMP(2683) = TMP_JAMP(2169) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(2165) ! used 2 times - TMP_JAMP(2682) = TMP_JAMP(2155) + ((-0.000000000000000D+00 + TMP_JAMP(1944) = TMP_JAMP(1668) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1039) ! used 2 times + TMP_JAMP(1943) = TMP_JAMP(1663) - TMP_JAMP(1625) ! used 2 times + TMP_JAMP(1942) = TMP_JAMP(1663) + TMP_JAMP(1518) ! used 2 times + TMP_JAMP(1941) = TMP_JAMP(1661) + TMP_JAMP(1526) ! used 2 times + TMP_JAMP(1940) = TMP_JAMP(1661) + AMP(1172) ! used 2 times + TMP_JAMP(1939) = TMP_JAMP(1658) - TMP_JAMP(1592) ! used 2 times + TMP_JAMP(1938) = TMP_JAMP(1657) - AMP(1376) ! used 2 times + TMP_JAMP(1937) = TMP_JAMP(1653) - TMP_JAMP(1577) ! used 2 times + TMP_JAMP(1936) = TMP_JAMP(1649) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1150) ! used 2 times + TMP_JAMP(1935) = TMP_JAMP(1648) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1549) ! used 2 times + TMP_JAMP(1934) = TMP_JAMP(1645) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1547) ! used 2 times + TMP_JAMP(1933) = TMP_JAMP(1643) + TMP_JAMP(1582) ! used 2 times + TMP_JAMP(1932) = TMP_JAMP(1637) + TMP_JAMP(1632) ! used 2 times + TMP_JAMP(1931) = TMP_JAMP(1634) + TMP_JAMP(1627) ! used 2 times + TMP_JAMP(1930) = TMP_JAMP(1630) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1031) ! used 2 times + TMP_JAMP(1929) = TMP_JAMP(1628) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1228) ! used 2 times + TMP_JAMP(1928) = TMP_JAMP(1624) + TMP_JAMP(1616) ! used 2 times + TMP_JAMP(1927) = TMP_JAMP(1620) + TMP_JAMP(1551) ! used 2 times + TMP_JAMP(1926) = TMP_JAMP(1620) + TMP_JAMP(1615) ! used 2 times + TMP_JAMP(1925) = TMP_JAMP(1618) + TMP_JAMP(1613) ! used 2 times + TMP_JAMP(1924) = TMP_JAMP(1617) - AMP(1376) ! used 2 times + TMP_JAMP(1923) = TMP_JAMP(1617) - TMP_JAMP(1614) ! used 2 times + TMP_JAMP(1922) = TMP_JAMP(1613) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1564) ! used 2 times + TMP_JAMP(1921) = TMP_JAMP(1612) + TMP_JAMP(1543) ! used 2 times + TMP_JAMP(1920) = TMP_JAMP(1611) - TMP_JAMP(1548) ! used 2 times + TMP_JAMP(1919) = TMP_JAMP(1609) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(961) ! used 2 times + TMP_JAMP(1918) = TMP_JAMP(1609) - TMP_JAMP(1530) ! used 2 times + TMP_JAMP(1917) = TMP_JAMP(1598) - AMP(1128) ! used 2 times + TMP_JAMP(1916) = TMP_JAMP(1597) + AMP(1060) ! used 2 times + TMP_JAMP(1915) = TMP_JAMP(1592) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1539) ! used 2 times + TMP_JAMP(1914) = TMP_JAMP(1591) + TMP_JAMP(1577) ! used 2 times + TMP_JAMP(1913) = TMP_JAMP(1591) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1197) ! used 2 times + TMP_JAMP(1912) = TMP_JAMP(1590) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1202) ! used 2 times + TMP_JAMP(1911) = TMP_JAMP(1586) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1121) ! 
used 2 times + TMP_JAMP(1910) = TMP_JAMP(1586) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1119) ! used 2 times + TMP_JAMP(1909) = TMP_JAMP(1584) + AMP(1051) ! used 2 times + TMP_JAMP(1908) = TMP_JAMP(1583) + AMP(1073) ! used 2 times + TMP_JAMP(1907) = TMP_JAMP(1577) + TMP_JAMP(1510) ! used 2 times + TMP_JAMP(1906) = TMP_JAMP(1575) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1287) ! used 2 times + TMP_JAMP(1905) = TMP_JAMP(1575) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1289) ! used 2 times + TMP_JAMP(1904) = TMP_JAMP(1574) + TMP_JAMP(1519) ! used 2 times + TMP_JAMP(1903) = TMP_JAMP(1572) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1296) ! used 2 times + TMP_JAMP(1902) = TMP_JAMP(1572) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1298) ! used 2 times + TMP_JAMP(1901) = TMP_JAMP(1569) + AMP(1042) ! used 2 times + TMP_JAMP(1900) = TMP_JAMP(1563) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1539) ! used 2 times + TMP_JAMP(1899) = TMP_JAMP(1562) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1275) ! used 2 times + TMP_JAMP(1898) = TMP_JAMP(1559) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1107) ! used 2 times + TMP_JAMP(1897) = TMP_JAMP(1553) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1058) ! used 2 times + TMP_JAMP(1896) = TMP_JAMP(1551) - TMP_JAMP(1529) ! used 2 times + TMP_JAMP(1895) = TMP_JAMP(1550) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1115) ! used 2 times + TMP_JAMP(1894) = TMP_JAMP(1550) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1142) ! used 2 times + TMP_JAMP(1893) = TMP_JAMP(1548) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1021) ! used 2 times + TMP_JAMP(1892) = TMP_JAMP(1548) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1409) ! used 2 times + TMP_JAMP(1891) = TMP_JAMP(1543) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1263) ! used 2 times + TMP_JAMP(1890) = TMP_JAMP(1541) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1185) ! used 2 times + TMP_JAMP(1889) = TMP_JAMP(1539) - AMP(962) ! used 2 times + TMP_JAMP(1888) = TMP_JAMP(1539) + AMP(1000) ! used 2 times + TMP_JAMP(1887) = TMP_JAMP(1534) - AMP(971) ! used 2 times + TMP_JAMP(1886) = TMP_JAMP(1512) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1047) ! used 2 times + TMP_JAMP(1885) = TMP_JAMP(1507) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1022) ! used 2 times + TMP_JAMP(1884) = TMP_JAMP(1505) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1199) ! used 2 times + TMP_JAMP(1883) = TMP_JAMP(1505) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1220) ! used 2 times + TMP_JAMP(1882) = TMP_JAMP(1504) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1211) ! used 2 times + TMP_JAMP(1881) = TMP_JAMP(1504) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1187) ! used 2 times + TMP_JAMP(1880) = TMP_JAMP(1503) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1065) ! used 2 times + TMP_JAMP(1879) = TMP_JAMP(1501) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1412) ! used 2 times + TMP_JAMP(1878) = TMP_JAMP(1492) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * AMP(1133) ! used 2 times + TMP_JAMP(1877) = TMP_JAMP(1491) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * AMP(1056) ! 
[Generated Fortran hunk (colour-flow sums for one of the larger processes):
several hundred fixed-form assignments from TMP_JAMP(1864) up to
TMP_JAMP(3030), each of the form

+      TMP_JAMP(1876) = TMP_JAMP(1481) + ((0.000000000000000D+00,
+     $ -1.000000000000000D+00)) * AMP(1265)  ! used 2 times

i.e. a sum or difference of two AMP(...) or earlier TMP_JAMP(...) terms,
optionally scaled by a factor of +/-i written as a double-precision complex
constant, each annotated with its reuse count ("! used 2/3/4/8/16 times").
The hunk closes by deleting the previous TMP_JAMP(2660)-TMP_JAMP(2681)
block, whose temporaries referenced AMP(170), AMP(240), AMP(249), AMP(333),
AMP(650), AMP(982), AMP(1530), AMP(1702), AMP(1703) and AMP(1710).]
used 2 times - TMP_JAMP(2659) = TMP_JAMP(2063) - TMP_JAMP(2062) ! used 2 times - TMP_JAMP(2658) = TMP_JAMP(2065) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(2064) ! used 2 times - TMP_JAMP(2657) = TMP_JAMP(2060) + TMP_JAMP(2059) ! used 2 times - TMP_JAMP(2656) = TMP_JAMP(2056) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(303) ! used 2 times - TMP_JAMP(2655) = TMP_JAMP(2050) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1774) ! used 2 times - TMP_JAMP(2654) = TMP_JAMP(2051) - AMP(298) ! used 2 times - TMP_JAMP(2653) = TMP_JAMP(2054) + TMP_JAMP(2052) ! used 2 times - TMP_JAMP(2652) = TMP_JAMP(2045) - AMP(299) ! used 2 times - TMP_JAMP(2651) = TMP_JAMP(2047) + TMP_JAMP(2046) ! used 2 times - TMP_JAMP(2650) = TMP_JAMP(2042) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(2041) ! used 2 times - TMP_JAMP(2649) = TMP_JAMP(2043) - TMP_JAMP(2040) ! used 2 times - TMP_JAMP(2648) = TMP_JAMP(2031) + TMP_JAMP(2030) ! used 2 times - TMP_JAMP(2647) = TMP_JAMP(2029) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(2028) ! used 2 times - TMP_JAMP(2646) = TMP_JAMP(2027) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(2026) ! used 2 times - TMP_JAMP(2645) = TMP_JAMP(2017) - AMP(532) ! used 2 times - TMP_JAMP(2644) = TMP_JAMP(2013) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(307) ! used 2 times - TMP_JAMP(2643) = TMP_JAMP(1995) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(1994) ! used 2 times - TMP_JAMP(2642) = TMP_JAMP(1993) + TMP_JAMP(1992) ! used 2 times - TMP_JAMP(2768) = TMP_JAMP(2742) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(1890) ! used 2 times - TMP_JAMP(2767) = TMP_JAMP(2737) + TMP_JAMP(2575) ! used 2 times - TMP_JAMP(2766) = TMP_JAMP(2724) + AMP(1573) ! used 2 times - TMP_JAMP(2765) = TMP_JAMP(2720) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(2719) ! used 2 times - TMP_JAMP(2764) = TMP_JAMP(2717) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(2493) ! used 2 times - TMP_JAMP(2763) = TMP_JAMP(2716) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(2714) ! used 2 times - TMP_JAMP(2762) = TMP_JAMP(2713) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(2712) ! used 2 times - TMP_JAMP(2761) = TMP_JAMP(2704) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(2703) ! used 2 times - TMP_JAMP(2760) = TMP_JAMP(2700) - TMP_JAMP(2284) ! used 2 times - TMP_JAMP(2759) = TMP_JAMP(2697) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * TMP_JAMP(2252) ! used 2 times - TMP_JAMP(2758) = TMP_JAMP(2695) - TMP_JAMP(2241) ! used 2 times - TMP_JAMP(2757) = TMP_JAMP(2691) + ((0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1565) ! used 2 times - TMP_JAMP(2756) = TMP_JAMP(2689) + TMP_JAMP(2210) ! used 2 times - TMP_JAMP(2755) = TMP_JAMP(2685) + TMP_JAMP(2178) ! used 2 times - TMP_JAMP(2754) = TMP_JAMP(2676) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(652) ! used 2 times - TMP_JAMP(2753) = TMP_JAMP(2672) + AMP(1005) ! used 2 times - TMP_JAMP(2752) = TMP_JAMP(2668) - TMP_JAMP(2089) ! used 2 times - TMP_JAMP(2751) = TMP_JAMP(2666) + TMP_JAMP(2083) ! used 2 times - TMP_JAMP(2750) = TMP_JAMP(2659) + ((-0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(848) ! used 2 times - TMP_JAMP(2749) = TMP_JAMP(2657) - TMP_JAMP(2058) ! 
used 2 times - TMP_JAMP(2748) = TMP_JAMP(2655) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * TMP_JAMP(2653) ! used 2 times - TMP_JAMP(2747) = TMP_JAMP(2649) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(231) ! used 2 times - TMP_JAMP(2746) = TMP_JAMP(2648) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(211) ! used 2 times - TMP_JAMP(2745) = TMP_JAMP(2646) + ((0.000000000000000D+00, - $ -1.000000000000000D+00)) * AMP(173) ! used 2 times - JAMP(1,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(190)+(-1.000000000000000D+00)*AMP(251) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2008) - $ +TMP_JAMP(2642)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2643)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2665) - JAMP(2,1) = (-1.000000000000000D+00)*AMP(242) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(247)+( - $ -1.000000000000000D+00)*TMP_JAMP(841)+(-1.000000000000000D+00) - $ *TMP_JAMP(842)+TMP_JAMP(2000)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2011)+(-1.000000000000000D+00) - $ *TMP_JAMP(2642)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2658)+TMP_JAMP(2667) - JAMP(3,1) = (-1.000000000000000D+00)*AMP(250) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(268) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1504) - $ +(-1.000000000000000D+00)*TMP_JAMP(1997)+(-1.000000000000000D - $ +00)*TMP_JAMP(2088)+((0.000000000000000D+00,1.000000000000000D - $ +00))*TMP_JAMP(2643)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2647) - JAMP(4,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(101)+(-1.000000000000000D+00)*AMP(233)+( - $ -1.000000000000000D+00)*TMP_JAMP(935)+(-1.000000000000000D+00) - $ *TMP_JAMP(1996)+(-1.000000000000000D+00)*TMP_JAMP(1998) - $ +TMP_JAMP(2006)+TMP_JAMP(2075)+(-1.000000000000000D+00) - $ *TMP_JAMP(2093)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2746) - JAMP(5,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(112)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(226)+(-1.000000000000000D+00)*AMP(241)+( - $ -1.000000000000000D+00)*AMP(301)+(-1.000000000000000D+00) - $ *TMP_JAMP(1596)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1681)+(-1.000000000000000D+00)*TMP_JAMP(1999)+( - $ -1.000000000000000D+00)*TMP_JAMP(2001)+TMP_JAMP(2003) - $ +TMP_JAMP(2007)+TMP_JAMP(2012)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2021)+TMP_JAMP(2097) - $ +TMP_JAMP(2532) - JAMP(6,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(223)+(-1.000000000000000D+00)*AMP(232) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(238) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(259) - $ +AMP(1630)+(-1.000000000000000D+00)*TMP_JAMP(757) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1688) - $ +(-1.000000000000000D+00)*TMP_JAMP(2002)+TMP_JAMP(2004) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2005) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2473) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2651) - $ +(-1.000000000000000D+00)*TMP_JAMP(2669) - JAMP(7,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(210)+(-1.000000000000000D+00)*AMP(253) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1353) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2008) - $ 
+(-1.000000000000000D+00)*TMP_JAMP(2009)+(-1.000000000000000D - $ +00)*TMP_JAMP(2015)+((0.000000000000000D+00,1.000000000000000D - $ +00))*TMP_JAMP(2644)+((0.000000000000000D+00,1.000000000000000D - $ +00))*TMP_JAMP(2686) - JAMP(8,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(187)+(-1.000000000000000D+00)*AMP(244)+TMP_JAMP(773) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(775) - $ +(-1.000000000000000D+00)*TMP_JAMP(1347)+(-1.000000000000000D - $ +00)*TMP_JAMP(1350)+((0.000000000000000D+00,-1.000000000000000D - $ +00))*TMP_JAMP(2010)+(-1.000000000000000D+00)*TMP_JAMP(2012) - $ +TMP_JAMP(2019)+TMP_JAMP(2084)+TMP_JAMP(2199)+TMP_JAMP(2543) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2644) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2656) - JAMP(9,1) = (-1.000000000000000D+00)*AMP(252)+( - $ -1.000000000000000D+00)*AMP(530)+AMP(1699)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(1545)+TMP_JAMP(1552)+( - $ -1.000000000000000D+00)*TMP_JAMP(2014)+(-1.000000000000000D+00) - $ *TMP_JAMP(2016)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2038)+(-1.000000000000000D+00)*TMP_JAMP(2645) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2687) - JAMP(10,1) = (-1.000000000000000D+00)*AMP(161)+( - $ -1.000000000000000D+00)*TMP_JAMP(979)+TMP_JAMP(2204) - $ +TMP_JAMP(2645)+(-1.000000000000000D+00)*TMP_JAMP(2745) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2747) - JAMP(11,1) = (-1.000000000000000D+00)*AMP(243) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(606)+( - $ -1.000000000000000D+00)*TMP_JAMP(592)+(-1.000000000000000D+00) - $ *TMP_JAMP(827)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1941)+(-1.000000000000000D+00)*TMP_JAMP(2018) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2020) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2022) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2023) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2025) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2069) - $ +TMP_JAMP(2207)+TMP_JAMP(2660) + TMP_JAMP(2995) = TMP_JAMP(2700) - TMP_JAMP(1092) ! used 2 times + TMP_JAMP(2994) = TMP_JAMP(2668) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(284) ! used 2 times + TMP_JAMP(2993) = TMP_JAMP(2663) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1237) ! used 2 times + TMP_JAMP(2992) = TMP_JAMP(2626) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(591) ! used 2 times + TMP_JAMP(2991) = TMP_JAMP(2571) + TMP_JAMP(572) ! used 2 times + TMP_JAMP(2990) = TMP_JAMP(2568) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(581) ! used 2 times + TMP_JAMP(2989) = TMP_JAMP(2540) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1407) ! used 2 times + TMP_JAMP(2988) = TMP_JAMP(2534) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1341) ! used 2 times + TMP_JAMP(2987) = TMP_JAMP(2484) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1194) ! used 2 times + TMP_JAMP(2986) = TMP_JAMP(2435) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(585) ! used 2 times + TMP_JAMP(2985) = TMP_JAMP(2434) + TMP_JAMP(536) ! used 2 times + TMP_JAMP(2984) = TMP_JAMP(2420) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1403) ! 
used 2 times + TMP_JAMP(2983) = TMP_JAMP(2151) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(574) ! used 2 times + TMP_JAMP(2982) = TMP_JAMP(2150) - TMP_JAMP(423) ! used 2 times + TMP_JAMP(2981) = TMP_JAMP(2147) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(549) ! used 2 times + TMP_JAMP(2980) = TMP_JAMP(2132) + TMP_JAMP(376) ! used 2 times + TMP_JAMP(2979) = TMP_JAMP(2115) + TMP_JAMP(264) ! used 2 times + TMP_JAMP(2978) = TMP_JAMP(2114) - TMP_JAMP(263) ! used 2 times + TMP_JAMP(2977) = TMP_JAMP(2111) - TMP_JAMP(1197) ! used 2 times + TMP_JAMP(2976) = TMP_JAMP(2110) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1026) ! used 2 times + TMP_JAMP(2975) = TMP_JAMP(2107) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(413) ! used 2 times + TMP_JAMP(2974) = TMP_JAMP(2098) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(492) ! used 2 times + TMP_JAMP(2973) = TMP_JAMP(2080) - TMP_JAMP(1406) ! used 2 times + TMP_JAMP(2972) = TMP_JAMP(2079) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1395) ! used 2 times + TMP_JAMP(2971) = TMP_JAMP(2074) - TMP_JAMP(377) ! used 2 times + TMP_JAMP(2970) = TMP_JAMP(2056) + TMP_JAMP(530) ! used 2 times + TMP_JAMP(2969) = TMP_JAMP(2045) - TMP_JAMP(554) ! used 2 times + TMP_JAMP(2968) = TMP_JAMP(2034) + TMP_JAMP(552) ! used 2 times + TMP_JAMP(2967) = TMP_JAMP(1999) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(420) ! used 2 times + TMP_JAMP(2966) = TMP_JAMP(1997) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(425) ! used 2 times + TMP_JAMP(2965) = TMP_JAMP(1995) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1351) ! used 2 times + TMP_JAMP(2964) = TMP_JAMP(1984) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(415) ! used 2 times + TMP_JAMP(2963) = TMP_JAMP(1957) + TMP_JAMP(545) ! used 2 times + TMP_JAMP(2962) = TMP_JAMP(1940) - TMP_JAMP(374) ! used 2 times + TMP_JAMP(2961) = TMP_JAMP(1934) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(363) ! used 2 times + TMP_JAMP(2960) = TMP_JAMP(1918) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(577) ! used 2 times + TMP_JAMP(2959) = TMP_JAMP(1912) - TMP_JAMP(1416) ! used 2 times + TMP_JAMP(2958) = TMP_JAMP(1897) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1408) ! used 2 times + TMP_JAMP(2957) = TMP_JAMP(1878) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(928) ! used 2 times + TMP_JAMP(2956) = TMP_JAMP(1864) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1026) ! used 2 times + TMP_JAMP(2955) = TMP_JAMP(1854) + ((-0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(396) ! used 2 times + TMP_JAMP(2954) = TMP_JAMP(1843) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(1417) ! used 2 times + TMP_JAMP(2953) = TMP_JAMP(1810) - TMP_JAMP(356) ! used 2 times + TMP_JAMP(2952) = TMP_JAMP(1803) - TMP_JAMP(1381) ! used 2 times + TMP_JAMP(2951) = TMP_JAMP(1800) - TMP_JAMP(409) ! used 2 times + TMP_JAMP(2950) = TMP_JAMP(1759) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1410) ! used 2 times + TMP_JAMP(2949) = TMP_JAMP(1757) - TMP_JAMP(372) ! used 2 times + TMP_JAMP(2948) = TMP_JAMP(1753) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(378) ! used 2 times + TMP_JAMP(2947) = TMP_JAMP(1714) + TMP_JAMP(401) ! 
used 2 times + TMP_JAMP(2946) = TMP_JAMP(1682) + ((0.000000000000000D+00, + $ -1.000000000000000D+00)) * TMP_JAMP(544) ! used 2 times + TMP_JAMP(2945) = TMP_JAMP(1610) + ((0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(578) ! used 2 times + TMP_JAMP(2944) = TMP_JAMP(1586) + TMP_JAMP(1389) ! used 2 times + TMP_JAMP(2943) = TMP_JAMP(1584) + ((-0.000000000000000D+00 + $ ,1.000000000000000D+00)) * TMP_JAMP(1320) ! used 2 times + TMP_JAMP(2942) = TMP_JAMP(1575) - TMP_JAMP(1414) ! used 2 times + JAMP(1,1) = (-1.000000000000000D+00)*AMP(251) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(289) + $ +TMP_JAMP(360)+TMP_JAMP(485)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(558)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(576)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*AMP(1485)+(-1.000000000000000D+00) + $ *TMP_JAMP(2911)+(-1.000000000000000D+00)*TMP_JAMP(2916)+( + $ -1.000000000000000D+00)*TMP_JAMP(2971)+TMP_JAMP(2994) + JAMP(2,1) = (-1.000000000000000D+00)*AMP(242)+( + $ -1.000000000000000D+00)*TMP_JAMP(359)+TMP_JAMP(388) + $ +TMP_JAMP(483)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(498)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(557)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(576)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1580)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *AMP(1476)+TMP_JAMP(2655)+(-1.000000000000000D+00) + $ *TMP_JAMP(2913)+(-1.000000000000000D+00)*TMP_JAMP(2940) + JAMP(3,1) = (-1.000000000000000D+00)*AMP(250)+( + $ -1.000000000000000D+00)*TMP_JAMP(484)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(495)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(558)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(575)+(-1.000000000000000D+00) + $ *TMP_JAMP(2590)+(-1.000000000000000D+00)*TMP_JAMP(2797)+( + $ -1.000000000000000D+00)*TMP_JAMP(2829)+(-1.000000000000000D+00) + $ *TMP_JAMP(2915)+TMP_JAMP(2953)+TMP_JAMP(2980) + JAMP(4,1) = (-1.000000000000000D+00)*AMP(233)+( + $ -1.000000000000000D+00)*TMP_JAMP(259)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(470)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(497)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(575)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(1589)+TMP_JAMP(1693) + $ +TMP_JAMP(2050)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *AMP(1467)+(-1.000000000000000D+00)*TMP_JAMP(2353) + $ +TMP_JAMP(2659)+TMP_JAMP(2905)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(2955)+TMP_JAMP(2960) + JAMP(5,1) = (-1.000000000000000D+00)*AMP(241) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(271) + $ +TMP_JAMP(386)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(515)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(557)+(-1.000000000000000D+00)*TMP_JAMP(1526)+( + $ -1.000000000000000D+00)*TMP_JAMP(1825)+TMP_JAMP(2268)+( + $ -1.000000000000000D+00)*TMP_JAMP(2308)+TMP_JAMP(2430)+( + $ -1.000000000000000D+00)*TMP_JAMP(2446)+(-1.000000000000000D+00) + $ *TMP_JAMP(2654)+(-1.000000000000000D+00)*TMP_JAMP(2963) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2978) + $ +(-1.000000000000000D+00)*TMP_JAMP(2985) + JAMP(6,1) = (-1.000000000000000D+00)*AMP(232) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(287) + $ +TMP_JAMP(387)+((0.000000000000000D+00,1.000000000000000D+00)) + $ 
*TMP_JAMP(523)+(-1.000000000000000D+00)*TMP_JAMP(2440)+( + $ -1.000000000000000D+00)*TMP_JAMP(2743)+(-1.000000000000000D+00) + $ *TMP_JAMP(2914)+(-1.000000000000000D+00)*TMP_JAMP(2960) + $ +TMP_JAMP(2963)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(2979)+(-1.000000000000000D+00)*TMP_JAMP(3007) + JAMP(7,1) = (-1.000000000000000D+00)*AMP(253)+( + $ -1.000000000000000D+00)*TMP_JAMP(147)+TMP_JAMP(373) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(573) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1393) + $ +(-1.000000000000000D+00)*TMP_JAMP(1592)+((0.000000000000000D + $ +00,1.000000000000000D+00))*TMP_JAMP(1987)+(-1.000000000000000D + $ +00)*TMP_JAMP(2627)+TMP_JAMP(2636)+TMP_JAMP(2844)+( + $ -1.000000000000000D+00)*TMP_JAMP(2855)+TMP_JAMP(2971)+( + $ -1.000000000000000D+00)*TMP_JAMP(2974) + JAMP(8,1) = (-1.000000000000000D+00)*AMP(244) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(117) + $ +(-1.000000000000000D+00)*TMP_JAMP(388)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(487)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(573)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1405)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1777)+((0.000000000000000D+00 + $ ,-1.000000000000000D+00))*TMP_JAMP(1850)+TMP_JAMP(2513)+( + $ -1.000000000000000D+00)*TMP_JAMP(2759)+TMP_JAMP(2854) + $ +TMP_JAMP(2924)+(-1.000000000000000D+00)*TMP_JAMP(2949) + JAMP(9,1) = (-1.000000000000000D+00)*AMP(252) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(272) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(283) + $ +(-1.000000000000000D+00)*TMP_JAMP(371)+(-1.000000000000000D+00) + $ *TMP_JAMP(373)+(-1.000000000000000D+00)*TMP_JAMP(379) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(490) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1397) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1752) + $ +(-1.000000000000000D+00)*TMP_JAMP(2055)+TMP_JAMP(2099)+( + $ -1.000000000000000D+00)*TMP_JAMP(2641)+TMP_JAMP(2661)+( + $ -1.000000000000000D+00)*TMP_JAMP(2667)+TMP_JAMP(2674) + $ +TMP_JAMP(2747) + JAMP(10,1) = (-1.000000000000000D+00)*AMP(161) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(274) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(289) + $ +TMP_JAMP(371)+(-1.000000000000000D+00)*TMP_JAMP(546) + $ +TMP_JAMP(551)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1247)+(-1.000000000000000D+00)*TMP_JAMP(1385)+( + $ -1.000000000000000D+00)*TMP_JAMP(1621)+(-1.000000000000000D+00) + $ *TMP_JAMP(1702)+TMP_JAMP(1837)+TMP_JAMP(1974) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2139) + $ +TMP_JAMP(2677)+(-1.000000000000000D+00)*TMP_JAMP(2896) + JAMP(11,1) = AMP(213)+(-1.000000000000000D+00)*AMP(243) + $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(249) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(269) + $ +(-1.000000000000000D+00)*TMP_JAMP(391)+((0.000000000000000D+00 + $ ,-1.000000000000000D+00))*TMP_JAMP(488)+(-1.000000000000000D+00) + $ *TMP_JAMP(1415)+TMP_JAMP(1486)+(-1.000000000000000D+00) + $ *TMP_JAMP(1642)+TMP_JAMP(1816)+(-1.000000000000000D+00) + $ *TMP_JAMP(2276)+(-1.000000000000000D+00)*TMP_JAMP(2673)+( + $ -1.000000000000000D+00)*TMP_JAMP(2764)+TMP_JAMP(2775)+( + $ -1.000000000000000D+00)*TMP_JAMP(2923)+TMP_JAMP(2949) + $ +TMP_JAMP(2962) JAMP(12,1) = (-1.000000000000000D+00)*AMP(160) - $ 
+((0.000000000000000D+00,-1.000000000000000D+00))*AMP(201) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(496) - $ +TMP_JAMP(663)+TMP_JAMP(824)+TMP_JAMP(828)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(1151)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*TMP_JAMP(1220) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1223) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2024) - $ +TMP_JAMP(2571)+TMP_JAMP(2662)+TMP_JAMP(2745) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(497) + $ +TMP_JAMP(546)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1057)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1375)+TMP_JAMP(1959)+(-1.000000000000000D+00) + $ *TMP_JAMP(2057)+TMP_JAMP(2594)+(-1.000000000000000D+00) + $ *TMP_JAMP(2900)+(-1.000000000000000D+00)*TMP_JAMP(2962) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2981) JAMP(13,1) = (-1.000000000000000D+00)*AMP(255) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(775) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2032) - $ +(-1.000000000000000D+00)*TMP_JAMP(2034)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*TMP_JAMP(2036) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2039) - $ +(-1.000000000000000D+00)*TMP_JAMP(2126)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2647)+(-1.000000000000000D - $ +00)*TMP_JAMP(2654) - JAMP(14,1) = (-1.000000000000000D+00)*AMP(235)+AMP(311) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(766) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1207) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2032) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2048) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2319) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2554) - $ +TMP_JAMP(2650)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2746) - JAMP(15,1) = (-1.000000000000000D+00)*AMP(254)+AMP(300) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(774) - $ +TMP_JAMP(2033)+(-1.000000000000000D+00)*TMP_JAMP(2035) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2037) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2038) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2044) - $ +(-1.000000000000000D+00)*TMP_JAMP(2652)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2696) - JAMP(16,1) = (-1.000000000000000D+00)*AMP(163)+AMP(176) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(656) - $ +TMP_JAMP(867)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1151)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1222)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2044)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2053)+TMP_JAMP(2332)+TMP_JAMP(2650) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2747) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(277) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(491) + $ +TMP_JAMP(548)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1396)+(-1.000000000000000D+00)*TMP_JAMP(1678)+( + $ -1.000000000000000D+00)*TMP_JAMP(1933)+TMP_JAMP(2636) + $ +TMP_JAMP(2788)+(-1.000000000000000D+00)*TMP_JAMP(2904) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2948) + $ +(-1.000000000000000D+00)*TMP_JAMP(2980) + JAMP(14,1) = 
(-1.000000000000000D+00)*AMP(235) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(118) + $ +(-1.000000000000000D+00)*TMP_JAMP(154)+TMP_JAMP(382) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(488) + $ +(-1.000000000000000D+00)*TMP_JAMP(548)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1785)+TMP_JAMP(2588) + $ +TMP_JAMP(2790)+(-1.000000000000000D+00)*TMP_JAMP(2833) + $ +TMP_JAMP(2925)+(-1.000000000000000D+00)*TMP_JAMP(2942) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2955) + JAMP(15,1) = (-1.000000000000000D+00)*AMP(254)+( + $ -1.000000000000000D+00)*TMP_JAMP(151)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(265)+TMP_JAMP(380) + $ +TMP_JAMP(1383)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1419)+TMP_JAMP(2055)+TMP_JAMP(2738)+( + $ -1.000000000000000D+00)*TMP_JAMP(2881)+(-1.000000000000000D+00) + $ *TMP_JAMP(2921)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(2948)+TMP_JAMP(2994) + JAMP(16,1) = (-1.000000000000000D+00)*AMP(163) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(266) + $ +(-1.000000000000000D+00)*TMP_JAMP(385)+(-1.000000000000000D+00) + $ *TMP_JAMP(400)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(496)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1232)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1419)+(-1.000000000000000D+00)*TMP_JAMP(1712) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2139) + $ +TMP_JAMP(2606)+(-1.000000000000000D+00)*TMP_JAMP(2745) + $ +TMP_JAMP(2927)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(2943) JAMP(17,1) = (-1.000000000000000D+00)*AMP(234) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1945) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2048) - $ +TMP_JAMP(2338)+(-1.000000000000000D+00)*TMP_JAMP(2342) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2651) - $ +(-1.000000000000000D+00)*TMP_JAMP(2652)+TMP_JAMP(2663)+( - $ -1.000000000000000D+00)*TMP_JAMP(2748) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(279) + $ +(-1.000000000000000D+00)*TMP_JAMP(382)+(-1.000000000000000D+00) + $ *TMP_JAMP(383)+(-1.000000000000000D+00)*TMP_JAMP(398) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(489) + $ +TMP_JAMP(667)+(-1.000000000000000D+00)*TMP_JAMP(1520) + $ +TMP_JAMP(1639)+(-1.000000000000000D+00)*TMP_JAMP(1665) + $ +TMP_JAMP(1817)+(-1.000000000000000D+00)*TMP_JAMP(2280) + $ +TMP_JAMP(2551)+(-1.000000000000000D+00)*TMP_JAMP(2672)+( + $ -1.000000000000000D+00)*TMP_JAMP(2810)+TMP_JAMP(2972)+( + $ -1.000000000000000D+00)*TMP_JAMP(3029) JAMP(18,1) = (-1.000000000000000D+00)*AMP(162) $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(174) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(655) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1198) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2049) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2053) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2070) - $ +TMP_JAMP(2077)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2344)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2612)+(-1.000000000000000D+00)*TMP_JAMP(2654) - $ +TMP_JAMP(2748) - JAMP(19,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(226)+(-1.000000000000000D+00)*AMP(246) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(934) - $ 
+TMP_JAMP(611)+TMP_JAMP(737)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2055)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2656)+(-1.000000000000000D+00) - $ *TMP_JAMP(2682)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2749)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2750) - JAMP(20,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(223)+(-1.000000000000000D+00)*AMP(237)+AMP(866) - $ +TMP_JAMP(717)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1206)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1230)+TMP_JAMP(1232)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1930)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2057)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(2074)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2160)+TMP_JAMP(2503) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2749) - JAMP(21,1) = (-1.000000000000000D+00)*AMP(245) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(933)+( - $ -1.000000000000000D+00)*TMP_JAMP(723)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(1219)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1223)+TMP_JAMP(1658) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1938) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2658) - $ +(-1.000000000000000D+00)*TMP_JAMP(2660)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2661)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*TMP_JAMP(2750) - JAMP(22,1) = (-1.000000000000000D+00)*AMP(165) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(815) - $ +AMP(830)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1217)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1218)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1932)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2067)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2069)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2070)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2078)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2283)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2661)+(-1.000000000000000D+00)*TMP_JAMP(2662) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(282) + $ +TMP_JAMP(383)+TMP_JAMP(385)+(-1.000000000000000D+00) + $ *TMP_JAMP(402)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(498)+(-1.000000000000000D+00)*TMP_JAMP(1640) + $ +TMP_JAMP(1708)+TMP_JAMP(1962)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(2120)+TMP_JAMP(2515) + $ +TMP_JAMP(2561)+(-1.000000000000000D+00)*TMP_JAMP(2823)+( + $ -1.000000000000000D+00)*TMP_JAMP(2958) + JAMP(19,1) = (-1.000000000000000D+00)*AMP(246) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(276) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(327) + $ +(-1.000000000000000D+00)*TMP_JAMP(389)+TMP_JAMP(404) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1399) + $ +(-1.000000000000000D+00)*TMP_JAMP(1638)+TMP_JAMP(1809) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2005) + $ +(-1.000000000000000D+00)*TMP_JAMP(2439)+TMP_JAMP(2513)+( + $ -1.000000000000000D+00)*TMP_JAMP(2717)+TMP_JAMP(2873)+( + $ -1.000000000000000D+00)*TMP_JAMP(2946)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(2978) + 
JAMP(20,1) = (-1.000000000000000D+00)*AMP(237) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(273) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(328) + $ +TMP_JAMP(395)+(-1.000000000000000D+00)*TMP_JAMP(397) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1401) + $ +(-1.000000000000000D+00)*TMP_JAMP(1508)+(-1.000000000000000D + $ +00)*TMP_JAMP(1660)+((0.000000000000000D+00,1.000000000000000D + $ +00))*TMP_JAMP(1784)+TMP_JAMP(1803)+(-1.000000000000000D+00) + $ *TMP_JAMP(2451)+TMP_JAMP(2588)+TMP_JAMP(2723)+TMP_JAMP(2869) + $ +TMP_JAMP(2946)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(2979) + JAMP(21,1) = (-1.000000000000000D+00)*AMP(245)+( + $ -1.000000000000000D+00)*TMP_JAMP(163)+TMP_JAMP(390) + $ +TMP_JAMP(392)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(493)+(-1.000000000000000D+00)*TMP_JAMP(550) + $ +TMP_JAMP(759)+(-1.000000000000000D+00)*TMP_JAMP(1376) + $ +TMP_JAMP(1636)+TMP_JAMP(1643)+TMP_JAMP(1659)+( + $ -1.000000000000000D+00)*TMP_JAMP(1808)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(2005)+(-1.000000000000000D+00) + $ *TMP_JAMP(2307)+TMP_JAMP(2485)+(-1.000000000000000D+00) + $ *TMP_JAMP(2703)+(-1.000000000000000D+00)*TMP_JAMP(2872) + $ +TMP_JAMP(2923) + JAMP(22,1) = (-1.000000000000000D+00)*AMP(165)+( + $ -1.000000000000000D+00)*TMP_JAMP(164)+TMP_JAMP(394) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(495) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1569) + $ +TMP_JAMP(2495)+(-1.000000000000000D+00)*TMP_JAMP(2769) + $ +TMP_JAMP(2919)+(-1.000000000000000D+00)*TMP_JAMP(2947) + $ +TMP_JAMP(2952)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(2981) JAMP(23,1) = (-1.000000000000000D+00)*AMP(236) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1225) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1229) - $ +TMP_JAMP(1233)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1241)+TMP_JAMP(1668)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1940)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2072)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2074)+TMP_JAMP(2075)+( - $ -1.000000000000000D+00)*TMP_JAMP(2663)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2664)+TMP_JAMP(2723) - JAMP(24,1) = (-1.000000000000000D+00)*AMP(164)+TMP_JAMP(737) - $ +TMP_JAMP(739)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(743)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1235)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1237)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1239)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1242)+TMP_JAMP(1245)+TMP_JAMP(1246)+TMP_JAMP(1248)+( - $ -1.000000000000000D+00)*TMP_JAMP(2077)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2078)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(2079)+TMP_JAMP(2523)+( - $ -1.000000000000000D+00)*TMP_JAMP(2528)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2664) - JAMP(25,1) = (-1.000000000000000D+00)*AMP(974) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2665) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2671) - $ +TMP_JAMP(2751)+(-1.000000000000000D+00)*TMP_JAMP(2752) - JAMP(26,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(487)+(-1.000000000000000D+00)*AMP(965)+( - $ 
-1.000000000000000D+00)*AMP(1010)+TMP_JAMP(812)+TMP_JAMP(815) - $ +TMP_JAMP(1256)+TMP_JAMP(1258)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1303)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1305)+(-1.000000000000000D+00) - $ *TMP_JAMP(2084)+(-1.000000000000000D+00)*TMP_JAMP(2099) - $ +TMP_JAMP(2167)+(-1.000000000000000D+00)*TMP_JAMP(2667)+( - $ -1.000000000000000D+00)*TMP_JAMP(2751) - JAMP(27,1) = (-1.000000000000000D+00)*AMP(973)+( - $ -1.000000000000000D+00)*TMP_JAMP(1254)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(1476)+TMP_JAMP(2088)+( - $ -1.000000000000000D+00)*TMP_JAMP(2092)+TMP_JAMP(2125) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2128) - $ +TMP_JAMP(2752) - JAMP(28,1) = AMP(142)+(-1.000000000000000D+00)*AMP(956) - $ +TMP_JAMP(901)+TMP_JAMP(1253)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1320)+TMP_JAMP(1329) - $ +TMP_JAMP(2092)+TMP_JAMP(2093)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2094)+(-1.000000000000000D+00) - $ *TMP_JAMP(2102)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2754) - JAMP(29,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(808)+(-1.000000000000000D+00)*AMP(964)+AMP(1637)+( - $ -1.000000000000000D+00)*AMP(1708)+(-1.000000000000000D+00) - $ *TMP_JAMP(698)+(-1.000000000000000D+00)*TMP_JAMP(1255) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1283) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1307) - $ +TMP_JAMP(1590)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1648)+(-1.000000000000000D+00)*TMP_JAMP(2095)+( - $ -1.000000000000000D+00)*TMP_JAMP(2096)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2098)+TMP_JAMP(2099) - $ +TMP_JAMP(2110)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2670) - JAMP(30,1) = AMP(136)+(-1.000000000000000D+00)*AMP(955)+( - $ -1.000000000000000D+00)*AMP(1762)+TMP_JAMP(702)+TMP_JAMP(704) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(706) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(764) - $ +TMP_JAMP(1260)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1322)+TMP_JAMP(2101)+TMP_JAMP(2105)+TMP_JAMP(2146)+( - $ -1.000000000000000D+00)*TMP_JAMP(2161)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2458)+TMP_JAMP(2669) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2670) - JAMP(31,1) = (-1.000000000000000D+00)*AMP(977) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(1142) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1387) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1936) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2671) - $ +(-1.000000000000000D+00)*TMP_JAMP(2673)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2753) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(280) + $ +(-1.000000000000000D+00)*TMP_JAMP(395)+TMP_JAMP(399)+( + $ -1.000000000000000D+00)*TMP_JAMP(405)+(-1.000000000000000D+00) + $ *TMP_JAMP(1808)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(2010)+(-1.000000000000000D+00)*TMP_JAMP(2348) + $ +TMP_JAMP(2544)+(-1.000000000000000D+00)*TMP_JAMP(2871) + $ +TMP_JAMP(2974)+(-1.000000000000000D+00)*TMP_JAMP(2984) + $ +TMP_JAMP(3029) + JAMP(24,1) = (-1.000000000000000D+00)*AMP(164) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(128) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(281) + $ 
+((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(286) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(330) + $ +TMP_JAMP(403)+TMP_JAMP(406)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1404)+TMP_JAMP(1605)+( + $ -1.000000000000000D+00)*TMP_JAMP(1804)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(2120)+TMP_JAMP(2555)+( + $ -1.000000000000000D+00)*TMP_JAMP(2815)+TMP_JAMP(2920) + $ +TMP_JAMP(2947) + JAMP(25,1) = (-1.000000000000000D+00)*TMP_JAMP(360) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(454) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(517) + $ +(-1.000000000000000D+00)*AMP(974)+(-1.000000000000000D+00) + $ *TMP_JAMP(1843)+TMP_JAMP(1859)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(2085)+TMP_JAMP(2104)+( + $ -1.000000000000000D+00)*TMP_JAMP(2662)+TMP_JAMP(2851) + $ +TMP_JAMP(2865)+TMP_JAMP(3018) + JAMP(26,1) = TMP_JAMP(359)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(463)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(518)+(-1.000000000000000D+00) + $ *TMP_JAMP(834)+(-1.000000000000000D+00)*TMP_JAMP(1019) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1374) + $ +(-1.000000000000000D+00)*AMP(965)+(-1.000000000000000D+00) + $ *TMP_JAMP(1479)+TMP_JAMP(1842)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(2085)+(-1.000000000000000D+00) + $ *TMP_JAMP(2129)+(-1.000000000000000D+00)*TMP_JAMP(2648) + $ +TMP_JAMP(2758)+TMP_JAMP(2778)+(-1.000000000000000D+00) + $ *TMP_JAMP(2859) + JAMP(27,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(453)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(513)+(-1.000000000000000D+00)*TMP_JAMP(809)+( + $ -1.000000000000000D+00)*TMP_JAMP(1028)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(1373)+(-1.000000000000000D+00) + $ *AMP(973)+(-1.000000000000000D+00)*TMP_JAMP(1963)+TMP_JAMP(2060) + $ +(-1.000000000000000D+00)*TMP_JAMP(2104)+TMP_JAMP(2317) + $ +TMP_JAMP(2387)+TMP_JAMP(2567)+(-1.000000000000000D+00) + $ *TMP_JAMP(2604)+TMP_JAMP(2796)+TMP_JAMP(2811)+( + $ -1.000000000000000D+00)*TMP_JAMP(2953) + JAMP(28,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(316)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(470)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(514)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(735)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1392)+(-1.000000000000000D+00)*AMP(956)+TMP_JAMP(1448) + $ +(-1.000000000000000D+00)*TMP_JAMP(1839)+((0.000000000000000D + $ +00,1.000000000000000D+00))*TMP_JAMP(1846)+(-1.000000000000000D + $ +00)*TMP_JAMP(1919)+TMP_JAMP(1963)+(-1.000000000000000D+00) + $ *TMP_JAMP(1967)+(-1.000000000000000D+00)*TMP_JAMP(2657)+( + $ -1.000000000000000D+00)*TMP_JAMP(2789)+TMP_JAMP(2824) + $ +TMP_JAMP(2835) + JAMP(29,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(314)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(462)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(717)+(-1.000000000000000D+00)*AMP(964)+TMP_JAMP(1709) + $ +(-1.000000000000000D+00)*TMP_JAMP(1874)+TMP_JAMP(2061) + $ +TMP_JAMP(2129)+AMP(1638)+TMP_JAMP(2445)+(-1.000000000000000D + $ +00)*TMP_JAMP(2493)+TMP_JAMP(2647)+TMP_JAMP(2985)+TMP_JAMP(2996) + JAMP(30,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(320)+((0.000000000000000D+00,1.000000000000000D+00)) + $ 
*TMP_JAMP(520)+(-1.000000000000000D+00)*AMP(955)+( + $ -1.000000000000000D+00)*TMP_JAMP(1840)+TMP_JAMP(1874) + $ +TMP_JAMP(1919)+TMP_JAMP(1966)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(2064)+TMP_JAMP(2250)+( + $ -1.000000000000000D+00)*TMP_JAMP(2553)+TMP_JAMP(2656) + $ +TMP_JAMP(3000)+TMP_JAMP(3007) + JAMP(31,1) = TMP_JAMP(804)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(1391)+(-1.000000000000000D+00) + $ *AMP(977)+TMP_JAMP(1857)+TMP_JAMP(1894)+TMP_JAMP(2130) + $ +TMP_JAMP(2609)+(-1.000000000000000D+00)*TMP_JAMP(2816) + $ +TMP_JAMP(2825)+(-1.000000000000000D+00)*TMP_JAMP(2863)+( + $ -1.000000000000000D+00)*TMP_JAMP(3018) JAMP(32,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(487)+(-1.000000000000000D+00)*AMP(968)+( - $ -1.000000000000000D+00)*TMP_JAMP(709)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1394)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(2108)+(-1.000000000000000D - $ +00)*TMP_JAMP(2110)+TMP_JAMP(2118)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2198)+TMP_JAMP(2223) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2681) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2753) - JAMP(33,1) = (-1.000000000000000D+00)*AMP(690)+( - $ -1.000000000000000D+00)*AMP(975)+(-1.000000000000000D+00) - $ *AMP(1654)+(-1.000000000000000D+00)*TMP_JAMP(2114) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2116) - $ +TMP_JAMP(2133)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2137)+TMP_JAMP(2673)+TMP_JAMP(2692) - JAMP(34,1) = (-1.000000000000000D+00)*AMP(323) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(327)+( - $ -1.000000000000000D+00)*AMP(834)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2113)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(2115)+(-1.000000000000000D - $ +00)*TMP_JAMP(2140)+(-1.000000000000000D+00)*TMP_JAMP(2229)+( - $ -1.000000000000000D+00)*TMP_JAMP(2675) - JAMP(35,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(809)+(-1.000000000000000D+00)*AMP(966)+TMP_JAMP(1524) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1938) - $ +(-1.000000000000000D+00)*TMP_JAMP(2117)+TMP_JAMP(2119) - $ +TMP_JAMP(2234)+(-1.000000000000000D+00)*TMP_JAMP(2238)+( - $ -1.000000000000000D+00)*TMP_JAMP(2674)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2683) - JAMP(36,1) = (-1.000000000000000D+00)*AMP(321) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(330) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(342)+( - $ -1.000000000000000D+00)*TMP_JAMP(856)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2122)+(-1.000000000000000D+00) - $ *TMP_JAMP(2173)+TMP_JAMP(2586)+TMP_JAMP(2674)+TMP_JAMP(2675) - JAMP(37,1) = (-1.000000000000000D+00)*AMP(978) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(1220)+( - $ -1.000000000000000D+00)*AMP(1704)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(1385)+(-1.000000000000000D+00) - $ *TMP_JAMP(2124)+(-1.000000000000000D+00)*TMP_JAMP(2127) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2128) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2131) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2135) - $ +TMP_JAMP(2372)+TMP_JAMP(2680) - JAMP(38,1) = (-1.000000000000000D+00)*AMP(959)+AMP(1763) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1297) - $ 
+((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2131) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2147) - $ +TMP_JAMP(2357)+(-1.000000000000000D+00)*TMP_JAMP(2366) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2516) - $ +TMP_JAMP(2677)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2754) - JAMP(39,1) = (-1.000000000000000D+00)*AMP(976) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1515) - $ +(-1.000000000000000D+00)*TMP_JAMP(2132)+TMP_JAMP(2134) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2136) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2138) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2139) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2144) - $ +(-1.000000000000000D+00)*TMP_JAMP(2355)+TMP_JAMP(2370)+( - $ -1.000000000000000D+00)*TMP_JAMP(2678) - JAMP(40,1) = (-1.000000000000000D+00)*AMP(324)+TMP_JAMP(2140) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2143) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2144) - $ +TMP_JAMP(2149)+(-1.000000000000000D+00)*TMP_JAMP(2378) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2381) - $ +TMP_JAMP(2677) - JAMP(41,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(631)+(-1.000000000000000D+00)*AMP(957)+TMP_JAMP(963) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1940) - $ +TMP_JAMP(2146)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2147)+TMP_JAMP(2387)+TMP_JAMP(2392)+( - $ -1.000000000000000D+00)*TMP_JAMP(2678)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2679)+TMP_JAMP(2755) - JAMP(42,1) = (-1.000000000000000D+00)*AMP(319) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(360)+( - $ -1.000000000000000D+00)*AMP(468)+(-1.000000000000000D+00) - $ *TMP_JAMP(958)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1795)+(-1.000000000000000D+00)*TMP_JAMP(2148) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2153) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2189) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2394) - $ +TMP_JAMP(2630)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2679)+TMP_JAMP(2680) - JAMP(43,1) = AMP(549)+(-1.000000000000000D+00)*AMP(969) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(1298) - $ +AMP(1307)+TMP_JAMP(801)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(1333)+TMP_JAMP(1341) - $ +TMP_JAMP(1582)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2154)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2157)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2158)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2163)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2166)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2681)+(-1.000000000000000D+00)*TMP_JAMP(2682) - JAMP(44,1) = AMP(708)+(-1.000000000000000D+00)*AMP(960) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(1289) - $ +TMP_JAMP(610)+(-1.000000000000000D+00)*TMP_JAMP(780) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1296) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1325) - $ +TMP_JAMP(1327)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1939)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2158)+((0.000000000000000D+00,1.000000000000000D+00)) - $ 
*TMP_JAMP(2159)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2160)+TMP_JAMP(2161)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2162)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(2164)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2179) - JAMP(45,1) = (-1.000000000000000D+00)*AMP(967)+( - $ -1.000000000000000D+00)*TMP_JAMP(579)+(-1.000000000000000D+00) - $ *TMP_JAMP(787)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1308)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1314)+TMP_JAMP(1695)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1941)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(2166)+TMP_JAMP(2168) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2177) - $ +(-1.000000000000000D+00)*TMP_JAMP(2544)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*TMP_JAMP(2683) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2684) + $ *TMP_JAMP(949)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1147)+TMP_JAMP(1280)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(1374)+(-1.000000000000000D+00) + $ *AMP(968)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(2067)+(-1.000000000000000D+00)*TMP_JAMP(2130) + $ +TMP_JAMP(2333)+(-1.000000000000000D+00)*TMP_JAMP(2542) + $ +TMP_JAMP(2713)+(-1.000000000000000D+00)*TMP_JAMP(2763) + $ +TMP_JAMP(2854)+TMP_JAMP(2957)+(-1.000000000000000D+00) + $ *TMP_JAMP(3001) + JAMP(33,1) = (-1.000000000000000D+00)*TMP_JAMP(1102)+( + $ -1.000000000000000D+00)*TMP_JAMP(1256)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1391)+(-1.000000000000000D+00) + $ *AMP(975)+(-1.000000000000000D+00)*TMP_JAMP(1688)+( + $ -1.000000000000000D+00)*TMP_JAMP(2556)+TMP_JAMP(2811) + $ +TMP_JAMP(2817)+TMP_JAMP(2882)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(2976)+(-1.000000000000000D+00) + $ *TMP_JAMP(3030) + JAMP(34,1) = (-1.000000000000000D+00)*AMP(323)+TMP_JAMP(419) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(511) + $ +TMP_JAMP(1102)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1147)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1281)+TMP_JAMP(1604)+TMP_JAMP(2553)+( + $ -1.000000000000000D+00)*TMP_JAMP(2813)+TMP_JAMP(2920)+( + $ -1.000000000000000D+00)*TMP_JAMP(2951)+TMP_JAMP(2954)+( + $ -1.000000000000000D+00)*TMP_JAMP(2969) + JAMP(35,1) = (-1.000000000000000D+00)*TMP_JAMP(1001)+( + $ -1.000000000000000D+00)*TMP_JAMP(1022)+(-1.000000000000000D+00) + $ *TMP_JAMP(1033)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1152)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1155)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1224)+(-1.000000000000000D+00)*AMP(966)+TMP_JAMP(1582) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2006) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2105) + $ +TMP_JAMP(2514)+TMP_JAMP(2546)+TMP_JAMP(2695)+( + $ -1.000000000000000D+00)*TMP_JAMP(2712)+(-1.000000000000000D+00) + $ *TMP_JAMP(2875)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(2956)+TMP_JAMP(3001) + JAMP(36,1) = (-1.000000000000000D+00)*AMP(321)+TMP_JAMP(431) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(504) + $ +TMP_JAMP(553)+TMP_JAMP(814)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(839)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(1152)+((0.000000000000000D+00 + $ 
,-1.000000000000000D+00))*TMP_JAMP(1156)+TMP_JAMP(1271) + $ +TMP_JAMP(1520)+TMP_JAMP(1706)+(-1.000000000000000D+00) + $ *TMP_JAMP(1933)+TMP_JAMP(2059)+TMP_JAMP(2515)+TMP_JAMP(2557)+( + $ -1.000000000000000D+00)*TMP_JAMP(2818)+TMP_JAMP(2969) + JAMP(37,1) = TMP_JAMP(786)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(910)+TMP_JAMP(1277) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1346) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1373) + $ +(-1.000000000000000D+00)*AMP(978)+TMP_JAMP(1883) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2066) + $ +TMP_JAMP(2128)+TMP_JAMP(2609)+(-1.000000000000000D+00) + $ *TMP_JAMP(2846)+(-1.000000000000000D+00)*TMP_JAMP(2899)+( + $ -1.000000000000000D+00)*TMP_JAMP(2904) + JAMP(38,1) = ((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(933)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(965)+TMP_JAMP(1005)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(1143)+((0.000000000000000D+00 + $ ,-1.000000000000000D+00))*TMP_JAMP(1148)+((0.000000000000000D + $ +00,-1.000000000000000D+00))*TMP_JAMP(1392)+( + $ -1.000000000000000D+00)*AMP(959)+(-1.000000000000000D+00) + $ *TMP_JAMP(2128)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(2138)+TMP_JAMP(2296)+(-1.000000000000000D+00) + $ *TMP_JAMP(2483)+(-1.000000000000000D+00)*TMP_JAMP(2535)+( + $ -1.000000000000000D+00)*TMP_JAMP(2576)+(-1.000000000000000D+00) + $ *TMP_JAMP(2707)+TMP_JAMP(2712)+TMP_JAMP(2793) + JAMP(39,1) = (-1.000000000000000D+00)*TMP_JAMP(827)+( + $ -1.000000000000000D+00)*TMP_JAMP(1020)+(-1.000000000000000D+00) + $ *TMP_JAMP(1039)+TMP_JAMP(1100)+(-1.000000000000000D+00) + $ *TMP_JAMP(1255)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1346)+(-1.000000000000000D+00)*AMP(976)+TMP_JAMP(1686) + $ +(-1.000000000000000D+00)*TMP_JAMP(1799)+((0.000000000000000D + $ +00,1.000000000000000D+00))*TMP_JAMP(1988)+(-1.000000000000000D + $ +00)*TMP_JAMP(2497)+TMP_JAMP(2591)+(-1.000000000000000D+00) + $ *TMP_JAMP(2687)+TMP_JAMP(2770)+TMP_JAMP(2847)+TMP_JAMP(3030) + JAMP(40,1) = (-1.000000000000000D+00)*AMP(324) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(240) + $ +TMP_JAMP(418)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(510)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(848)+(-1.000000000000000D+00)*TMP_JAMP(1100) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1143) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1282) + $ +TMP_JAMP(1600)+(-1.000000000000000D+00)*TMP_JAMP(1841) + $ +TMP_JAMP(2493)+(-1.000000000000000D+00)*TMP_JAMP(2767) + $ +TMP_JAMP(2919)+TMP_JAMP(2951)+(-1.000000000000000D+00) + $ *TMP_JAMP(2968) + JAMP(41,1) = (-1.000000000000000D+00)*TMP_JAMP(771)+( + $ -1.000000000000000D+00)*TMP_JAMP(1002)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1144)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1159)+((0.000000000000000D+00 + $ ,-1.000000000000000D+00))*TMP_JAMP(1211)+(-1.000000000000000D + $ +00)*TMP_JAMP(1270)+((0.000000000000000D+00,-1.000000000000000D + $ +00))*TMP_JAMP(1311)+(-1.000000000000000D+00)*AMP(957) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1784) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1868) + $ +(-1.000000000000000D+00)*TMP_JAMP(1939)+((0.000000000000000D + $ +00,-1.000000000000000D+00))*TMP_JAMP(2086)+TMP_JAMP(2487) + $ +TMP_JAMP(2707)+(-1.000000000000000D+00)*TMP_JAMP(2713)+( + $ 
-1.000000000000000D+00)*TMP_JAMP(2877)+TMP_JAMP(2884) + JAMP(42,1) = (-1.000000000000000D+00)*AMP(319)+TMP_JAMP(187) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(242) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(303) + $ +(-1.000000000000000D+00)*TMP_JAMP(412)+TMP_JAMP(436) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(839) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1150) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1240) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1311) + $ +TMP_JAMP(1705)+(-1.000000000000000D+00)*TMP_JAMP(1842)+( + $ -1.000000000000000D+00)*TMP_JAMP(1915)+(-1.000000000000000D+00) + $ *TMP_JAMP(1941)+TMP_JAMP(2594)+(-1.000000000000000D+00) + $ *TMP_JAMP(2899)+TMP_JAMP(2968) + JAMP(43,1) = TMP_JAMP(678)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(688)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(949)+TMP_JAMP(1387)+( + $ -1.000000000000000D+00)*AMP(969)+TMP_JAMP(2125)+TMP_JAMP(2127) + $ +(-1.000000000000000D+00)*TMP_JAMP(2481)+TMP_JAMP(2497)+( + $ -1.000000000000000D+00)*TMP_JAMP(2722)+(-1.000000000000000D+00) + $ *TMP_JAMP(2897)+(-1.000000000000000D+00)*TMP_JAMP(2996) + JAMP(44,1) = TMP_JAMP(1384)+(-1.000000000000000D+00)*AMP(960)+( + $ -1.000000000000000D+00)*TMP_JAMP(2126)+(-1.000000000000000D+00) + $ *TMP_JAMP(2127)+(-1.000000000000000D+00)*TMP_JAMP(2535) + $ +TMP_JAMP(2556)+(-1.000000000000000D+00)*TMP_JAMP(2730)+( + $ -1.000000000000000D+00)*TMP_JAMP(3000)+(-1.000000000000000D+00) + $ *TMP_JAMP(3025) + JAMP(45,1) = ((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(728)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(874)+TMP_JAMP(1382)+(-1.000000000000000D+00) + $ *TMP_JAMP(1387)+(-1.000000000000000D+00)*AMP(967)+TMP_JAMP(1824) + $ +(-1.000000000000000D+00)*TMP_JAMP(2088)+((0.000000000000000D + $ +00,1.000000000000000D+00))*TMP_JAMP(2105)+(-1.000000000000000D + $ +00)*TMP_JAMP(2327)+(-1.000000000000000D+00)*TMP_JAMP(2608) + $ +TMP_JAMP(2653)+TMP_JAMP(2778)+(-1.000000000000000D+00) + $ *TMP_JAMP(2883)+TMP_JAMP(3010) JAMP(46,1) = (-1.000000000000000D+00)*AMP(322) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(380) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1312) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2171) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2172) - $ +(-1.000000000000000D+00)*TMP_JAMP(2174)+TMP_JAMP(2185) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2188) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2297) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2684) - JAMP(47,1) = (-1.000000000000000D+00)*AMP(958) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1318) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1324) - $ +TMP_JAMP(1328)+TMP_JAMP(1705)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1945)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2176)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(2179)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2180)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2186)+TMP_JAMP(2732)+( - $ -1.000000000000000D+00)*TMP_JAMP(2755) - JAMP(48,1) = (-1.000000000000000D+00)*AMP(320)+( - $ -1.000000000000000D+00)*AMP(466)+(-1.000000000000000D+00) - $ *AMP(467)+TMP_JAMP(801)+((0.000000000000000D+00, - $ 
-1.000000000000000D+00))*TMP_JAMP(1332)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(1334)+TMP_JAMP(1342) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1710) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2183) - $ +(-1.000000000000000D+00)*TMP_JAMP(2184)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2187)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*TMP_JAMP(2189) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2190) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2479) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2565) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(133) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(292) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(331) + $ +TMP_JAMP(429)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(480)+(-1.000000000000000D+00)*TMP_JAMP(553) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(686) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(848) + $ +(-1.000000000000000D+00)*TMP_JAMP(1382)+(-1.000000000000000D + $ +00)*TMP_JAMP(2031)+(-1.000000000000000D+00)*TMP_JAMP(2060) + $ +TMP_JAMP(2927)+(-1.000000000000000D+00)*TMP_JAMP(3025) + JAMP(47,1) = TMP_JAMP(1129)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(1158)+((0.000000000000000D+00 + $ ,-1.000000000000000D+00))*TMP_JAMP(1303)+(-1.000000000000000D + $ +00)*TMP_JAMP(1384)+(-1.000000000000000D+00)*AMP(958) + $ +TMP_JAMP(1563)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(2086)+(-1.000000000000000D+00)*TMP_JAMP(2089)+( + $ -1.000000000000000D+00)*TMP_JAMP(2364)+TMP_JAMP(2466)+( + $ -1.000000000000000D+00)*TMP_JAMP(2558)+TMP_JAMP(2658) + $ +TMP_JAMP(2824)+(-1.000000000000000D+00)*TMP_JAMP(2931) + $ +TMP_JAMP(3010) + JAMP(48,1) = (-1.000000000000000D+00)*AMP(320) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(331) + $ +TMP_JAMP(411)+TMP_JAMP(430)+(-1.000000000000000D+00) + $ *TMP_JAMP(441)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(503)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1065)+(-1.000000000000000D+00)*TMP_JAMP(1129) + $ +TMP_JAMP(1133)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1244)+TMP_JAMP(1625)+(-1.000000000000000D+00) + $ *TMP_JAMP(1705)+TMP_JAMP(1818)+(-1.000000000000000D+00) + $ *TMP_JAMP(1900)+TMP_JAMP(1972)+TMP_JAMP(2677)+( + $ -1.000000000000000D+00)*TMP_JAMP(2897)+TMP_JAMP(2954) JAMP(49,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(58)+(-1.000000000000000D+00)*AMP(530)+(-1.000000000000000D - $ +00)*AMP(1403)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1352)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2194)+TMP_JAMP(2197)+TMP_JAMP(2201)+TMP_JAMP(2216)+( - $ -1.000000000000000D+00)*TMP_JAMP(2218)+TMP_JAMP(2254) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2686) - JAMP(50,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(55)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(1271)+(-1.000000000000000D+00)*AMP(1397) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(1409) - $ +AMP(1618)+(-1.000000000000000D+00)*AMP(1883)+TMP_JAMP(750) - $ +TMP_JAMP(1346)+(-1.000000000000000D+00)*TMP_JAMP(1359) - $ +TMP_JAMP(1361)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1437)+((0.000000000000000D+00,1.000000000000000D+00)) - $ 
*TMP_JAMP(1441)+TMP_JAMP(1442)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2193)+(-1.000000000000000D+00) - $ *TMP_JAMP(2195)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2196)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2198)+(-1.000000000000000D+00)*TMP_JAMP(2199) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2531) + $ *TMP_JAMP(1393)+(-1.000000000000000D+00)*AMP(1403) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1746) + $ +TMP_JAMP(1892)+(-1.000000000000000D+00)*TMP_JAMP(1939) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2136) + $ +TMP_JAMP(2579)+TMP_JAMP(2630)+(-1.000000000000000D+00) + $ *TMP_JAMP(2836)+TMP_JAMP(2837)+TMP_JAMP(2860)+TMP_JAMP(2990) + JAMP(50,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1405)+(-1.000000000000000D+00)*AMP(1397)+( + $ -1.000000000000000D+00)*TMP_JAMP(1892)+TMP_JAMP(1938) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1977) + $ +TMP_JAMP(2026)+(-1.000000000000000D+00)*TMP_JAMP(2620) + $ +TMP_JAMP(2731)+TMP_JAMP(2783)+TMP_JAMP(2938)+TMP_JAMP(2986) JAMP(51,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(749)+(-1.000000000000000D+00)*AMP(1402)+AMP(1700) - $ +TMP_JAMP(1354)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1484)+(-1.000000000000000D+00)*TMP_JAMP(1488)+( - $ -1.000000000000000D+00)*TMP_JAMP(2201)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2687)+TMP_JAMP(2688) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2757) + $ *TMP_JAMP(1394)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1397)+(-1.000000000000000D+00)*AMP(1402) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1737) + $ +TMP_JAMP(1891)+TMP_JAMP(1937)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(2136)+TMP_JAMP(2575) + $ +TMP_JAMP(2827)+(-1.000000000000000D+00)*TMP_JAMP(2892)+( + $ -1.000000000000000D+00)*TMP_JAMP(2895) JAMP(52,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(497)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(665)+(-1.000000000000000D+00)*AMP(1018)+( - $ -1.000000000000000D+00)*TMP_JAMP(2203)+(-1.000000000000000D+00) - $ *TMP_JAMP(2248)+(-1.000000000000000D+00)*TMP_JAMP(2688)+( - $ -1.000000000000000D+00)*TMP_JAMP(2756) - JAMP(53,1) = (-1.000000000000000D+00)*AMP(514) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(908) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(1271)+( - $ -1.000000000000000D+00)*AMP(1396)+TMP_JAMP(710)+TMP_JAMP(1358) - $ +(-1.000000000000000D+00)*TMP_JAMP(1558)+(-1.000000000000000D - $ +00)*TMP_JAMP(2206)+(-1.000000000000000D+00)*TMP_JAMP(2207) - $ +TMP_JAMP(2208)+(-1.000000000000000D+00)*TMP_JAMP(2277) - $ +TMP_JAMP(2572)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2690)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2694) + $ *TMP_JAMP(1176)+TMP_JAMP(1385)+(-1.000000000000000D+00) + $ *AMP(1018)+(-1.000000000000000D+00)*TMP_JAMP(1619)+( + $ -1.000000000000000D+00)*TMP_JAMP(1891)+TMP_JAMP(2145)+( + $ -1.000000000000000D+00)*TMP_JAMP(2531)+(-1.000000000000000D+00) + $ *TMP_JAMP(2853)+TMP_JAMP(2938)+TMP_JAMP(2988)+TMP_JAMP(3009) + JAMP(53,1) = TMP_JAMP(1415)+(-1.000000000000000D+00)*AMP(1396) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1744) + $ +(-1.000000000000000D+00)*TMP_JAMP(1811)+TMP_JAMP(1890) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1977) + $ 
+((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1978) + $ +(-1.000000000000000D+00)*TMP_JAMP(1994)+TMP_JAMP(2729)+( + $ -1.000000000000000D+00)*TMP_JAMP(2774)+(-1.000000000000000D+00) + $ *TMP_JAMP(2892)+TMP_JAMP(2997) JAMP(54,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(496)+(-1.000000000000000D+00)*AMP(1017)+AMP(1712)+( - $ -1.000000000000000D+00)*AMP(1884)+(-1.000000000000000D+00) - $ *TMP_JAMP(716)+(-1.000000000000000D+00)*TMP_JAMP(1363) - $ +TMP_JAMP(1366)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1660)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2209)+(-1.000000000000000D+00)*TMP_JAMP(2281) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2690) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2721) - $ +TMP_JAMP(2756) - JAMP(55,1) = (-1.000000000000000D+00)*AMP(690)+( - $ -1.000000000000000D+00)*AMP(1406)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*AMP(1412)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1372)+TMP_JAMP(1375) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1386) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2214) - $ +(-1.000000000000000D+00)*TMP_JAMP(2215)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2217)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2219)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*TMP_JAMP(2221)+TMP_JAMP(2224)+( - $ -1.000000000000000D+00)*TMP_JAMP(2246) - JAMP(56,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(55)+(-1.000000000000000D+00)*AMP(1400)+AMP(1620)+( - $ -1.000000000000000D+00)*TMP_JAMP(697)+(-1.000000000000000D+00) - $ *TMP_JAMP(843)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(847)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1126)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1376)+(-1.000000000000000D+00)*TMP_JAMP(1378)+( - $ -1.000000000000000D+00)*TMP_JAMP(1380)+TMP_JAMP(1382) - $ +TMP_JAMP(1383)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1392)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1427)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1438)+TMP_JAMP(2220)+(-1.000000000000000D+00) - $ *TMP_JAMP(2222)+(-1.000000000000000D+00)*TMP_JAMP(2223) - $ +TMP_JAMP(2233)+(-1.000000000000000D+00)*TMP_JAMP(2237) + $ *TMP_JAMP(721)+(-1.000000000000000D+00)*TMP_JAMP(1263) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1295) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1375) + $ +(-1.000000000000000D+00)*AMP(1017)+(-1.000000000000000D+00) + $ *TMP_JAMP(1655)+(-1.000000000000000D+00)*TMP_JAMP(1890) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1986) + $ +(-1.000000000000000D+00)*TMP_JAMP(2145)+TMP_JAMP(2492) + $ +TMP_JAMP(2585)+TMP_JAMP(2675)+(-1.000000000000000D+00) + $ *TMP_JAMP(2714)+(-1.000000000000000D+00)*TMP_JAMP(2836) + $ +TMP_JAMP(2999) + JAMP(55,1) = ((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1063)+TMP_JAMP(1141)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(1177)+(-1.000000000000000D+00) + $ *AMP(1406)+(-1.000000000000000D+00)*TMP_JAMP(1894)+( + $ -1.000000000000000D+00)*TMP_JAMP(2075)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(2108)+(-1.000000000000000D+00) + $ *TMP_JAMP(2578)+TMP_JAMP(2821)+(-1.000000000000000D+00) + $ *TMP_JAMP(2911)+(-1.000000000000000D+00)*TMP_JAMP(2990)+( + $ -1.000000000000000D+00)*TMP_JAMP(3013) + 
JAMP(56,1) = TMP_JAMP(647)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1168)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1205)+(-1.000000000000000D+00) + $ *AMP(1400)+TMP_JAMP(2047)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(2108)+(-1.000000000000000D+00) + $ *TMP_JAMP(2452)+TMP_JAMP(2814)+(-1.000000000000000D+00) + $ *TMP_JAMP(2940)+(-1.000000000000000D+00)*TMP_JAMP(2957)+( + $ -1.000000000000000D+00)*TMP_JAMP(2986)+(-1.000000000000000D+00) + $ *TMP_JAMP(2998) JAMP(57,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(1194)+(-1.000000000000000D+00)*AMP(1404)+AMP(1835) - $ +TMP_JAMP(1388)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1517)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1521)+(-1.000000000000000D+00)*TMP_JAMP(2224)+( - $ -1.000000000000000D+00)*TMP_JAMP(2692)+TMP_JAMP(2693) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2757) - JAMP(58,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(344)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(366)+(-1.000000000000000D+00)*AMP(395)+TMP_JAMP(2228) - $ +TMP_JAMP(2259)+(-1.000000000000000D+00)*TMP_JAMP(2693) - $ +TMP_JAMP(2758) - JAMP(59,1) = (-1.000000000000000D+00)*AMP(675) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(1131) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(1272)+( - $ -1.000000000000000D+00)*AMP(1398)+TMP_JAMP(771)+TMP_JAMP(1395) - $ +(-1.000000000000000D+00)*TMP_JAMP(1583)+(-1.000000000000000D - $ +00)*TMP_JAMP(2232)+(-1.000000000000000D+00)*TMP_JAMP(2234) - $ +TMP_JAMP(2237)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2239)+(-1.000000000000000D+00)*TMP_JAMP(2289) - $ +TMP_JAMP(2588)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2694) - JAMP(60,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(342)+(-1.000000000000000D+00)*AMP(393)+AMP(458)+( - $ -1.000000000000000D+00)*AMP(1803)+(-1.000000000000000D+00) - $ *TMP_JAMP(779)+(-1.000000000000000D+00)*TMP_JAMP(1398)+( - $ -1.000000000000000D+00)*TMP_JAMP(1402)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1697)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(2239)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*TMP_JAMP(2240)+( - $ -1.000000000000000D+00)*TMP_JAMP(2295)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2730)+(-1.000000000000000D+00) - $ *TMP_JAMP(2758) - JAMP(61,1) = (-1.000000000000000D+00)*AMP(1407) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(1481) - $ +AMP(1612)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2245)+(-1.000000000000000D+00)*TMP_JAMP(2246) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2250) - $ +(-1.000000000000000D+00)*TMP_JAMP(2273)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2696)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2759) - JAMP(62,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(665)+(-1.000000000000000D+00)*AMP(1020)+( - $ -1.000000000000000D+00)*TMP_JAMP(2247)+TMP_JAMP(2249) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2250) - $ +TMP_JAMP(2264)+TMP_JAMP(2698)+TMP_JAMP(2711) - JAMP(63,1) = (-1.000000000000000D+00)*AMP(1405) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(1478) - $ +AMP(1613)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1413)+TMP_JAMP(2251)+((0.000000000000000D+00, - $ 
-1.000000000000000D+00))*TMP_JAMP(2253)+TMP_JAMP(2254) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2262) - $ +TMP_JAMP(2266)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2759) - JAMP(64,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(366)+(-1.000000000000000D+00)*AMP(396) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(426) - $ +TMP_JAMP(2258)+TMP_JAMP(2260)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2262)+TMP_JAMP(2271) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2428) - $ +TMP_JAMP(2698) - JAMP(65,1) = (-1.000000000000000D+00)*AMP(1015)+( - $ -1.000000000000000D+00)*TMP_JAMP(2263)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2265)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2267)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(2268)+TMP_JAMP(2304) - $ +TMP_JAMP(2434)+TMP_JAMP(2441)+(-1.000000000000000D+00) - $ *TMP_JAMP(2699)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2701) + $ *TMP_JAMP(1172)+TMP_JAMP(1257)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(1301)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1340)+(-1.000000000000000D+00) + $ *AMP(1404)+TMP_JAMP(1677)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(2142)+(-1.000000000000000D+00) + $ *TMP_JAMP(2820)+TMP_JAMP(2832)+(-1.000000000000000D+00) + $ *TMP_JAMP(2909)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(2976)+TMP_JAMP(3013) + JAMP(58,1) = (-1.000000000000000D+00)*AMP(395)+( + $ -1.000000000000000D+00)*TMP_JAMP(172)+(-1.000000000000000D+00) + $ *TMP_JAMP(419)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(506)+(-1.000000000000000D+00)*TMP_JAMP(994) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1168) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1340) + $ +(-1.000000000000000D+00)*TMP_JAMP(2023)+TMP_JAMP(2543)+( + $ -1.000000000000000D+00)*TMP_JAMP(2642)+(-1.000000000000000D+00) + $ *TMP_JAMP(2806)+(-1.000000000000000D+00)*TMP_JAMP(2838) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2964) + $ +TMP_JAMP(2983) + JAMP(59,1) = (-1.000000000000000D+00)*TMP_JAMP(800) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(893) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1169) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1209) + $ +TMP_JAMP(1377)+(-1.000000000000000D+00)*AMP(1398) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1776) + $ +(-1.000000000000000D+00)*TMP_JAMP(2149)+TMP_JAMP(2729)+( + $ -1.000000000000000D+00)*TMP_JAMP(2819)+(-1.000000000000000D+00) + $ *TMP_JAMP(2937)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(2956)+TMP_JAMP(2998) + JAMP(60,1) = (-1.000000000000000D+00)*AMP(393) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(132) + $ +TMP_JAMP(414)+(-1.000000000000000D+00)*TMP_JAMP(431) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(499) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1302) + $ +(-1.000000000000000D+00)*TMP_JAMP(1377)+TMP_JAMP(1574) + $ +TMP_JAMP(1639)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1979)+TMP_JAMP(2548)+(-1.000000000000000D+00) + $ *TMP_JAMP(2584)+(-1.000000000000000D+00)*TMP_JAMP(2808)+( + $ -1.000000000000000D+00)*TMP_JAMP(2879)+(-1.000000000000000D+00) + $ *TMP_JAMP(2983) + JAMP(61,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) + $ 
*TMP_JAMP(1394)+(-1.000000000000000D+00)*AMP(1407) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2106) + $ +(-1.000000000000000D+00)*TMP_JAMP(2319)+(-1.000000000000000D + $ +00)*TMP_JAMP(2805)+(-1.000000000000000D+00)*TMP_JAMP(2881) + $ +TMP_JAMP(2887)+TMP_JAMP(2912)+(-1.000000000000000D+00) + $ *TMP_JAMP(3017) + JAMP(62,1) = TMP_JAMP(773)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1231)+((0.000000000000000D+00 + $ ,-1.000000000000000D+00))*TMP_JAMP(1288)+((0.000000000000000D + $ +00,1.000000000000000D+00))*TMP_JAMP(1342)+(-1.000000000000000D + $ +00)*AMP(1020)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(2106)+(-1.000000000000000D+00)*TMP_JAMP(2146)+( + $ -1.000000000000000D+00)*TMP_JAMP(2271)+TMP_JAMP(2363) + $ +TMP_JAMP(2437)+TMP_JAMP(2562)+(-1.000000000000000D+00) + $ *TMP_JAMP(2745)+(-1.000000000000000D+00)*TMP_JAMP(2988)+( + $ -1.000000000000000D+00)*TMP_JAMP(3022) + JAMP(63,1) = (-1.000000000000000D+00)*TMP_JAMP(1380)+( + $ -1.000000000000000D+00)*AMP(1405)+TMP_JAMP(1952) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2142) + $ +(-1.000000000000000D+00)*TMP_JAMP(2341)+TMP_JAMP(2452)+( + $ -1.000000000000000D+00)*TMP_JAMP(2687)+(-1.000000000000000D+00) + $ *TMP_JAMP(2724)+TMP_JAMP(2839)+TMP_JAMP(2929)+TMP_JAMP(3017) + JAMP(64,1) = (-1.000000000000000D+00)*AMP(396) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(300) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(311) + $ +(-1.000000000000000D+00)*TMP_JAMP(421)+((0.000000000000000D+00 + $ ,-1.000000000000000D+00))*TMP_JAMP(501)+TMP_JAMP(1380)+( + $ -1.000000000000000D+00)*AMP(945)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(1544)+TMP_JAMP(1683) + $ +TMP_JAMP(1801)+(-1.000000000000000D+00)*TMP_JAMP(2450) + $ +TMP_JAMP(2586)+TMP_JAMP(2720)+TMP_JAMP(2869) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2964) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2975) + JAMP(65,1) = TMP_JAMP(579)+(-1.000000000000000D+00) + $ *TMP_JAMP(1008)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1049)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1218)+(-1.000000000000000D+00)*AMP(1015) + $ +TMP_JAMP(1611)+TMP_JAMP(1862)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1901)+TMP_JAMP(2273)+( + $ -1.000000000000000D+00)*TMP_JAMP(2441)+TMP_JAMP(3022) + $ +TMP_JAMP(3028) JAMP(66,1) = (-1.000000000000000D+00)*AMP(391) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(397) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(418)+( - $ -1.000000000000000D+00)*AMP(474)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1824)+(-1.000000000000000D+00) - $ *TMP_JAMP(2270)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2272)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2274)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2275)+TMP_JAMP(2312)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2314)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2446)+TMP_JAMP(2699) - JAMP(67,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(908)+(-1.000000000000000D+00)*AMP(1401)+( - $ -1.000000000000000D+00)*TMP_JAMP(699)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(701)+(-1.000000000000000D+00) - $ *TMP_JAMP(889)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1427)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ 
*TMP_JAMP(1466)+TMP_JAMP(1473)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1719)+TMP_JAMP(2276)+( - $ -1.000000000000000D+00)*TMP_JAMP(2278)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2279)+(-1.000000000000000D+00) - $ *TMP_JAMP(2292)+TMP_JAMP(2734)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2760) - JAMP(68,1) = (-1.000000000000000D+00)*AMP(1019) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(1056) - $ +TMP_JAMP(855)+TMP_JAMP(1664)+TMP_JAMP(2280) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2282) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2286) - $ +TMP_JAMP(2291)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2301)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2760) - JAMP(69,1) = (-1.000000000000000D+00)*AMP(705)+( - $ -1.000000000000000D+00)*AMP(1399)+(-1.000000000000000D+00) - $ *AMP(1861)+(-1.000000000000000D+00)*TMP_JAMP(543)+TMP_JAMP(751) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(754) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1439) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1440) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1447) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1734) - $ +TMP_JAMP(2288)+TMP_JAMP(2290)+(-1.000000000000000D+00) - $ *TMP_JAMP(2291)+TMP_JAMP(2292)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2293)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2299)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2302)+(-1.000000000000000D+00) - $ *TMP_JAMP(2581) - JAMP(70,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(384)+(-1.000000000000000D+00)*AMP(394) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(435) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1445) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1448) - $ +TMP_JAMP(1700)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2294)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2296)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2298)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2299)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2306)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2702) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(130) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(302) + $ +(-1.000000000000000D+00)*TMP_JAMP(417)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(479)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(502)+(-1.000000000000000D+00) + $ *TMP_JAMP(579)+(-1.000000000000000D+00)*TMP_JAMP(1418) + $ +TMP_JAMP(1707)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1747)+(-1.000000000000000D+00)*TMP_JAMP(1920)+( + $ -1.000000000000000D+00)*TMP_JAMP(2584)+TMP_JAMP(2887)+( + $ -1.000000000000000D+00)*TMP_JAMP(2914)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(2975) + JAMP(67,1) = (-1.000000000000000D+00)*AMP(1401)+( + $ -1.000000000000000D+00)*TMP_JAMP(1626)+(-1.000000000000000D+00) + $ *TMP_JAMP(2144)+(-1.000000000000000D+00)*TMP_JAMP(2452)+( + $ -1.000000000000000D+00)*TMP_JAMP(2678)+TMP_JAMP(2768) + $ +TMP_JAMP(2906)+(-1.000000000000000D+00)*TMP_JAMP(2997)+( + $ -1.000000000000000D+00)*TMP_JAMP(3004) + JAMP(68,1) = ((0.000000000000000D+00,1.000000000000000D+00)) + $ 
*TMP_JAMP(1055)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1058)+TMP_JAMP(1275)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1342)+(-1.000000000000000D+00) + $ *AMP(1019)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(2116)+TMP_JAMP(2144)+TMP_JAMP(2297)+( + $ -1.000000000000000D+00)*TMP_JAMP(2341)+TMP_JAMP(2426)+( + $ -1.000000000000000D+00)*TMP_JAMP(2486)+TMP_JAMP(2794)+( + $ -1.000000000000000D+00)*TMP_JAMP(2999)+TMP_JAMP(3016) + JAMP(69,1) = (-1.000000000000000D+00)*TMP_JAMP(1413)+( + $ -1.000000000000000D+00)*AMP(1399)+TMP_JAMP(2042)+TMP_JAMP(2149) + $ +TMP_JAMP(2578)+TMP_JAMP(2679)+TMP_JAMP(2731)+( + $ -1.000000000000000D+00)*TMP_JAMP(2800)+(-1.000000000000000D+00) + $ *TMP_JAMP(2883)+TMP_JAMP(3004) + JAMP(70,1) = AMP(370)+(-1.000000000000000D+00)*AMP(394) + $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(435)+( + $ -1.000000000000000D+00)*TMP_JAMP(170)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(290)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(298)+(-1.000000000000000D+00) + $ *TMP_JAMP(414)+(-1.000000000000000D+00)*TMP_JAMP(434) + $ +TMP_JAMP(1413)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1738)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1743)+TMP_JAMP(2522)+(-1.000000000000000D+00) + $ *TMP_JAMP(2575)+TMP_JAMP(2586)+TMP_JAMP(2791)+TMP_JAMP(2925)+( + $ -1.000000000000000D+00)*TMP_JAMP(2961) JAMP(71,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(664)+(-1.000000000000000D+00)*AMP(1016) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(1051) - $ +TMP_JAMP(870)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1455)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2300)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2302)+TMP_JAMP(2303)+TMP_JAMP(2305) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2306) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2316) - $ +TMP_JAMP(2595)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2701) + $ *TMP_JAMP(1176)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1296)+(-1.000000000000000D+00)*AMP(1016) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2091) + $ +TMP_JAMP(2343)+(-1.000000000000000D+00)*TMP_JAMP(2800)+( + $ -1.000000000000000D+00)*TMP_JAMP(2945)+(-1.000000000000000D+00) + $ *TMP_JAMP(3016)+(-1.000000000000000D+00)*TMP_JAMP(3028) JAMP(72,1) = (-1.000000000000000D+00)*AMP(392) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(427) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1465) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1757) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2309) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2310) - $ +(-1.000000000000000D+00)*TMP_JAMP(2311)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2313)+TMP_JAMP(2315) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2316) - $ +TMP_JAMP(2444)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2702) - JAMP(73,1) = (-1.000000000000000D+00)*AMP(1424)+( - $ -1.000000000000000D+00)*AMP(1512)+TMP_JAMP(682) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(696) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1476) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1562) - $ +TMP_JAMP(1567)+TMP_JAMP(2317)+TMP_JAMP(2318)+TMP_JAMP(2321)+( - $ 
-1.000000000000000D+00)*TMP_JAMP(2323)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2352)+(-1.000000000000000D+00) - $ *TMP_JAMP(2761) - JAMP(74,1) = (-1.000000000000000D+00)*AMP(1418) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(1430) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1125) - $ +TMP_JAMP(1496)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2319)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2320)+(-1.000000000000000D+00)*TMP_JAMP(2321) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2322) - $ +TMP_JAMP(2340)+TMP_JAMP(2358)+TMP_JAMP(2361)+TMP_JAMP(2364) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2450) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2471) - JAMP(75,1) = (-1.000000000000000D+00)*AMP(1423) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1482) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1958) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2324) - $ +(-1.000000000000000D+00)*TMP_JAMP(2328)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*TMP_JAMP(2403) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2705) - $ +TMP_JAMP(2707)+TMP_JAMP(2761) - JAMP(76,1) = (-1.000000000000000D+00)*AMP(1027)+AMP(1717)+( - $ -1.000000000000000D+00)*TMP_JAMP(825)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(1131)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(1625)+(-1.000000000000000D - $ +00)*TMP_JAMP(2331)+((0.000000000000000D+00,-1.000000000000000D - $ +00))*TMP_JAMP(2333)+((0.000000000000000D+00,1.000000000000000D - $ +00))*TMP_JAMP(2334)+(-1.000000000000000D+00)*TMP_JAMP(2335) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2347) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2396) - $ +TMP_JAMP(2408)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2705) - JAMP(77,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(920)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(1283)+(-1.000000000000000D+00)*AMP(1417)+TMP_JAMP(724) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1577) - $ +(-1.000000000000000D+00)*TMP_JAMP(1583)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2336)+TMP_JAMP(2337)+( - $ -1.000000000000000D+00)*TMP_JAMP(2339)+TMP_JAMP(2341) - $ +TMP_JAMP(2342)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2386)+(-1.000000000000000D+00)*TMP_JAMP(2455) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2706) - JAMP(78,1) = (-1.000000000000000D+00)*AMP(1026)+( - $ -1.000000000000000D+00)*TMP_JAMP(738)+TMP_JAMP(1524) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1530) - $ +(-1.000000000000000D+00)*TMP_JAMP(1629)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*TMP_JAMP(2343) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2346) - $ +TMP_JAMP(2348)+(-1.000000000000000D+00)*TMP_JAMP(2349) - $ +TMP_JAMP(2350)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2433)+(-1.000000000000000D+00)*TMP_JAMP(2462)+( - $ -1.000000000000000D+00)*TMP_JAMP(2522)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2706) - JAMP(79,1) = (-1.000000000000000D+00)*AMP(1427)+( - $ -1.000000000000000D+00)*TMP_JAMP(667)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(675)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1502)+((0.000000000000000D+00 - $ 
,-1.000000000000000D+00))*TMP_JAMP(1503)+(-1.000000000000000D - $ +00)*TMP_JAMP(1966)+TMP_JAMP(2351)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2352)+(-1.000000000000000D+00) - $ *TMP_JAMP(2354)+(-1.000000000000000D+00)*TMP_JAMP(2355) - $ +TMP_JAMP(2362)+(-1.000000000000000D+00)*TMP_JAMP(2368)+( - $ -1.000000000000000D+00)*TMP_JAMP(2707)+(-1.000000000000000D+00) - $ *TMP_JAMP(2708) - JAMP(80,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(73)+(-1.000000000000000D+00)*AMP(1421) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(1433) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1124) - $ +TMP_JAMP(1532)+(-1.000000000000000D+00)*TMP_JAMP(2356) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2359) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2360) - $ +(-1.000000000000000D+00)*TMP_JAMP(2362)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2363)+TMP_JAMP(2365) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2367) - $ +(-1.000000000000000D+00)*TMP_JAMP(2389)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2436)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*TMP_JAMP(2457) - JAMP(81,1) = (-1.000000000000000D+00)*AMP(1425) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1513) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1514) - $ +(-1.000000000000000D+00)*TMP_JAMP(2369)+TMP_JAMP(2372)+( - $ -1.000000000000000D+00)*TMP_JAMP(2374)+TMP_JAMP(2405)+( - $ -1.000000000000000D+00)*TMP_JAMP(2417)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2419)+TMP_JAMP(2708)+( - $ -1.000000000000000D+00)*TMP_JAMP(2709) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(299) + $ +TMP_JAMP(416)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(452)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(467)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(507)+(-1.000000000000000D+00)*TMP_JAMP(1307) + $ +TMP_JAMP(1665)+(-1.000000000000000D+00)*TMP_JAMP(1706) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1748) + $ +TMP_JAMP(2049)+(-1.000000000000000D+00)*TMP_JAMP(2838) + $ +TMP_JAMP(2906)+TMP_JAMP(2945)+TMP_JAMP(2961) + JAMP(73,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(584)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1396)+TMP_JAMP(1582)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(1713)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1761)+((0.000000000000000D+00 + $ ,-1.000000000000000D+00))*TMP_JAMP(1764)+TMP_JAMP(1895)+( + $ -1.000000000000000D+00)*TMP_JAMP(1932)+(-1.000000000000000D+00) + $ *AMP(1424)+TMP_JAMP(2569)+(-1.000000000000000D+00) + $ *TMP_JAMP(2652)+TMP_JAMP(2683)+TMP_JAMP(2786)+TMP_JAMP(2796) + $ +TMP_JAMP(2902) + JAMP(74,1) = TMP_JAMP(2027)+TMP_JAMP(2042)+(-1.000000000000000D + $ +00)*AMP(1418)+TMP_JAMP(2383)+TMP_JAMP(2580)+( + $ -1.000000000000000D+00)*TMP_JAMP(2683)+TMP_JAMP(2735)+( + $ -1.000000000000000D+00)*TMP_JAMP(2798)+(-1.000000000000000D+00) + $ *TMP_JAMP(2932)+TMP_JAMP(2942)+TMP_JAMP(3008) + JAMP(75,1) = TMP_JAMP(1015)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(1196)+(-1.000000000000000D+00) + $ *TMP_JAMP(1383)+(-1.000000000000000D+00)*TMP_JAMP(1386) + $ +TMP_JAMP(1860)+(-1.000000000000000D+00)*TMP_JAMP(1863)+( + $ -1.000000000000000D+00)*TMP_JAMP(1895)+TMP_JAMP(1899)+( + $ -1.000000000000000D+00)*AMP(1423)+TMP_JAMP(2627)+TMP_JAMP(2780) + $ 
+(-1.000000000000000D+00)*TMP_JAMP(2895)+(-1.000000000000000D + $ +00)*TMP_JAMP(2936) + JAMP(76,1) = (-1.000000000000000D+00)*TMP_JAMP(1038)+( + $ -1.000000000000000D+00)*TMP_JAMP(1107)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(1185)+((0.000000000000000D+00 + $ ,-1.000000000000000D+00))*TMP_JAMP(1203)+(-1.000000000000000D + $ +00)*AMP(1027)+(-1.000000000000000D+00)*TMP_JAMP(1899) + $ +TMP_JAMP(2043)+(-1.000000000000000D+00)*TMP_JAMP(2095)+( + $ -1.000000000000000D+00)*TMP_JAMP(2328)+TMP_JAMP(2458)+( + $ -1.000000000000000D+00)*TMP_JAMP(2611)+TMP_JAMP(2649)+( + $ -1.000000000000000D+00)*TMP_JAMP(2684)+TMP_JAMP(2779) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2943) + $ +TMP_JAMP(3009) + JAMP(77,1) = (-1.000000000000000D+00)*TMP_JAMP(800) + $ +TMP_JAMP(1631)+(-1.000000000000000D+00)*TMP_JAMP(1812) + $ +TMP_JAMP(1898)+(-1.000000000000000D+00)*AMP(1417)+( + $ -1.000000000000000D+00)*TMP_JAMP(2332)+TMP_JAMP(2537) + $ +TMP_JAMP(2932)+(-1.000000000000000D+00)*TMP_JAMP(2936)+( + $ -1.000000000000000D+00)*TMP_JAMP(2972)+TMP_JAMP(3023) + JAMP(78,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1216)+(-1.000000000000000D+00)*TMP_JAMP(1264)+( + $ -1.000000000000000D+00)*AMP(1026)+(-1.000000000000000D+00) + $ *TMP_JAMP(1494)+(-1.000000000000000D+00)*TMP_JAMP(1633) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1764) + $ +(-1.000000000000000D+00)*TMP_JAMP(1898)+TMP_JAMP(2095)+( + $ -1.000000000000000D+00)*TMP_JAMP(2336)+(-1.000000000000000D+00) + $ *TMP_JAMP(2426)+TMP_JAMP(2511)+TMP_JAMP(2552)+TMP_JAMP(2685)+( + $ -1.000000000000000D+00)*TMP_JAMP(2874)+TMP_JAMP(2958) + $ +TMP_JAMP(3002) + JAMP(79,1) = ((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(584)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1187)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1200)+TMP_JAMP(1626)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1849)+(-1.000000000000000D+00) + $ *TMP_JAMP(1883)+(-1.000000000000000D+00)*TMP_JAMP(2036)+( + $ -1.000000000000000D+00)*AMP(1427)+TMP_JAMP(2489)+( + $ -1.000000000000000D+00)*TMP_JAMP(2505)+(-1.000000000000000D+00) + $ *TMP_JAMP(2570)+(-1.000000000000000D+00)*TMP_JAMP(2630) + $ +TMP_JAMP(2645)+TMP_JAMP(2686)+(-1.000000000000000D+00) + $ *TMP_JAMP(2797)+(-1.000000000000000D+00)*TMP_JAMP(3011) + JAMP(80,1) = TMP_JAMP(643)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1207)+((0.000000000000000D+00 + $ ,-1.000000000000000D+00))*TMP_JAMP(1291)+TMP_JAMP(2037) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2138) + $ +(-1.000000000000000D+00)*AMP(1421)+(-1.000000000000000D+00) + $ *TMP_JAMP(2250)+(-1.000000000000000D+00)*TMP_JAMP(2381)+( + $ -1.000000000000000D+00)*TMP_JAMP(2686)+(-1.000000000000000D+00) + $ *TMP_JAMP(2699)+TMP_JAMP(2905)+TMP_JAMP(2987)+( + $ -1.000000000000000D+00)*TMP_JAMP(3008) + JAMP(81,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1188)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1201)+TMP_JAMP(1269)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1349)+((0.000000000000000D+00 + $ ,-1.000000000000000D+00))*TMP_JAMP(1987)+TMP_JAMP(2020)+( + $ -1.000000000000000D+00)*TMP_JAMP(2141)+(-1.000000000000000D+00) + $ *AMP(1425)+(-1.000000000000000D+00)*TMP_JAMP(2773) + $ +TMP_JAMP(2864)+(-1.000000000000000D+00)*TMP_JAMP(2909) + $ +TMP_JAMP(3011) JAMP(82,1) = (-1.000000000000000D+00)*AMP(404) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(408) - $ 
+AMP(460)+TMP_JAMP(856)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1133)+(-1.000000000000000D+00) - $ *TMP_JAMP(1524)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1635)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2376)+TMP_JAMP(2377)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2379)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2380)+(-1.000000000000000D+00) - $ *TMP_JAMP(2382)+TMP_JAMP(2425)+TMP_JAMP(2709) - JAMP(83,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(921)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(1209)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(1284)+(-1.000000000000000D+00)*AMP(1419)+TMP_JAMP(786)+( - $ -1.000000000000000D+00)*TMP_JAMP(1558)+(-1.000000000000000D+00) - $ *TMP_JAMP(2383)+TMP_JAMP(2384)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2385)+TMP_JAMP(2388) - $ +TMP_JAMP(2390)+(-1.000000000000000D+00)*TMP_JAMP(2391)+( - $ -1.000000000000000D+00)*TMP_JAMP(2469)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2710) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(305) + $ +TMP_JAMP(426)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(505)+(-1.000000000000000D+00)*TMP_JAMP(555)+( + $ -1.000000000000000D+00)*TMP_JAMP(992)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(1349)+(-1.000000000000000D+00) + $ *TMP_JAMP(2023)+(-1.000000000000000D+00)*TMP_JAMP(2306)+( + $ -1.000000000000000D+00)*TMP_JAMP(2698)+(-1.000000000000000D+00) + $ *TMP_JAMP(2872)+TMP_JAMP(2922)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(2967)+TMP_JAMP(2987) + JAMP(83,1) = (-1.000000000000000D+00)*TMP_JAMP(784) + $ +TMP_JAMP(1020)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1053)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1195)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1212)+TMP_JAMP(1268)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(1868)+((0.000000000000000D+00 + $ ,-1.000000000000000D+00))*TMP_JAMP(2011)+(-1.000000000000000D + $ +00)*AMP(1419)+TMP_JAMP(2451)+TMP_JAMP(2699)+( + $ -1.000000000000000D+00)*TMP_JAMP(2772)+TMP_JAMP(2917)+( + $ -1.000000000000000D+00)*TMP_JAMP(2939)+(-1.000000000000000D+00) + $ *TMP_JAMP(2965) JAMP(84,1) = (-1.000000000000000D+00)*AMP(402) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(408) - $ +TMP_JAMP(803)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1535)+(-1.000000000000000D+00)*TMP_JAMP(1644) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2393) - $ +TMP_JAMP(2397)+(-1.000000000000000D+00)*TMP_JAMP(2398)+( - $ -1.000000000000000D+00)*TMP_JAMP(2399)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2400)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(2443)+(-1.000000000000000D - $ +00)*TMP_JAMP(2481)+(-1.000000000000000D+00)*TMP_JAMP(2558) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2710) - JAMP(85,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(77)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(590)+(-1.000000000000000D+00)*AMP(1428) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(1481)+( - $ -1.000000000000000D+00)*AMP(1701)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1541)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(1542)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(1543)+TMP_JAMP(1551) - $ 
+((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2401) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2402) - $ +TMP_JAMP(2404)+(-1.000000000000000D+00)*TMP_JAMP(2405) - $ +TMP_JAMP(2410)+(-1.000000000000000D+00)*TMP_JAMP(2416) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2422) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2452) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(133) + $ +(-1.000000000000000D+00)*TMP_JAMP(181)+TMP_JAMP(424)+( + $ -1.000000000000000D+00)*TMP_JAMP(436)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(500)+(-1.000000000000000D+00) + $ *TMP_JAMP(556)+TMP_JAMP(658)+(-1.000000000000000D+00) + $ *TMP_JAMP(1013)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1241)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1564)+TMP_JAMP(1617)+TMP_JAMP(1662)+( + $ -1.000000000000000D+00)*TMP_JAMP(2275)+TMP_JAMP(2489)+( + $ -1.000000000000000D+00)*TMP_JAMP(2761)+(-1.000000000000000D+00) + $ *TMP_JAMP(2880)+(-1.000000000000000D+00)*TMP_JAMP(2922) + $ +TMP_JAMP(2965) + JAMP(85,1) = TMP_JAMP(1386)+(-1.000000000000000D+00)*AMP(1428)+( + $ -1.000000000000000D+00)*TMP_JAMP(2372)+TMP_JAMP(2387) + $ +TMP_JAMP(2393)+TMP_JAMP(2427)+(-1.000000000000000D+00) + $ *TMP_JAMP(2467)+(-1.000000000000000D+00)*TMP_JAMP(2505)+( + $ -1.000000000000000D+00)*TMP_JAMP(2599)+(-1.000000000000000D+00) + $ *TMP_JAMP(2755)+(-1.000000000000000D+00)*TMP_JAMP(2912) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2977) + $ +(-1.000000000000000D+00)*TMP_JAMP(3005) JAMP(86,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(826)+(-1.000000000000000D+00)*AMP(1029)+AMP(1718) - $ +TMP_JAMP(637)+TMP_JAMP(829)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(832)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1604)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(2406)+TMP_JAMP(2407)+( - $ -1.000000000000000D+00)*TMP_JAMP(2408)+(-1.000000000000000D+00) - $ *TMP_JAMP(2409)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2411)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2414)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2435)+(-1.000000000000000D+00)*TMP_JAMP(2711) - JAMP(87,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(1116)+(-1.000000000000000D+00)*AMP(1426) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(1478) - $ +AMP(1834)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1561)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1563)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1564)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1572)+TMP_JAMP(2415)+TMP_JAMP(2417) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2418) - $ +(-1.000000000000000D+00)*TMP_JAMP(2420)+(-1.000000000000000D - $ +00)*TMP_JAMP(2421)+TMP_JAMP(2423)+(-1.000000000000000D+00) - $ *TMP_JAMP(2427)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2439) - JAMP(88,1) = (-1.000000000000000D+00)*AMP(405)+AMP(461)+( - $ -1.000000000000000D+00)*TMP_JAMP(656)+(-1.000000000000000D+00) - $ *TMP_JAMP(858)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(862)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1576)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1577)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1581)+TMP_JAMP(1582)+TMP_JAMP(1584) - $ 
+((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1616) - $ +(-1.000000000000000D+00)*TMP_JAMP(2424)+(-1.000000000000000D - $ +00)*TMP_JAMP(2425)+(-1.000000000000000D+00)*TMP_JAMP(2426) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2428) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2429) - $ +TMP_JAMP(2430)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2451) - JAMP(89,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(826)+(-1.000000000000000D+00)*AMP(1024) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(1042)+( - $ -1.000000000000000D+00)*AMP(1717)+(-1.000000000000000D+00) - $ *TMP_JAMP(870)+TMP_JAMP(1590)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1624)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2431)+(-1.000000000000000D+00) - $ *TMP_JAMP(2432)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2433)+(-1.000000000000000D+00)*TMP_JAMP(2437)+( - $ -1.000000000000000D+00)*TMP_JAMP(2438)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2439)+(-1.000000000000000D+00) - $ *TMP_JAMP(2440)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2448)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2715) - JAMP(90,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(64)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(385)+(-1.000000000000000D+00)*AMP(400) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(406) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(418)+( - $ -1.000000000000000D+00)*AMP(460)+(-1.000000000000000D+00) - $ *TMP_JAMP(1596)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1633)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2442)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2443)+(-1.000000000000000D+00)*TMP_JAMP(2444) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2445) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2447) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2449) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2452) - $ +TMP_JAMP(2453)+(-1.000000000000000D+00)*TMP_JAMP(2599) - JAMP(91,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(920)+AMP(1152)+(-1.000000000000000D+00)*AMP(1422) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(1463)+( - $ -1.000000000000000D+00)*AMP(1807)+(-1.000000000000000D+00) - $ *TMP_JAMP(631)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1767)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2454)+(-1.000000000000000D+00)*TMP_JAMP(2456) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2458) - $ +TMP_JAMP(2460)+(-1.000000000000000D+00)*TMP_JAMP(2465) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2762) - JAMP(92,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(827)+(-1.000000000000000D+00)*AMP(1028)+( - $ -1.000000000000000D+00)*AMP(1765)+(-1.000000000000000D+00) - $ *TMP_JAMP(635)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1603)+(-1.000000000000000D+00)*TMP_JAMP(2459) - $ +TMP_JAMP(2461)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2463)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2464)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2466)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2614)+((0.000000000000000D+00,1.000000000000000D+00)) - $ 
*TMP_JAMP(2763) - JAMP(93,1) = AMP(627)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*AMP(1284)+(-1.000000000000000D+00) - $ *AMP(1420)+TMP_JAMP(645)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2468)+TMP_JAMP(2470) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2473) - $ +TMP_JAMP(2477)+(-1.000000000000000D+00)*TMP_JAMP(2485) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2743) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2762) - JAMP(94,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(387)+(-1.000000000000000D+00)*AMP(403) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(444) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1615) - $ +(-1.000000000000000D+00)*TMP_JAMP(2476)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*TMP_JAMP(2478)+TMP_JAMP(2480) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2482) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2483) - $ +TMP_JAMP(2484)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2486)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2492)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2497)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2500) - JAMP(95,1) = (-1.000000000000000D+00)*AMP(1025)+( - $ -1.000000000000000D+00)*AMP(1069)+(-1.000000000000000D+00) - $ *TMP_JAMP(663)+(-1.000000000000000D+00)*TMP_JAMP(665) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1623) - $ +TMP_JAMP(1629)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1809)+TMP_JAMP(2636)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2715)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(2763)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2764) - JAMP(96,1) = (-1.000000000000000D+00)*AMP(401) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(406)+( - $ -1.000000000000000D+00)*AMP(462)+TMP_JAMP(676)+TMP_JAMP(680) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1632) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1634) - $ +TMP_JAMP(1644)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2491)+(-1.000000000000000D+00)*TMP_JAMP(2494) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2496) - $ +TMP_JAMP(2498)+(-1.000000000000000D+00)*TMP_JAMP(2641) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2764) + $ *TMP_JAMP(951)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(983)+TMP_JAMP(1107)+TMP_JAMP(1127) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1204) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1290) + $ +(-1.000000000000000D+00)*AMP(1029)+TMP_JAMP(2146)+( + $ -1.000000000000000D+00)*TMP_JAMP(2480)+TMP_JAMP(2499)+( + $ -1.000000000000000D+00)*TMP_JAMP(2721)+(-1.000000000000000D+00) + $ *TMP_JAMP(2896)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(2977)+(-1.000000000000000D+00)*TMP_JAMP(2995) + JAMP(87,1) = (-1.000000000000000D+00)*TMP_JAMP(1379)+( + $ -1.000000000000000D+00)*TMP_JAMP(1953)+TMP_JAMP(2141)+( + $ -1.000000000000000D+00)*AMP(1426)+TMP_JAMP(2247)+TMP_JAMP(2403) + $ +TMP_JAMP(2882)+TMP_JAMP(2902)+(-1.000000000000000D+00) + $ *TMP_JAMP(2929)+TMP_JAMP(3005) + JAMP(88,1) = (-1.000000000000000D+00)*AMP(405)+( + $ -1.000000000000000D+00)*TMP_JAMP(176)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(291)+((0.000000000000000D+00, + $ 
-1.000000000000000D+00))*TMP_JAMP(294)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(312)+TMP_JAMP(422) + $ +TMP_JAMP(1379)+TMP_JAMP(1604)+(-1.000000000000000D+00) + $ *TMP_JAMP(1684)+(-1.000000000000000D+00)*TMP_JAMP(1802)+( + $ -1.000000000000000D+00)*TMP_JAMP(2438)+TMP_JAMP(2512)+( + $ -1.000000000000000D+00)*TMP_JAMP(2715)+TMP_JAMP(2873) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2967) + $ +TMP_JAMP(2982) + JAMP(89,1) = TMP_JAMP(258)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(318)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(476)+TMP_JAMP(1007) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1052) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1219) + $ +(-1.000000000000000D+00)*AMP(1024)+TMP_JAMP(1696)+( + $ -1.000000000000000D+00)*TMP_JAMP(1722)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1858)+((0.000000000000000D+00 + $ ,-1.000000000000000D+00))*TMP_JAMP(1901)+(-1.000000000000000D + $ +00)*TMP_JAMP(2135)+TMP_JAMP(2443)+(-1.000000000000000D+00) + $ *TMP_JAMP(2495)+TMP_JAMP(2646)+TMP_JAMP(2995)+TMP_JAMP(3003) + JAMP(90,1) = (-1.000000000000000D+00)*AMP(400)+( + $ -1.000000000000000D+00)*AMP(419)+(-1.000000000000000D+00) + $ *TMP_JAMP(109)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(240)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(295)+(-1.000000000000000D+00)*TMP_JAMP(427) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(466) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(502) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(522) + $ +TMP_JAMP(1138)+(-1.000000000000000D+00)*TMP_JAMP(1703)+( + $ -1.000000000000000D+00)*TMP_JAMP(2044)+(-1.000000000000000D+00) + $ *TMP_JAMP(2308)+TMP_JAMP(2427)+(-1.000000000000000D+00) + $ *TMP_JAMP(2444)+(-1.000000000000000D+00)*TMP_JAMP(2650)+( + $ -1.000000000000000D+00)*TMP_JAMP(2982)+(-1.000000000000000D+00) + $ *TMP_JAMP(3003) + JAMP(91,1) = TMP_JAMP(647)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(2113)+(-1.000000000000000D+00) + $ *AMP(1422)+TMP_JAMP(2369)+TMP_JAMP(2502)+(-1.000000000000000D + $ +00)*TMP_JAMP(2941)+(-1.000000000000000D+00)*TMP_JAMP(3023)+( + $ -1.000000000000000D+00)*TMP_JAMP(3024) + JAMP(92,1) = ((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(985)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1204)+TMP_JAMP(1261)+TMP_JAMP(1280) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1350) + $ +(-1.000000000000000D+00)*AMP(1028)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(2113)+(-1.000000000000000D+00) + $ *TMP_JAMP(2143)+TMP_JAMP(2334)+(-1.000000000000000D+00) + $ *TMP_JAMP(2545)+TMP_JAMP(2714)+(-1.000000000000000D+00) + $ *TMP_JAMP(2762)+TMP_JAMP(2857)+(-1.000000000000000D+00) + $ *TMP_JAMP(3002) + JAMP(93,1) = ((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1769)+(-1.000000000000000D+00)*AMP(1420)+( + $ -1.000000000000000D+00)*AMP(1889)+TMP_JAMP(2465)+TMP_JAMP(2476) + $ +(-1.000000000000000D+00)*TMP_JAMP(2625)+(-1.000000000000000D + $ +00)*TMP_JAMP(2917)+TMP_JAMP(2928)+(-1.000000000000000D+00) + $ *TMP_JAMP(2931)+TMP_JAMP(2950)+TMP_JAMP(3024) + JAMP(94,1) = (-1.000000000000000D+00)*AMP(403) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(290) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(307) + $ +(-1.000000000000000D+00)*TMP_JAMP(424)+TMP_JAMP(430) + $ 
+TMP_JAMP(532)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1766)+TMP_JAMP(2512)+(-1.000000000000000D+00) + $ *TMP_JAMP(2756)+TMP_JAMP(2857)+TMP_JAMP(2924)+( + $ -1.000000000000000D+00)*TMP_JAMP(2950)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(2966) + JAMP(95,1) = AMP(150)+((0.000000000000000D+00,1.000000000000000D + $ +00))*TMP_JAMP(237)+(-1.000000000000000D+00)*TMP_JAMP(1043) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1250) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1350) + $ +(-1.000000000000000D+00)*AMP(1025)+TMP_JAMP(2135) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2148) + $ +(-1.000000000000000D+00)*TMP_JAMP(2355)+(-1.000000000000000D + $ +00)*TMP_JAMP(2381)+TMP_JAMP(2757)+TMP_JAMP(2779)+( + $ -1.000000000000000D+00)*TMP_JAMP(3021) + JAMP(96,1) = (-1.000000000000000D+00)*AMP(401)+( + $ -1.000000000000000D+00)*AMP(437)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(222)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(310)+TMP_JAMP(427) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(478) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(504) + $ +(-1.000000000000000D+00)*TMP_JAMP(1307)+((0.000000000000000D + $ +00,1.000000000000000D+00))*TMP_JAMP(1566)+(-1.000000000000000D + $ +00)*TMP_JAMP(2367)+TMP_JAMP(2502)+TMP_JAMP(2651)+( + $ -1.000000000000000D+00)*TMP_JAMP(2913)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(2966)+TMP_JAMP(3021) JAMP(97,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(934)+(-1.000000000000000D+00)*AMP(1445)+AMP(1637) - $ +TMP_JAMP(697)+TMP_JAMP(732)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(1168)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(1647)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(1649)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(1652)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*TMP_JAMP(1655) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1690) - $ +TMP_JAMP(1976)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2718)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2727)+TMP_JAMP(2765) + $ *TMP_JAMP(1399)+(-1.000000000000000D+00)*TMP_JAMP(1953)+( + $ -1.000000000000000D+00)*TMP_JAMP(2025)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(2121)+(-1.000000000000000D+00) + $ *AMP(1445)+TMP_JAMP(2234)+TMP_JAMP(2634)+(-1.000000000000000D + $ +00)*TMP_JAMP(2671)+TMP_JAMP(2689)+TMP_JAMP(2727)+TMP_JAMP(2866) + $ +TMP_JAMP(3012) JAMP(98,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(91)+(-1.000000000000000D+00)*AMP(1439)+TMP_JAMP(703)+( - $ -1.000000000000000D+00)*TMP_JAMP(881)+TMP_JAMP(887) - $ +TMP_JAMP(937)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(948)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1127)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1651)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1662)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1666)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2501)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2504)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2533)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2623)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2718)+TMP_JAMP(2766) - JAMP(99,1) = 
((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(933)+(-1.000000000000000D+00)*AMP(1444)+AMP(1754)+AMP(1755) - $ +(-1.000000000000000D+00)*TMP_JAMP(708)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1654)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1657)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(1663)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(1718)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2507)+(-1.000000000000000D - $ +00)*TMP_JAMP(2514)+((0.000000000000000D+00,1.000000000000000D - $ +00))*TMP_JAMP(2722)+TMP_JAMP(2735)+(-1.000000000000000D+00) - $ *TMP_JAMP(2765) - JAMP(100,1) = (-1.000000000000000D+00)*AMP(1036)+AMP(1663)+( - $ -1.000000000000000D+00)*TMP_JAMP(796)+(-1.000000000000000D+00) - $ *TMP_JAMP(985)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(990)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1659)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1661)+TMP_JAMP(1669)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1671)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(1802)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*TMP_JAMP(1806)+( - $ -1.000000000000000D+00)*TMP_JAMP(2511)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2512)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(2587)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*TMP_JAMP(2721) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2722) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2726) - $ +TMP_JAMP(2736) - JAMP(101,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(761)+(-1.000000000000000D+00)*AMP(1438)+AMP(1808)+AMP(1830) - $ +(-1.000000000000000D+00)*TMP_JAMP(720)+TMP_JAMP(722) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1665) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1667) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1766) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1903) - $ +(-1.000000000000000D+00)*TMP_JAMP(2515)+TMP_JAMP(2526)+( - $ -1.000000000000000D+00)*TMP_JAMP(2723)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2725)+TMP_JAMP(2728)+( - $ -1.000000000000000D+00)*TMP_JAMP(2766) - JAMP(102,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(668)+(-1.000000000000000D+00)*AMP(1035)+( - $ -1.000000000000000D+00)*AMP(1664)+TMP_JAMP(732)+( - $ -1.000000000000000D+00)*TMP_JAMP(735)+(-1.000000000000000D+00) - $ *TMP_JAMP(916)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(922)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1670)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1672)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1676)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1775)+(-1.000000000000000D+00)*TMP_JAMP(2521)+( - $ -1.000000000000000D+00)*TMP_JAMP(2525)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2527)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(2529)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*TMP_JAMP(2725) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2726) - $ +(-1.000000000000000D+00)*TMP_JAMP(2740) - JAMP(103,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(1298)+(-1.000000000000000D+00)*AMP(1448) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(1454)+( - $ 
-1.000000000000000D+00)*TMP_JAMP(769)+(-1.000000000000000D+00) - $ *TMP_JAMP(797)+(-1.000000000000000D+00)*TMP_JAMP(798)+( - $ -1.000000000000000D+00)*TMP_JAMP(978)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1137)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(1683)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(1693)+(-1.000000000000000D - $ +00)*TMP_JAMP(1902)+((0.000000000000000D+00,-1.000000000000000D - $ +00))*TMP_JAMP(2531)+TMP_JAMP(2532)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2534)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2539)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2727)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2729) - JAMP(104,1) = (-1.000000000000000D+00)*AMP(1442)+( - $ -1.000000000000000D+00)*TMP_JAMP(755)+TMP_JAMP(756)+( - $ -1.000000000000000D+00)*TMP_JAMP(758)+TMP_JAMP(981) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(999) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1161) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1687) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1703) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1748) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2533) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2535) - $ +(-1.000000000000000D+00)*TMP_JAMP(2537)+TMP_JAMP(2538)+( - $ -1.000000000000000D+00)*TMP_JAMP(2540)+TMP_JAMP(2552) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2605) - $ +(-1.000000000000000D+00)*TMP_JAMP(2728) - JAMP(105,1) = (-1.000000000000000D+00)*AMP(1446)+AMP(1862) - $ +TMP_JAMP(770)+(-1.000000000000000D+00)*TMP_JAMP(909) - $ +TMP_JAMP(986)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1692)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1694)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1699)+(-1.000000000000000D+00)*TMP_JAMP(2542)+( - $ -1.000000000000000D+00)*TMP_JAMP(2543)+TMP_JAMP(2544)+( - $ -1.000000000000000D+00)*TMP_JAMP(2551)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2729)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2731)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(2738) - JAMP(106,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(351)+(-1.000000000000000D+00)*AMP(413)+AMP(451)+AMP(693)+( - $ -1.000000000000000D+00)*TMP_JAMP(917)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(927)+TMP_JAMP(1005) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1020) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1696) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1698) - $ +TMP_JAMP(1704)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1823)+(-1.000000000000000D+00)*TMP_JAMP(2547) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2548) - $ +TMP_JAMP(2564)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2583)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2730)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2731)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2733) + $ *TMP_JAMP(1401)+TMP_JAMP(1952)+(-1.000000000000000D+00) + $ *TMP_JAMP(2022)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(2118)+(-1.000000000000000D+00)*AMP(1439) + $ +TMP_JAMP(2390)+(-1.000000000000000D+00)*TMP_JAMP(2408) + $ 
+TMP_JAMP(2456)+(-1.000000000000000D+00)*TMP_JAMP(2689) + $ +TMP_JAMP(2841)+TMP_JAMP(2908)+(-1.000000000000000D+00) + $ *TMP_JAMP(3006) + JAMP(99,1) = TMP_JAMP(821)+(-1.000000000000000D+00) + $ *TMP_JAMP(1018)+TMP_JAMP(1376)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1378)+TMP_JAMP(1913) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2121) + $ +TMP_JAMP(2124)+(-1.000000000000000D+00)*AMP(1444)+( + $ -1.000000000000000D+00)*TMP_JAMP(2490)+(-1.000000000000000D+00) + $ *TMP_JAMP(2638)+TMP_JAMP(2765)+(-1.000000000000000D+00) + $ *TMP_JAMP(2843)+TMP_JAMP(2901) + JAMP(100,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(746)+(-1.000000000000000D+00)*TMP_JAMP(1278)+( + $ -1.000000000000000D+00)*AMP(1036)+(-1.000000000000000D+00) + $ *TMP_JAMP(1913)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(2012)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(2119)+(-1.000000000000000D+00)*TMP_JAMP(2499) + $ +TMP_JAMP(2592)+TMP_JAMP(2607)+TMP_JAMP(2669)+( + $ -1.000000000000000D+00)*TMP_JAMP(2690)+TMP_JAMP(2776)+( + $ -1.000000000000000D+00)*TMP_JAMP(2952)+TMP_JAMP(3020) + JAMP(101,1) = TMP_JAMP(1910)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(2118)+TMP_JAMP(2124)+( + $ -1.000000000000000D+00)*AMP(1438)+AMP(1809)+TMP_JAMP(2342)+( + $ -1.000000000000000D+00)*TMP_JAMP(2549)+(-1.000000000000000D+00) + $ *TMP_JAMP(2842)+(-1.000000000000000D+00)*TMP_JAMP(2867) + $ +TMP_JAMP(2984)+TMP_JAMP(3014) + JAMP(102,1) = (-1.000000000000000D+00)*TMP_JAMP(1030) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1404) + $ +(-1.000000000000000D+00)*AMP(1035)+(-1.000000000000000D+00) + $ *TMP_JAMP(1809)+(-1.000000000000000D+00)*TMP_JAMP(1910) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2018) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2119) + $ +(-1.000000000000000D+00)*TMP_JAMP(2560)+(-1.000000000000000D + $ +00)*TMP_JAMP(2606)+TMP_JAMP(2665)+TMP_JAMP(2691)+TMP_JAMP(2822) + $ +TMP_JAMP(2868)+TMP_JAMP(2989) + JAMP(103,1) = ((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1252)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1388)+(-1.000000000000000D+00)*TMP_JAMP(2125)+( + $ -1.000000000000000D+00)*AMP(1448)+TMP_JAMP(2430)+( + $ -1.000000000000000D+00)*TMP_JAMP(2447)+(-1.000000000000000D+00) + $ *TMP_JAMP(2478)+(-1.000000000000000D+00)*TMP_JAMP(2633) + $ +TMP_JAMP(2664)+(-1.000000000000000D+00)*TMP_JAMP(2848) + $ +TMP_JAMP(2930)+(-1.000000000000000D+00)*TMP_JAMP(3012) + JAMP(104,1) = (-1.000000000000000D+00)*TMP_JAMP(777)+( + $ -1.000000000000000D+00)*TMP_JAMP(798)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(845)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(962)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(1228)+TMP_JAMP(2126)+( + $ -1.000000000000000D+00)*AMP(1442)+(-1.000000000000000D+00) + $ *TMP_JAMP(2440)+(-1.000000000000000D+00)*TMP_JAMP(2457)+( + $ -1.000000000000000D+00)*TMP_JAMP(2580)+TMP_JAMP(2739)+( + $ -1.000000000000000D+00)*TMP_JAMP(2830)+(-1.000000000000000D+00) + $ *TMP_JAMP(2930)+(-1.000000000000000D+00)*TMP_JAMP(2993) + $ +TMP_JAMP(3006) + JAMP(105,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(989)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1388)+(-1.000000000000000D+00)*TMP_JAMP(1670) + $ +TMP_JAMP(2088)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(2117)+(-1.000000000000000D+00)*AMP(1446) + $ 
+TMP_JAMP(2901)+(-1.000000000000000D+00)*TMP_JAMP(2937)+( + $ -1.000000000000000D+00)*TMP_JAMP(2944)+(-1.000000000000000D+00) + $ *TMP_JAMP(3026) + JAMP(106,1) = (-1.000000000000000D+00)*AMP(413) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(292) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(304) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(333) + $ +(-1.000000000000000D+00)*TMP_JAMP(432)+TMP_JAMP(442) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1599) + $ +TMP_JAMP(2670)+(-1.000000000000000D+00)*TMP_JAMP(2693) + $ +TMP_JAMP(2740)+(-1.000000000000000D+00)*TMP_JAMP(2921) + $ +TMP_JAMP(2944)+TMP_JAMP(2970)+(-1.000000000000000D+00) + $ *TMP_JAMP(2993) JAMP(107,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(762)+(-1.000000000000000D+00)*AMP(1440)+AMP(1889)+( - $ -1.000000000000000D+00)*TMP_JAMP(783)+(-1.000000000000000D+00) - $ *TMP_JAMP(784)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1134)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1153)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1701)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1702)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2549)+(-1.000000000000000D+00)*TMP_JAMP(2550) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2553) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2556) - $ +TMP_JAMP(2562)+(-1.000000000000000D+00)*TMP_JAMP(2732) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2768) - JAMP(108,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(369)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(378)+(-1.000000000000000D+00)*AMP(411) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(417) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(594) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(1669) - $ +TMP_JAMP(799)+(-1.000000000000000D+00)*TMP_JAMP(802) - $ +TMP_JAMP(960)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1707)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1709)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1711)+TMP_JAMP(1714)+(-1.000000000000000D+00) - $ *TMP_JAMP(1835)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2556)+(-1.000000000000000D+00)*TMP_JAMP(2557)+( - $ -1.000000000000000D+00)*TMP_JAMP(2561)+(-1.000000000000000D+00) - $ *TMP_JAMP(2563)+TMP_JAMP(2597)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2733) - JAMP(109,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(95)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(1199)+(-1.000000000000000D+00)*AMP(1449)+AMP(1567)+( - $ -1.000000000000000D+00)*TMP_JAMP(813)+(-1.000000000000000D+00) - $ *TMP_JAMP(814)+(-1.000000000000000D+00)*TMP_JAMP(885) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1717) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1720) - $ +TMP_JAMP(1722)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1727)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1754)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1974)+(-1.000000000000000D+00)*TMP_JAMP(2734)+( - $ -1.000000000000000D+00)*TMP_JAMP(2735)+(-1.000000000000000D+00) - $ *TMP_JAMP(2767) - JAMP(110,1) = AMP(527)+(-1.000000000000000D+00)*AMP(670)+( - $ -1.000000000000000D+00)*AMP(1038)+((0.000000000000000D+00, - $ 
-1.000000000000000D+00))*AMP(1199)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*AMP(1592)+TMP_JAMP(821)+( - $ -1.000000000000000D+00)*TMP_JAMP(823)+TMP_JAMP(826) - $ +TMP_JAMP(873)+TMP_JAMP(915)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1724)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(1726)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*TMP_JAMP(1975)+TMP_JAMP(2570) - $ +TMP_JAMP(2572)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2574)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2584)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2613)+(-1.000000000000000D+00)*TMP_JAMP(2736)+( - $ -1.000000000000000D+00)*TMP_JAMP(2739) - JAMP(111,1) = (-1.000000000000000D+00)*AMP(687) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(754)+( - $ -1.000000000000000D+00)*AMP(1447)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*AMP(1469)+AMP(1568)+TMP_JAMP(836)+( - $ -1.000000000000000D+00)*TMP_JAMP(839)+TMP_JAMP(840) - $ +TMP_JAMP(869)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1733)+TMP_JAMP(1738)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(1743)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1749)+TMP_JAMP(1977) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1978) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2580) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2738) - $ +TMP_JAMP(2767) - JAMP(112,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(367)+(-1.000000000000000D+00)*AMP(414) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(754) - $ +AMP(786)+(-1.000000000000000D+00)*TMP_JAMP(852)+( - $ -1.000000000000000D+00)*TMP_JAMP(857)+(-1.000000000000000D+00) - $ *TMP_JAMP(884)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1740)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1742)+TMP_JAMP(1751)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(1759)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(1793)+(-1.000000000000000D - $ +00)*TMP_JAMP(1799)+((0.000000000000000D+00,1.000000000000000D - $ +00))*TMP_JAMP(1980)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2582)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2584)+TMP_JAMP(2585) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2587) - $ +TMP_JAMP(2588)+(-1.000000000000000D+00)*TMP_JAMP(2589) - $ +TMP_JAMP(2590) + $ *TMP_JAMP(844)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(978)+TMP_JAMP(1023)+TMP_JAMP(1034) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1228) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1253) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1304) + $ +(-1.000000000000000D+00)*TMP_JAMP(1914)+TMP_JAMP(2089) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2137) + $ +(-1.000000000000000D+00)*AMP(1440)+TMP_JAMP(2576) + $ +TMP_JAMP(2828)+(-1.000000000000000D+00)*TMP_JAMP(2939)+( + $ -1.000000000000000D+00)*TMP_JAMP(3026) + JAMP(108,1) = (-1.000000000000000D+00)*AMP(411) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(293) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(301) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(334) + $ +(-1.000000000000000D+00)*TMP_JAMP(437)+TMP_JAMP(440) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(594)+( + $ 
-1.000000000000000D+00)*TMP_JAMP(781)+(-1.000000000000000D+00) + $ *TMP_JAMP(817)+TMP_JAMP(846)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(977)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(980)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(1252)+TMP_JAMP(1591) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1593) + $ +TMP_JAMP(2099)+(-1.000000000000000D+00)*TMP_JAMP(2350)+( + $ -1.000000000000000D+00)*TMP_JAMP(2600)+(-1.000000000000000D+00) + $ *TMP_JAMP(2669)+TMP_JAMP(2753)+(-1.000000000000000D+00) + $ *TMP_JAMP(2870)+(-1.000000000000000D+00)*TMP_JAMP(2970) + JAMP(109,1) = (-1.000000000000000D+00)*TMP_JAMP(1036) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1378) + $ +(-1.000000000000000D+00)*TMP_JAMP(1884)+(-1.000000000000000D + $ +00)*TMP_JAMP(2039)+((0.000000000000000D+00,-1.000000000000000D + $ +00))*TMP_JAMP(2068)+(-1.000000000000000D+00)*AMP(1449)+( + $ -1.000000000000000D+00)*TMP_JAMP(2357)+TMP_JAMP(2523)+( + $ -1.000000000000000D+00)*TMP_JAMP(2573)+TMP_JAMP(2678)+( + $ -1.000000000000000D+00)*TMP_JAMP(2766)+TMP_JAMP(2775)+( + $ -1.000000000000000D+00)*TMP_JAMP(3027) + JAMP(110,1) = ((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(990)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1248)+TMP_JAMP(1277)+(-1.000000000000000D+00) + $ *AMP(1038)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1852)+TMP_JAMP(1884)+TMP_JAMP(2040) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2116) + $ +(-1.000000000000000D+00)*TMP_JAMP(2338)+(-1.000000000000000D + $ +00)*TMP_JAMP(2795)+(-1.000000000000000D+00)*TMP_JAMP(2900) + $ +TMP_JAMP(3015)+(-1.000000000000000D+00)*TMP_JAMP(3020) + JAMP(111,1) = TMP_JAMP(1516)+(-1.000000000000000D+00) + $ *TMP_JAMP(1932)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(2117)+(-1.000000000000000D+00)*AMP(1447)+( + $ -1.000000000000000D+00)*TMP_JAMP(2371)+TMP_JAMP(2519) + $ +TMP_JAMP(2572)+(-1.000000000000000D+00)*TMP_JAMP(2679) + $ +TMP_JAMP(2695)+TMP_JAMP(2787)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(2973)+TMP_JAMP(3027) + JAMP(112,1) = (-1.000000000000000D+00)*AMP(414)+( + $ -1.000000000000000D+00)*TMP_JAMP(189)+((0.000000000000000D+00 + $ ,1.000000000000000D+00))*TMP_JAMP(299)+TMP_JAMP(433) + $ +TMP_JAMP(435)+(-1.000000000000000D+00)*TMP_JAMP(439) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(508) + $ +(-1.000000000000000D+00)*TMP_JAMP(1948)+((0.000000000000000D + $ +00,1.000000000000000D+00))*TMP_JAMP(2069)+(-1.000000000000000D + $ +00)*TMP_JAMP(2565)+TMP_JAMP(2788)+(-1.000000000000000D+00) + $ *TMP_JAMP(2795)+TMP_JAMP(2918)+((0.000000000000000D+00, + $ -1.000000000000000D+00))*TMP_JAMP(2973) JAMP(113,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(82)+(-1.000000000000000D+00)*AMP(1033) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(1039) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(1051) - $ +TMP_JAMP(866)+(-1.000000000000000D+00)*TMP_JAMP(867) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1135) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1748) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1750) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1807) - $ +(-1.000000000000000D+00)*TMP_JAMP(1817)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2591)+(-1.000000000000000D - $ +00)*TMP_JAMP(2594)+((0.000000000000000D+00,1.000000000000000D - $ 
+00))*TMP_JAMP(2596)+TMP_JAMP(2739)+(-1.000000000000000D+00) - $ *TMP_JAMP(2740)+TMP_JAMP(2741) + $ *TMP_JAMP(78)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(321)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(739)+(-1.000000000000000D+00)*TMP_JAMP(1272)+( + $ -1.000000000000000D+00)*AMP(1033)+(-1.000000000000000D+00) + $ *TMP_JAMP(1810)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(2091)+TMP_JAMP(2803)+(-1.000000000000000D+00) + $ *TMP_JAMP(2933)+TMP_JAMP(2991)+(-1.000000000000000D+00) + $ *TMP_JAMP(3015) JAMP(114,1) = (-1.000000000000000D+00)*AMP(409) $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(415) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(427)+( - $ -1.000000000000000D+00)*AMP(451)+(-1.000000000000000D+00) - $ *AMP(1569)+(-1.000000000000000D+00)*TMP_JAMP(880)+TMP_JAMP(881) - $ +TMP_JAMP(882)+(-1.000000000000000D+00)*TMP_JAMP(885) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1753) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1758) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1760) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1828) - $ +TMP_JAMP(2597)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2598)+(-1.000000000000000D+00)*TMP_JAMP(2599) - $ +TMP_JAMP(2601)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2603)+(-1.000000000000000D+00)*TMP_JAMP(2741) - JAMP(115,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(92)+(-1.000000000000000D+00)*AMP(1443) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(1463) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(1537) - $ +AMP(1574)+(-1.000000000000000D+00)*AMP(1809)+( - $ -1.000000000000000D+00)*TMP_JAMP(897)+(-1.000000000000000D+00) - $ *TMP_JAMP(898)+(-1.000000000000000D+00)*TMP_JAMP(899)+( - $ -1.000000000000000D+00)*TMP_JAMP(900)+TMP_JAMP(1003)+( - $ -1.000000000000000D+00)*TMP_JAMP(1009)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(1014)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1153)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(1764)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*TMP_JAMP(1765) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1768) - $ +TMP_JAMP(1771)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2604)+(-1.000000000000000D+00)*TMP_JAMP(2606) - $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2607) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2611) - $ +TMP_JAMP(2617)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2620) - JAMP(116,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(508)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(668)+(-1.000000000000000D+00)*AMP(1037) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(1121)+( - $ -1.000000000000000D+00)*AMP(1514)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*AMP(1776)+(-1.000000000000000D+00) - $ *TMP_JAMP(908)+TMP_JAMP(910)+(-1.000000000000000D+00) - $ *TMP_JAMP(911)+TMP_JAMP(912)+TMP_JAMP(918)+TMP_JAMP(982) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2607) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2608) - $ +(-1.000000000000000D+00)*TMP_JAMP(2609)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2610)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(2612)+((0.000000000000000D - $ 
+00,1.000000000000000D+00))*TMP_JAMP(2613)+((0.000000000000000D - $ +00,-1.000000000000000D+00))*TMP_JAMP(2615) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2629) - JAMP(117,1) = (-1.000000000000000D+00)*AMP(528) - $ +((0.000000000000000D+00,1.000000000000000D+00))*AMP(595)+( - $ -1.000000000000000D+00)*AMP(1441)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*AMP(1538)+TMP_JAMP(932)+( - $ -1.000000000000000D+00)*TMP_JAMP(933)+TMP_JAMP(934) - $ +TMP_JAMP(936)+TMP_JAMP(956)+(-1.000000000000000D+00) - $ *TMP_JAMP(983)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1136)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1797)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1985)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2616)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2619)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2623)+(-1.000000000000000D+00)*TMP_JAMP(2624)+( - $ -1.000000000000000D+00)*TMP_JAMP(2634)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(2743)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(2768) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(83) + $ +(-1.000000000000000D+00)*TMP_JAMP(190)+((0.000000000000000D+00 + $ ,-1.000000000000000D+00))*TMP_JAMP(236)+((0.000000000000000D+00 + $ ,-1.000000000000000D+00))*TMP_JAMP(311)+TMP_JAMP(439) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(467) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(526) + $ +TMP_JAMP(1138)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(1593)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(1852)+TMP_JAMP(2052)+(-1.000000000000000D+00) + $ *TMP_JAMP(2390)+(-1.000000000000000D+00)*TMP_JAMP(2593)+( + $ -1.000000000000000D+00)*TMP_JAMP(2804)+(-1.000000000000000D+00) + $ *TMP_JAMP(2915)+(-1.000000000000000D+00)*TMP_JAMP(2991) + JAMP(115,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(589)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *TMP_JAMP(2122)+(-1.000000000000000D+00)*AMP(1443)+( + $ -1.000000000000000D+00)*TMP_JAMP(2373)+TMP_JAMP(2550)+( + $ -1.000000000000000D+00)*TMP_JAMP(2574)+(-1.000000000000000D+00) + $ *TMP_JAMP(2582)+(-1.000000000000000D+00)*TMP_JAMP(2626) + $ +TMP_JAMP(2629)+TMP_JAMP(2941)+(-1.000000000000000D+00) + $ *TMP_JAMP(3014) + JAMP(116,1) = TMP_JAMP(1279)+(-1.000000000000000D+00)*AMP(1037) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2122) + $ +TMP_JAMP(2143)+AMP(1665)+(-1.000000000000000D+00) + $ *TMP_JAMP(2371)+(-1.000000000000000D+00)*TMP_JAMP(2619)+( + $ -1.000000000000000D+00)*TMP_JAMP(2823)+TMP_JAMP(2853)+( + $ -1.000000000000000D+00)*TMP_JAMP(2989)+(-1.000000000000000D+00) + $ *TMP_JAMP(3019) + JAMP(117,1) = ((0.000000000000000D+00,1.000000000000000D+00)) + $ *TMP_JAMP(589)+(-1.000000000000000D+00)*TMP_JAMP(1658) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2137) + $ +(-1.000000000000000D+00)*AMP(1441)+AMP(1515)+TMP_JAMP(2596) + $ +TMP_JAMP(2624)+TMP_JAMP(2633)+TMP_JAMP(2884)+TMP_JAMP(2908)+( + $ -1.000000000000000D+00)*TMP_JAMP(2928)+TMP_JAMP(2959) JAMP(118,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *AMP(349)+(-1.000000000000000D+00)*AMP(412) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(595) - $ +AMP(627)+(-1.000000000000000D+00)*TMP_JAMP(952)+( - $ -1.000000000000000D+00)*TMP_JAMP(953)+TMP_JAMP(954) - $ +TMP_JAMP(955)+(-1.000000000000000D+00)*TMP_JAMP(957) - $ 
+TMP_JAMP(962)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1794)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2625)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2626)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2627)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2628)+TMP_JAMP(2630)+TMP_JAMP(2631) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2632) - JAMP(119,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *AMP(508)+(-1.000000000000000D+00)*AMP(1034) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(1039)+( - $ -1.000000000000000D+00)*AMP(1071)+TMP_JAMP(975)+( - $ -1.000000000000000D+00)*TMP_JAMP(976)+(-1.000000000000000D+00) - $ *TMP_JAMP(977)+TMP_JAMP(978)+(-1.000000000000000D+00) - $ *TMP_JAMP(980)+(-1.000000000000000D+00)*TMP_JAMP(984) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1138) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1805) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1808) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1810) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1811) - $ +TMP_JAMP(1813)+TMP_JAMP(1818)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2633)+(-1.000000000000000D+00) - $ *TMP_JAMP(2634)+TMP_JAMP(2635)+(-1.000000000000000D+00) - $ *TMP_JAMP(2636)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2744) - JAMP(120,1) = AMP(98)+(-1.000000000000000D+00)*AMP(410) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(436)+( - $ -1.000000000000000D+00)*TMP_JAMP(1000)+TMP_JAMP(1001) - $ +TMP_JAMP(1002)+TMP_JAMP(1004)+(-1.000000000000000D+00) - $ *TMP_JAMP(1007)+TMP_JAMP(1008)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*TMP_JAMP(1139)+((0.000000000000000D+00 - $ ,-1.000000000000000D+00))*TMP_JAMP(1140)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(1819)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(1822)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(1825)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(1826)+((0.000000000000000D - $ +00,1.000000000000000D+00))*TMP_JAMP(1829)+TMP_JAMP(1832) - $ +TMP_JAMP(1834)+TMP_JAMP(1835)+((0.000000000000000D+00, - $ -1.000000000000000D+00))*TMP_JAMP(2637)+TMP_JAMP(2638) - $ +TMP_JAMP(2639)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2744) + $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(308) + $ +TMP_JAMP(438)+(-1.000000000000000D+00)*TMP_JAMP(443) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(506) + $ +(-1.000000000000000D+00)*TMP_JAMP(532)+(-1.000000000000000D+00) + $ *TMP_JAMP(1667)+(-1.000000000000000D+00)*TMP_JAMP(1681)+( + $ -1.000000000000000D+00)*TMP_JAMP(1724)+TMP_JAMP(1834)+( + $ -1.000000000000000D+00)*TMP_JAMP(1947)+(-1.000000000000000D+00) + $ *TMP_JAMP(2628)+TMP_JAMP(2844)+(-1.000000000000000D+00) + $ *TMP_JAMP(2858)+TMP_JAMP(2918)+(-1.000000000000000D+00) + $ *TMP_JAMP(2959) + JAMP(119,1) = (-1.000000000000000D+00)*TMP_JAMP(1041)+( + $ -1.000000000000000D+00)*AMP(1034)+TMP_JAMP(1608) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2148) + $ +(-1.000000000000000D+00)*TMP_JAMP(2614)+TMP_JAMP(2635) + $ +TMP_JAMP(2933)+TMP_JAMP(2992)+TMP_JAMP(3019) + JAMP(120,1) = (-1.000000000000000D+00)*AMP(410) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(415) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(436) + $ 
+((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(244)
+     $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(309)
+     $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(336)
+     $ +TMP_JAMP(531)+(-1.000000000000000D+00)*TMP_JAMP(1418)+(
+     $ -1.000000000000000D+00)*TMP_JAMP(1673)+TMP_JAMP(1724)
+     $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1797)
+     $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(1458)
+     $ +TMP_JAMP(2619)+(-1.000000000000000D+00)*TMP_JAMP(2634)
+     $ +TMP_JAMP(2670)+(-1.000000000000000D+00)*TMP_JAMP(2916)+(
+     $ -1.000000000000000D+00)*TMP_JAMP(2992)

       IF(INIT_MODE)THEN
         DO I=1, NGRAPHS
@@ -19616,7 +19776,6 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
           ENDDO
         ENDDO

-      call counters_matrix1_stop()
       END

       SUBROUTINE PRINT_ZERO_AMP_1()
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
index 59a2c906eb..f2cfa349da 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
@@ -4,10 +4,13 @@
 # Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.

 #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
-#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories
+#=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts
+#=== NB: use 'override' to ensure that the value cannot be modified from the outside
+override CUDACPP_MAKEFILE := $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST))
+###$(info CUDACPP_MAKEFILE='$(CUDACPP_MAKEFILE)')

-CUDACPP_MAKEFILE = $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST))
-CUDACPP_SRC_MAKEFILE = cudacpp_src.mk
+#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories
+override CUDACPP_SRC_MAKEFILE = cudacpp_src.mk

 #-------------------------------------------------------------------------------

@@ -29,7 +32,17 @@ UNAME_P := $(shell uname -p)

 #-------------------------------------------------------------------------------

-#=== Configure common compiler flags for C++ and CUDA
+#=== Include the common MG5aMC Makefile options
+
+# OM: this is crucial for MG5aMC flag consistency/documentation
+# AV: temporarily comment this out because it breaks cudacpp builds
+ifneq ($(wildcard ../../Source/make_opts),)
+include ../../Source/make_opts
+endif
+
+#-------------------------------------------------------------------------------
+
+#=== Configure common compiler flags for C++ and CUDA/HIP

 INCFLAGS = -I.
OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here @@ -101,68 +114,85 @@ endif # Note: AR, CXX and FC are implicitly defined if not set externally # See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html -#------------------------------------------------------------------------------- - -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +CXXFLAGS += -mmacosx-version-min=11.3 +endif - # If CUDA_HOME is not set, try to set it from the location of nvcc - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif +#------------------------------------------------------------------------------- - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo
- GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
- ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
- ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
- GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h
- # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
- ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
- ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
- ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
- ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
- CUBUILDRULEFLAGS = -Xcompiler -fPIC -c
- CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu
- CUDATESTFLAGS = -lcuda
- else ifneq ($(origin REQUIRE_CUDA),undefined)
- # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443)
- $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH))
+#=== Configure the GPU compiler (CUDA or HIP)
+
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME.
+# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc.
+# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths.
+# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505)
+# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below
+ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
+ $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)")
+ override CUDA_HOME=disabled
+ override HIP_HOME=disabled
+endif
+
+# If CUDA_HOME is not set, try to set it from the path to nvcc
+ifndef CUDA_HOME
+ CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
+ $(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
+endif
+
+# If HIP_HOME is not set, try to set it from the path to hipcc
+ifndef HIP_HOME
+ HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null))
+ $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+endif
+
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP),
+# builds are performed for HIP only if CUDA is not found in the path.
+# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME.
+# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+#--- Option 1: CUDA exists -> use CUDA
+
+# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists
+ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
+
+ GPUCC = $(CUDA_HOME)/bin/nvcc
+ USE_NVTX ?=-DUSE_NVTX
+ # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
+ # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+ # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster).
+ # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + ifeq ($(RNDGEN),hasNoCurand) + CURANDLIBFLAGS= else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! endif + CUOPTFLAGS = -lineinfo + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -173,71 +203,55 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) GPUFLAGS += -allow-unsupported-compiler endif -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler +else ifneq ($(origin REQUIRE_CUDA),undefined) - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning HIP builds are not supported for multi-word CXX "$(CXX)") - override HIP_HOME=disabled - endif + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif +#--- Option 2: CUDA does not exist, HIP exists -> use HIP - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + +else + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) export GPUCC export GPUFLAGS #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -254,7 +268,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -270,7 +284,7 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) GPUFLAGS+= -Xcompiler -mno-float128 endif @@ -285,12 +299,14 @@ override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) -else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) -override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) # AV for Mac (Apple clang compiler) +else ifeq ($(UNAME_S),Darwin) # OM for Mac (any compiler) +override OMPFLAGS = # AV disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = -fopenmp # OM reenable OpenMP MT on Apple clang? 
(AV Oct 2023: this still fails in the CI) else -override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT (default before #575) +override OMPFLAGS = -fopenmp # enable OpenMP MT by default on all other platforms +###override OMPFLAGS = # disable OpenMP MT on all other platforms (default before #575) endif # Set the default AVX (vectorization) choice @@ -356,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -573,8 +589,9 @@ $(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) # Apply special build flags only to check_sa and CurandRandomNumberKernel (curand headers, #679) $(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gcheck_sa.o: CUFLAGS += $(CXXFLAGSCURAND) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gCurandRandomNumberKernel.o: CUFLAGS += $(CXXFLAGSCURAND) ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif @@ -772,12 +789,18 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif +# Use target gtestlibs to build only googletest +ifneq ($(GTESTLIBS),) +gtestlibs: $(GTESTLIBS) +endif + # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 $(GTESTLIBS): ifneq ($(shell which flock 2>/dev/null),) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi flock $(BUILDDIR)/.make_test.lock $(MAKE) -C $(TESTDIR) else - $(MAKE) -C $(TESTDIR) + if [ -d $(TESTDIR) ]; then $(MAKE) -C $(TESTDIR); fi endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/dummy_fct.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/dummy_fct.f index 076cf29d67..4f7a204b8f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/dummy_fct.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/dummy_fct.f @@ -32,7 +32,7 @@ logical FUNCTION dummy_cuts(P) LOGICAL IS_A_NU(NEXTERNAL),IS_HEAVY(NEXTERNAL) logical do_cuts(nexternal) COMMON /TO_SPECISA/IS_A_J,IS_A_A,IS_A_L,IS_A_B,IS_A_NU,IS_HEAVY, - . IS_A_ONIUM, do_cuts + & IS_A_ONIUM, do_cuts dummy_cuts=.true. 
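A note on the flock-guarded $(GTESTLIBS) rule above: under 'make -j', several targets may depend on the googletest libraries and would otherwise run the googletest sub-make concurrently; the exclusive lock on $(BUILDDIR)/.make_test.lock serializes them (Linux only, as the comment says). Below is a minimal Python sketch of the same serialization idea; the build_gtest helper and its paths are illustrative, not part of the makefile.

    import fcntl
    import os
    import subprocess

    def build_gtest(builddir, testdir):
        """Serialize a sub-build across concurrent callers with an exclusive
        file lock (same idea as 'flock $(BUILDDIR)/.make_test.lock $(MAKE) -C $(TESTDIR)')."""
        os.makedirs(builddir, exist_ok=True)  # same purpose as the added 'mkdir -p $(BUILDDIR)' line
        with open(os.path.join(builddir, ".make_test.lock"), "w") as lock:
            fcntl.flock(lock, fcntl.LOCK_EX)  # blocks until no other process holds the lock
            try:
                subprocess.run(["make", "-C", testdir], check=True)
            finally:
                fcntl.flock(lock, fcntl.LOCK_UN)  # also released implicitly when the file closes
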
@@ -118,15 +118,16 @@ double precision function user_dynamical_scale(P) C ************************************************************ -C default for the library implementing a dummt bias function +C default for the library implementing a dummy bias function C ************************************************************ subroutine bias_wgt_custom(p, original_weight, bias_weight) - implicit none + implicit none C C Parameters C include 'nexternal.inc' -C + +C C Arguments C double precision p(0:3, nexternal) @@ -161,3 +162,4 @@ subroutine bias_wgt_custom(p, original_weight, bias_weight) return end subroutine bias_wgt_custom + diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc index 2b956730d4..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc @@ -49,11 +49,7 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL GpuRuntime::setUp(); #endif - // Create a process object, read parm card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? - CPPProcess process( /*verbose=*/false ); - process.initProc( "../../Cards/param_card.dat" ); + // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran *ppbridge = new Bridge( *pnevtF, *pnparF, *pnp4F ); } diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile index 74db44d848..d572486c2e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile @@ -9,6 +9,12 @@ FFLAGS+= -cpp # Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) CXXFLAGS = -O3 -Wall -Wshadow -Wextra +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) override CXX:=ccache $(CXX) @@ -51,7 +57,7 @@ CUDACPP_MAKEFILE=cudacpp.mk CUDACPP_MAKEENV:=$(shell echo '$(.VARIABLES)' | tr " " "\n" | egrep "(USEBUILDDIR|AVX|FPTYPE|HELINL|HRDCOD)") ###$(info CUDACPP_MAKEENV=$(CUDACPP_MAKEENV)) ###$(info $(foreach v,$(CUDACPP_MAKEENV),$(v)="$($(v))")) -CUDACPP_BUILDDIR:=$(shell $(MAKE) $(foreach v,$(CUDACPP_MAKEENV),$(v)="$($(v))") -f $(CUDACPP_MAKEFILE) -pn |& awk '/Building/{print 
$$3}' | sed s/BUILDDIR=//) +CUDACPP_BUILDDIR:=$(shell $(MAKE) $(foreach v,$(CUDACPP_MAKEENV),$(v)="$($(v))") -f $(CUDACPP_MAKEFILE) -pn 2>&1 | awk '/Building/{print $$3}' | sed s/BUILDDIR=//) ifeq ($(CUDACPP_BUILDDIR),) $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) else @@ -89,7 +95,12 @@ SYMMETRY = symmetry.o idenparts.o # Binaries -LDFLAGS+=-Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 +ifeq ($(UNAME),Darwin) +LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) +LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked" +else +LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS) +endif all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp # also builds $(PROG)_cuda if $(CUDACPP_CULIB) exists (#503) @@ -100,8 +111,8 @@ LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' else ifneq ($(shell $(CXX) --version | egrep '^clang'),) override OMPFLAGS = -fopenmp $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###override OMPFLAGS = -fopenmp # OMP is not supported yet by cudacpp for Apple clang +else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang else override OMPFLAGS = -fopenmp endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc index 0ed26180ca..de327f2321 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc @@ -71,6 +71,8 @@ struct CPUTest : public CUDA_CPU_TestBase , hstSelCol( nevt ) , hstIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? process.initProc( "../../Cards/param_card.dat" ); } @@ -183,6 +185,8 @@ struct CUDATest : public CUDA_CPU_TestBase , devSelCol( nevt ) , devIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? 
process.initProc( "../../Cards/param_card.dat" ); } diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc index 016bc0f472..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc @@ -59,7 +59,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) using namespace mg5amcCpu; #endif #ifndef __APPLE__ // test #701 (except on MacOS where feenableexcept is not defined #730) - const bool enableFPE = !getenv( "CUDACPP_RUNTIME_DISABLEFPE" ); + const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" ); + const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" ); if( enableFPE ) { feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/generate_events b/epochX/cudacpp/gg_ttggg.mad/bin/generate_events index 107313b25d..5577cc66a0 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/generate_events +++ b/epochX/cudacpp/gg_ttggg.mad/bin/generate_events @@ -46,7 +46,7 @@ if __debug__ and (not os.path.exists(pjoin(root_path,'../..', 'bin','create_rele sys.path.append(pjoin(root_path,'bin','internal')) import madevent_interface as ME - +import misc as misc import logging import logging.config @@ -160,17 +160,31 @@ if '__main__' == __name__: # Check that python version is valid set_configuration() - argument = sys.argv + argument = sys.argv + + # check for plugin customization of the launch command + launch_interface = ME.MadEventCmdShell + if os.path.exists(pjoin(root_path, 'bin','internal', 'launch_plugin.py')): + with misc.TMP_variable(sys, 'path', sys.path + [pjoin(root_path, 'bin', 'internal')]): + from importlib import reload + try: + reload('launch_plugin') + except Exception as error: + import launch_plugin + launch_interface = launch_plugin.MEINTERFACE + + + try: if '-h' in argument or '--help' in argument: - launch = ME.MadEventCmdShell(me_dir=root_path, force_run=True) + launch = launch_interface(me_dir=root_path, force_run=True) launch.exec_cmd('help generate_events') sys.exit() elif len(argument) > 1 and argument[1] in ['0', '1', '2']: argument = treat_old_argument(argument) with ME.MadEventCmdShell.RunWebHandling(root_path, ): - launch = ME.MadEventCmdShell(me_dir=root_path, force_run=True) + launch = launch_interface(me_dir=root_path, force_run=True) launch.run_cmd('generate_events %s' % ' '.join(argument[1:])) launch.run_cmd('quit') except ME.MadEventAlreadyRunning as message: diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/__init__.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py index c1e54d3cb9..bd1517985f 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py @@ -537,7 +537,7 @@ def charge_card(self, tag): self.param_card = param_card_reader.ParamCard(param_card) return self.param_card elif tag == 'mgruncard': - self.run_card = RunCard(self[tag]) + self.run_card = RunCard(self[tag], 
unknown_warning=False) return self.run_card elif tag == 'mg5proccard': proc_card = self[tag].split('\n') @@ -1002,14 +1002,18 @@ def __init__(self, finput=None, **opt): self.allowed_value = {} self.default_setup() + self.plugin_input(finput) # if input is define read that input if isinstance(finput, (file, str, StringIO.StringIO)): self.read(finput, **opt) + + def plugin_input(self, finput=None): + pass def default_setup(self): @@ -2621,7 +2625,28 @@ class RunCard(ConfigFile): default_include_file = 'run_card.inc' default_autodef_file = 'run.inc' donewarning = [] + include_as_parameter = [] + + def plugin_input(self, finput): + if not finput and not MADEVENT: + return + curr_dir = None + if isinstance(finput, file): + # expected path to be like "XXXX/Cards/run_card.dat" + curr_dir = os.path.dirname(os.path.dirname(finput.name)) + elif isinstance(finput, str): + curr_dir = os.path.dirname(os.path.dirname(finput)) + + if curr_dir: + if os.path.exists(pjoin(curr_dir, 'bin', 'internal', 'plugin_run_card')): + # expected format {} passing everything as optional argument + for line in open(pjoin(curr_dir, 'bin', 'internal', 'plugin_run_card')): + if line.startswith('#'): + continue + opts = dict(eval(line)) + self.add_param(**opts) + @classmethod def fill_post_set_from_blocks(cls): """set the post_set function for any parameter defined in a run_block""" @@ -2647,18 +2672,48 @@ def __new__(cls, finput=None, **opt): elif isinstance(finput, cls): target_class = finput.__class__ elif isinstance(finput, str): + path = finput if '\n' not in finput: finput = open(finput).read() if 'req_acc_FO' in finput: target_class = RunCardNLO else: target_class = RunCardLO + if MADEVENT and os.path.exists(pjoin(MEDIR, 'bin','internal', 'launch_plugin.py')): + with misc.TMP_variable(sys, 'path', sys.path + [pjoin(MEDIR, 'bin', 'internal')]): + from importlib import reload + try: + reload('launch_plugin') + except Exception as error: + import launch_plugin + target_class = launch_plugin.RunCard + elif not MADEVENT: + if 'run_card.dat' in path: + launch_plugin_path = path.replace('run_card.dat', '../bin/internal/launch_plugin.py') + elif 'run_card_default.dat' in path: + launch_plugin_path = path.replace('run_card_default.dat', '../bin/internal/launch_plugin.py') + else: + launch_plugin_path = None + if launch_plugin_path and os.path.exists(launch_plugin_path): + misc.sprint('try to use plugin class', path.replace('run_card.dat', '../bin/internal/launch_plugin.py')) + pydir = os.path.dirname(launch_plugin_path) + with misc.TMP_variable(sys, 'path', sys.path + [pydir]): + from importlib import reload + try: + reload('launch_plugin') + except Exception as error: + import launch_plugin + target_class = launch_plugin.RunCard + elif issubclass(finput, RunCard): + target_class = finput else: return None target_class.fill_post_set_from_blocks() - - return super(RunCard, cls).__new__(target_class, finput, **opt) + out = super(RunCard, cls).__new__(target_class, finput, **opt) + if not isinstance(out, RunCard): #should not happen, but possible if mismatched library versions are loaded.
+ out.__init__(finput, **opt) + return out else: return super(RunCard, cls).__new__(cls, finput, **opt) @@ -2686,7 +2741,7 @@ def __init__(self, *args, **opts): self.system_default = {} self.display_block = [] # set some block to be displayed - + self.fct_mod = {} # {param: (fct_pointer, *argument, **opts)} self.cut_class = {} self.warned=False @@ -2723,7 +2778,7 @@ def get_lepton_densities(cls): def add_param(self, name, value, fortran_name=None, include=True, hidden=False, legacy=False, cut=False, system=False, sys_default=None, - autodef=False, + autodef=False, fct_mod=None, **opts): """ add a parameter to the card. value is the default value and defines the type (int/float/bool/str) of the input. @@ -2737,6 +2792,7 @@ def add_param(self, name, value, fortran_name=None, include=True, If a path (Source/PDF/pdf.inc) the definition will be added within that file Default is False (does not add the definition) entry added in the run_card will automatically have this on True. + fct_mod: defines a function to run if the parameter is modified in the include file options of **opts: - allowed: list of valid options. '*' means anything else should be allowed. empty list means anything possible as well. @@ -2761,15 +2817,22 @@ def add_param(self, name, value, fortran_name=None, include=True, if autodef: self.definition_path[autodef].append(name) self.user_set.add(name) + # function to trigger if a value is modified in the include file + # main target is an action to force correct recompilation (e.g. when a compilation flag changes) + if fct_mod: + self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): + self.path = finput finput = open(finput) else: raise Exception("No such file %s" % finput) @@ -2784,7 +2847,7 @@ def read(self, finput, consistency=True): name = name.lower().strip() if name not in self: #looks like an entry added by a user -> add it nicely - self.add_unknown_entry(name, value) + self.add_unknown_entry(name, value, unknown_warning) else: self.set( name, value, user=True) # parameter not set in the run_card can be set to compatiblity value @@ -2796,7 +2859,7 @@ def read(self, finput, consistency=True): logger.warning(str(error)) else: raise - def add_unknown_entry(self, name, value): + def add_unknown_entry(self, name, value, unknown_warning): """function to add an entry to the run_card when the associated parameter does not exists. This is based on the guess_entry_fromname for the various syntax providing input. This then call add_param accordingly. @@ -2835,7 +2898,7 @@ def add_unknown_entry(self, name, value): raise Exception("dictionary need to have at least one entry") default['dict']['__type__'] = default[self.guess_type_from_value(default_value[0])] - if name not in RunCard.donewarning: + if name not in RunCard.donewarning and unknown_warning: logger.warning("Found unexpected entry in run_card: \"%s\" with value \"%s\".\n"+\ " The type was assigned to %s.
\n"+\ " The definition of that variable will %sbe automatically added to fortran file %s\n"+\ @@ -2873,7 +2936,17 @@ def valid_line(self, line, tmp): return False else: return True - + + + def reset_simd(self, old_value, new_value, name, *args, **opts): + #return + raise Exception('pass in reset simd') + + def make_clean(self,old_value, new_value, name, dir): + raise Exception('pass make clean for ', dir) + + def make_Ptouch(self,old_value, new_value, name, reset): + raise Exception('pass Ptouch for ', reset) def write(self, output_file, template=None, python_template=False, write_hidden=False, template_options=None, **opt): @@ -2898,11 +2971,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self @@ -3048,6 +3122,77 @@ def write(self, output_file, template=None, python_template=False, else: output_file.write(text) + def get_last_value_include(self, output_dir): + """For parameters in self.fct_mod + parse the associated inc file to get the value of the previous run. + We return a dictionary {name: old_value} + if inc file does not exist we will return the current value (i.e. set has no change) + """ + + #remember that + # default_include_file is a class variable + # self.includepath is on the form include_path : [list of param ] + out = {} + + # setup inc_to_parse to be like self.includepath (include_path : [list of param ]) + # BUT only containing the parameters that need to be tracked for the fct_mod option + inc_to_parse = {} + for inc_file, params in self.includepath.items(): + if not inc_file: + continue + if any(p in params for p in self.fct_mod): + inc_to_parse[inc_file] = [name for name in self.includepath[inc_file] if name in self.fct_mod] + + # now loop over the files and call the associated function + for inc_file, params in inc_to_parse.items(): + if inc_file is True: + inc_file = self.default_include_file + out.update(self.get_value_from_include(inc_file, params, output_dir)) + + return out + + def get_value_from_include(self, path, list_of_params, output_dir): + """for a given include file return the current value of the requested parameter + return a dictionary {name: value} + if path does not exist return the current value in self for all parameters""" + + #WARNING DOES NOT HANDLE LIST/DICT so far + + # handle case where file is missing + if not os.path.exists(pjoin(output_dir,path)): + misc.sprint("include file not existing", pjoin(output_dir,path)) + out = {name: self[name] for name in list_of_params} + + with open(pjoin(output_dir,path), 'r') as fsock: + text = fsock.read() + + for name in list_of_params: + misc.sprint(name, name in self.fortran_name) + misc.sprint(self.fortran_name[name] if name in self.fortran_name else name) + to_track = [self.fortran_name[name] if name in self.fortran_name else name for name in list_of_params] + pattern = re.compile(r"\(?(%(names)s)\s?=\s?([^)]*)\)?"
% {'names':'|'.join(to_track)}, re.I) + out = dict(pattern.findall(text)) + misc.sprint(out) + for name in list_of_params: + if name in self.fortran_name: + value = out[self.fortran_name[name]] + del out[self.fortran_name[name]] + out[name] = value + + for name, value in out.items(): + try: + out[name] = self.format_variable(value, type(self[name])) + except Exception: + continue + + if len(out) != len(list_of_params): + misc.sprint(list_of_params) + misc.sprint(to_track) + misc.sprint(self.fortran_name) + misc.sprint(text) + raise Exception + return out + def get_default(self, name, default=None, log_level=None): """return self[name] if exist otherwise default. log control if we @@ -3338,71 +3483,93 @@ def write_include_file(self, output_dir, output_file=None): #ensusre that system only parameter are correctly set self.update_system_parameter_for_include() + value_in_old_include = self.get_last_value_include(output_dir) + + if output_dir: self.write_autodef(output_dir, output_file=None) # check/fix status of customised functions self.edit_dummy_fct_from_file(self["custom_fcts"], os.path.dirname(output_dir)) for incname in self.includepath: - if incname is True: - pathinc = self.default_include_file - elif incname is False: - continue - else: - pathinc = incname + self.write_one_include_file(output_dir, incname, output_file) + + for name,value in value_in_old_include.items(): + if value != self[name]: + self.fct_mod[name][0](value, self[name], name, *self.fct_mod[name][1],**self.fct_mod[name][2]) - if output_file: - fsock = output_file + def write_one_include_file(self, output_dir, incname, output_file=None): + """write one include file at a time""" + + misc.sprint(incname) + if incname is True: + pathinc = self.default_include_file + elif incname is False: + return + else: + pathinc = incname + + if output_file: + fsock = output_file + else: + fsock = file_writers.FortranWriter(pjoin(output_dir,pathinc+'.tmp')) + + + for key in self.includepath[incname]: + #define the fortran name + if key in self.fortran_name: + fortran_name = self.fortran_name[key] else: - fsock = file_writers.FortranWriter(pjoin(output_dir,pathinc+'.tmp')) - for key in self.includepath[incname]: - #define the fortran name - if key in self.fortran_name: - fortran_name = self.fortran_name[key] + fortran_name = key + + if incname in self.include_as_parameter: + fsock.writelines('INTEGER %s\n' % fortran_name) + #get the value with warning if the user didn't set it + value = self.get_default(key) + if hasattr(self, 'mod_inc_%s' % key): + value = getattr(self, 'mod_inc_%s' % key)(value) + # Special treatment for strings containing a list of + # strings. Convert it to a list of strings + if isinstance(value, list): + # in case of a list, add the length of the list as 0th + # element in fortran.
Only in case of integer or float + # list (not for bool nor string) + targettype = self.list_parameter[key] + if targettype is bool: + pass + elif targettype is int: + line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(len(value))) + fsock.writelines(line) + elif targettype is float: + line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(float(len(value)))) + fsock.writelines(line) + # output the rest of the list in fortran + for i,v in enumerate(value): + line = '%s(%s) = %s \n' % (fortran_name, i+1, self.f77_formatting(v)) + fsock.writelines(line) + elif isinstance(value, dict): + for fortran_name, onevalue in value.items(): + line = '%s = %s \n' % (fortran_name, self.f77_formatting(onevalue)) + fsock.writelines(line) + elif isinstance(incname,str) and 'compile' in incname: + if incname in self.include_as_parameter: + line = 'PARAMETER (%s=%s)' %( fortran_name, value) else: - fortran_name = key - - #get the value with warning if the user didn't set it - value = self.get_default(key) - if hasattr(self, 'mod_inc_%s' % key): - value = getattr(self, 'mod_inc_%s' % key)(value) - # Special treatment for strings containing a list of - # strings. Convert it to a list of strings - if isinstance(value, list): - # in case of a list, add the length of the list as 0th - # element in fortran. Only in case of integer or float - # list (not for bool nor string) - targettype = self.list_parameter[key] - if targettype is bool: - pass - elif targettype is int: - line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(len(value))) - fsock.writelines(line) - elif targettype is float: - line = '%s(%s) = %s \n' % (fortran_name, 0, self.f77_formatting(float(len(value)))) - fsock.writelines(line) - # output the rest of the list in fortran - for i,v in enumerate(value): - line = '%s(%s) = %s \n' % (fortran_name, i+1, self.f77_formatting(v)) - fsock.writelines(line) - elif isinstance(value, dict): - for fortran_name, onevalue in value.items(): - line = '%s = %s \n' % (fortran_name, self.f77_formatting(onevalue)) - fsock.writelines(line) - elif isinstance(incname,str) and 'compile' in incname: line = '%s = %s \n' % (fortran_name, value) - fsock.write(line) + fsock.write(line) + else: + if incname in self.include_as_parameter: + line = 'PARAMETER (%s=%s)' %( fortran_name, self.f77_formatting(value)) else: line = '%s = %s \n' % (fortran_name, self.f77_formatting(value)) - fsock.writelines(line) - if not output_file: - fsock.close() - path = pjoin(output_dir,pathinc) - if not os.path.exists(path) or not filecmp.cmp(path, path+'.tmp'): - files.mv(path+'.tmp', path) - else: - os.remove(path+'.tmp') - + fsock.writelines(line) + if not output_file: + fsock.close() + path = pjoin(output_dir,pathinc) + if not os.path.exists(path) or not filecmp.cmp(path, path+'.tmp'): + files.mv(path+'.tmp', path) + else: + os.remove(path+'.tmp') def write_autodef(self, output_dir, output_file=None): """ Add the definition of variable to run.inc if the variable is set with autodef. @@ -3741,13 +3908,14 @@ def remove_all_cut(self): %(tmin_for_channel)s = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact %(survey_splitting)s = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. %(survey_nchannel_per_job)s = survey_nchannel_per_job ! 
control how many Channel are integrated inside a single job on cluster/multicore - %(refine_evt_by_job)s = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) + %(refine_evt_by_job)s = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) #********************************************************************* -# Compilation flag. No automatic re-compilation (need manual "make clean" in Source) +# Compilation flag. #********************************************************************* %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' @@ -3903,9 +4071,12 @@ class RunCardLO(RunCard): "get_dummy_x1_x2": pjoin("SubProcesses","dummy_fct.f"), "dummy_boostframe": pjoin("SubProcesses","dummy_fct.f"), "user_dynamical_scale": pjoin("SubProcesses","dummy_fct.f"), + "bias_wgt_custom": pjoin("SubProcesses","dummy_fct.f"), "user_": pjoin("SubProcesses","dummy_fct.f") # all function starting by user will be added to that file } + include_as_parameter = ['vector.inc'] + if MG5DIR: default_run_card = pjoin(MG5DIR, "internal", "default_run_card_lo.dat") @@ -4139,10 +4310,15 @@ def default_setup(self): self.add_param('hel_splitamp', True, hidden=True, include=False, comment='decide if amplitude aloha call can be splitted in two or not when doing helicity per helicity optimization.') self.add_param('hel_zeroamp', True, hidden=True, include=False, comment='decide if zero amplitude can be removed from the computation when doing helicity per helicity optimization.') self.add_param('SDE_strategy', 1, allowed=[1,2], fortran_name="sde_strat", comment="decide how Multi-channel should behaves \"1\" means full single diagram enhanced (hep-ph/0208156), \"2\" use the product of the denominator") - self.add_param('global_flag', '-O', include=False, hidden=True, comment='global fortran compilation flag, suggestion -fbound-check') - self.add_param('aloha_flag', '', include=False, hidden=True, comment='global fortran compilation flag, suggestion: -ffast-math') - self.add_param('matrix_flag', '', include=False, hidden=True, comment='fortran compilation flag for the matrix-element files, suggestion -O3') - + self.add_param('global_flag', '-O', include=False, hidden=True, comment='global fortran compilation flag, suggestion -fbound-check', + fct_mod=(self.make_clean, ('Source'),{})) + self.add_param('aloha_flag', '', include=False, hidden=True, comment='global fortran compilation flag, suggestion: -ffast-math', + fct_mod=(self.make_clean, ('Source/DHELAS'),{})) + self.add_param('matrix_flag', '', include=False, hidden=True, comment='fortran compilation flag for the matrix-element files, suggestion -O3', + fct_mod=(self.make_Ptouch, ('matrix'),{})) + self.add_param('vector_size', 1, include='vector.inc', hidden=True, comment='lockstep size for parallelism run', + fortran_name='VECSIZE_MEMMAX', fct_mod=(self.reset_simd,(),{})) + # parameter allowing to define simple cut via the pdg # Special syntax are related to those.
(can not be edit directly) self.add_param('pt_min_pdg',{'__type__':0.}, include=False, cut=True) @@ -4164,8 +4340,7 @@ def default_setup(self): self.add_param('mxxmin4pdg',[-1.], system=True) self.add_param('mxxpart_antipart', [False], system=True) - # CUDACPP parameters - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + def check_validity(self): """ """ @@ -4704,6 +4879,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5769,7 +5947,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py index fe874a06a4..71089d7480 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py @@ -85,7 +85,7 @@ def load_str(self, text): self.value= ' '.join(data[len(self.lhacode):]) # check that lhacode are the first entry otherwise return invalid param. if ' '.join([str(i) for i in self.lhacode]) != ' '.join(data[:len(self.lhacode)]): - raise InvalidParam + raise InvalidParam("line was %s" % str(data)) else: self.value = data[-1] diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py index 5d0187e3fa..87cb4b88df 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py @@ -749,13 +749,15 @@ def writeRunWeb(me_dir): class RunWebHandling(object): - def __init__(self, me_dir, crashifpresent=True, warnifpresent=True): + def __init__(self, me_dir, crashifpresent=True, warnifpresent=True, force_run=False): """raise error if RunWeb already exists me_dir is the directory where the write RunWeb""" self.remove_run_web = True self.me_dir = me_dir - + if force_run: + self.remove_run_web = False + return if crashifpresent or warnifpresent: if os.path.exists(pjoin(me_dir, 'RunWeb')): pid = open(pjoin(me_dir, 'RunWeb')).read() @@ -4904,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ -6574,7 +6577,7 @@ def reask(self, *args, **opt): fail_due_to_format = 0 #parameter to avoid infinite loop def postcmd(self, stop, line): - if line not in [None, '0', 'done', '']: + if line not in [None, '0', 'done', '',0]: ending_question = cmd.OneLinePathCompletion.postcmd(self,stop,line) else: ending_question = True @@ -6583,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 
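The update_dependent_done flag introduced above is a run-once guard: postcmd triggers the expensive 'update dependent' pass after each answer, and the flag lets a pass already performed by an explicit update command (or by check_answer_consistency) satisfy it before the guard is re-armed. A stripped-down sketch of the pattern follows; the class and method names are illustrative, not the MadGraph API.

    class CardEditor:
        """Run an expensive consistency pass at most once per answer cycle."""

        def __init__(self):
            self.update_dependent_done = False

        def do_update_dependent(self):
            # ... expensive recomputation of dependent parameters ...
            self.update_dependent_done = True

        def postcmd(self):
            if not self.update_dependent_done:  # skip if a hook in this cycle already ran it
                self.do_update_dependent()
            self.update_dependent_done = False  # re-arm for the next answer
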
@@ -6636,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ -6715,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6779,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6907,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' @@ -7533,7 +7546,8 @@ def open_file(self, answer): else: raise if time.time() - start < .5: - self.mother_interface.ask("Are you really that fast? If you are using an editor that returns directly. Please confirm that you have finised to edit the file", 'y') + self.mother_interface.ask("Are you really that fast? If you are using an editor that returns directly. Please confirm that you have finised to edit the file", 'y', + timeout=False) self.reload_card(path) def reload_card(self, path): diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py index a6a8609dce..2f37070580 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py @@ -1108,9 +1108,12 @@ def ask(self, question, default, choices=[], path_msg=None, if alias: choices += list(alias.keys()) + + question_instance = obj(question, allow_arg=choices, default=default, mother_interface=self, **opt) - + if fct_timeout is None: + fct_timeout = lambda x: question_instance.postcmd(x, default) if x and default else False if first_cmd: if isinstance(first_cmd, str): question_instance.onecmd(first_cmd) @@ -2271,6 +2274,9 @@ def postcmd(self, stop, line): if n: self.default(line) return self.postcmd(stop, line) + elif self.value is None and line: + self.default(line) + return self.postcmd(stop, line) if not self.casesensitive: for ans in self.allow_arg: if ans.lower() == self.value.lower(): diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py index 3b8ec31215..5fd170d18d 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py @@ -154,9 +154,18 @@ def get_helicity(self, to_submit=True, clean=True): p = misc.Popen(['./gensym'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, cwd=Pdir) #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts + (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - nb_channel = max([math.floor(float(d)) for d in stdout.split()]) + if stdout: + nb_channel = max([math.floor(float(d)) for d in stdout.split()]) + else: + for matrix_file in 
misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): @@ -178,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. + # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py index cff8789e38..a6b8582e1a 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py @@ -342,7 +342,12 @@ def next_event(self): text.append(line) if '' in line: - if self.parsing: + if self.parsing == "wgt_only": + out = Event(text, parse_momenta=False) + #if len(out) == 0 and not self.allow_empty_event: + # raise Exception + return out + elif self.parsing: out = Event(text) if len(out) == 0 and not self.allow_empty_event: raise Exception @@ -448,6 +453,8 @@ def unweight(self, outputpath, get_wgt=None, max_wgt=0, trunc_error=0, event_target reweight for that many event with maximal trunc_error. (stop to write event when target is reached) """ + self.parsing = 'wgt_only' + if not get_wgt: def weight(event): return event.wgt @@ -914,6 +921,8 @@ class MultiEventFile(EventFile): The number of events in each file need to be provide in advance (if not provide the file is first read to find that number""" + parsing = True # check if/when we need to parse the event. + def __new__(cls, start_list=[],parse=True): return object.__new__(MultiEventFile) @@ -986,6 +995,7 @@ def next(self): nb_event = random.randint(1, remaining_event) sum_nb=0 for i, obj in enumerate(self.files): + obj.parsing = "wgt_only" sum_nb += self.initial_nb_events[i] - self.curr_nb_events[i] if nb_event <= sum_nb: self.curr_nb_events[i] += 1 @@ -1065,6 +1075,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): # check special case without PDF for one (or both) beam if init_information["idbmup1"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup1"]= event[0].pdg if init_information["idbmup2"] == 0: init_information["idbmup2"]= event[1].pdg @@ -1115,6 +1127,7 @@ def initialize_unweighting(self, getwgt, trunc_error): total_event = 0 sum_cross = collections.defaultdict(int) for i,f in enumerate(self.files): + f.parsing = 'wgt_only' nb_event = 0 # We need to loop over the event file to get some information about the # new cross-section/ wgt of event. 
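The parsing = 'wgt_only' mode threaded through the hunks above lets unweighting scan an event file without decoding particle momenta, since only the event weight is needed to accept or reject an event. A self-contained sketch of that fast path follows; LazyEvent and unweight are illustrative stand-ins, not the real lhe_parser classes.

    import random

    class LazyEvent:
        """Hold one <event> text block; decode momenta only when requested."""

        def __init__(self, lines, parse_momenta=True):
            # first line of the block: "nexternal ievent wgt scale aqed aqcd"
            self.lines = lines
            self.wgt = float(lines[0].split()[2])  # the only field unweighting needs
            self.particles = None
            if parse_momenta:
                # particle lines: id status moth1 moth2 col1 col2 px py pz E m ...
                self.particles = [tuple(map(float, l.split()[6:10])) for l in lines[1:]]

    def unweight(events, max_wgt):
        """Accept or reject on weights alone; rejected events never pay for a full parse."""
        for event in events:  # expects LazyEvent(..., parse_momenta=False) instances
            if random.random() < event.wgt / max_wgt:
                yield event
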
@@ -1302,7 +1315,7 @@ class Event(list): warning_order = True # raise a warning if the order of the particle are not in accordance of child/mother - def __init__(self, text=None): + def __init__(self, text=None, parse_momenta=True): """The initialization of an empty Event (or one associate to a text file)""" list.__init__(self) @@ -1322,15 +1335,15 @@ def __init__(self, text=None): self.matched_scale_data = None self.syscalc_data = {} if text: - self.parse(text) + self.parse(text, parse_momenta=parse_momenta) - - def parse(self, text): + event_flag_pattern = re.compile(r"""(\w*)=(?:(?:['"])([^'"]*)(?=['"])|(\S*))""") + def parse(self, text, parse_momenta=True): """Take the input file and create the structured information""" #text = re.sub(r'', '', text) # remove pointless tag status = 'first' - + tags = [] if not isinstance(text, list): text = text.split('\n') @@ -1354,24 +1367,28 @@ def parse(self, text): if '' in line: status = 'tag' else: - self.assign_scale_line(line) + self.assign_scale_line(line, convert=parse_momenta) status = 'part' continue if '<' in line: status = 'tag' if 'part' == status: - part = Particle(line, event=self) - if part.E != 0 or part.status==-1: - self.append(part) - elif self.nexternal: - self.nexternal-=1 + if parse_momenta: + part = Particle(line, event=self) + if part.E != 0 or part.status==-1: + self.append(part) + elif self.nexternal: + self.nexternal-=1 + else: + tags.append(line) else: - if '' in line: + if line.endswith(''): line = line.replace('','',1) - self.tag += '%s\n' % line - - self.assign_mother() + tags.append(line) + self.tag += "\n".join(tags) + if parse_momenta: + self.assign_mother() def assign_mother(self): @@ -1905,19 +1922,27 @@ def check(self): #3. check mass - def assign_scale_line(self, line): + def assign_scale_line(self, line, convert=True): """read the line corresponding to global event line format of the line is: Nexternal IEVENT WEIGHT SCALE AEW AS """ inputs = line.split() assert len(inputs) == 6 - self.nexternal=int(inputs[0]) - self.ievent=int(inputs[1]) - self.wgt=float(inputs[2]) - self.scale=float(inputs[3]) - self.aqed=float(inputs[4]) - self.aqcd=float(inputs[5]) + if convert: + self.nexternal=int(inputs[0]) + self.ievent=int(inputs[1]) + self.wgt=float(inputs[2]) + self.scale=float(inputs[3]) + self.aqed=float(inputs[4]) + self.aqcd=float(inputs[5]) + else: + self.nexternal=inputs[0] + self.ievent=inputs[1] + self.wgt=float(inputs[2]) + self.scale=inputs[3] + self.aqed=inputs[4] + self.aqcd=inputs[5] def get_tag_and_order(self): """Return the unique tag identifying the SubProcesses for the generation. 
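The convert switch given to assign_scale_line above is the matching half of the weight-only mode: every field of the global event line except wgt stays a raw string, so the event can later be re-emitted essentially verbatim (the try/except added to __str__ below handles both representations). A sketch of that conditional conversion, as a standalone function rather than the real method:

    def parse_global_line(line, convert=True):
        """Split 'nexternal ievent wgt scale aqed aqcd'; convert types on demand."""
        nexternal, ievent, wgt, scale, aqed, aqcd = line.split()
        out = {'wgt': float(wgt)}  # the weight is always needed as a number
        if convert:
            out.update(nexternal=int(nexternal), ievent=int(ievent),
                       scale=float(scale), aqed=float(aqed), aqcd=float(aqcd))
        else:  # keep raw strings for a byte-faithful round trip
            out.update(nexternal=nexternal, ievent=ievent,
                       scale=scale, aqed=aqed, aqcd=aqcd)
        return out
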
@@ -2269,7 +2294,11 @@ def __str__(self, event_id=''): else: event_flag = '' - scale_str = "%2d %6d %+13.7e %14.8e %14.8e %14.8e" % \ + try: + scale_str = "%2d %6d %+13.7e %14.8e %14.8e %14.8e" % \ + (self.nexternal,self.ievent,self.wgt,self.scale,self.aqed,self.aqcd) + except: + scale_str = "%s %s %+13.7e %s %s %s" % \ (self.nexternal,self.ievent,self.wgt,self.scale,self.aqed,self.aqcd) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py index b70b548e53..cb6bf4ca57 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,8 +3703,9 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) + start = time.time() # Check argument's validity self.check_combine_events(args) self.update_status('Combining Events', level='parton') @@ -3795,8 +3796,9 @@ def do_combine_events(self, line): if self.run_card['bias_module'].lower() not in ['dummy', 'none'] and nb_event: self.correct_bias() - - + elif self.run_card['custom_fcts']: + self.correct_bias() + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7364,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7407,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ -7415,7 +7423,8 @@ def error(self, msg=''): level = int(options.logging) else: level = eval('logging.' 
+ options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7429,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/misc.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/shower_card.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model.pkl b/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model.pkl deleted file mode 100644 index f71ba45bbc6d4acc8d32bb06662fe900a694009f..0000000000000000000000000000000000000000 GIT binary patch [42822 bytes of base85-encoded binary patch data omitted; the diff header of the cxsmpl hunk that follows was lost in the same garbled span]
[... GIT binary patch literal data omitted (unreadable base85 payload) ...]

 a, 0 ) * b; }
+  inline __host__ __device__ constexpr cxsmpl<float>
+  operator*( const cxsmpl<float>& a, const double& b )
+  {
+    return a * cxsmpl<float>( b, 0 );
+  }
+
   template<typename FP>
   inline __host__ __device__ constexpr cxsmpl<FP>
   operator/( const cxsmpl<FP>& a, const cxsmpl<FP>& b )
diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
index 8047308e05..3eb5706f27 100755
--- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
+++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
@@ -14,7 +14,7 @@ Running MG5 in debug mode
 * * * * * * * * * * * *
-* VERSION 3.5.1_lo_vect 2023-08-08 *
+* VERSION 3.5.2_lo_vect 2023-11-08 *
 * *
 * WARNING: UNKNOWN DEVELOPMENT VERSION. *
 * WARNING: DO NOT USE FOR PRODUCTION *
@@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg.mg
+import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -62,7 +62,7 @@ generate g g > t t~ g g g
 No model currently active, so we import the Standard Model
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.005494117736816406
+DEBUG: model prefixing takes 0.005807638168334961
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions
 DEBUG: remove interactions: u s w+ at order: QED=1
@@ -155,66 +155,35 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED
 INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1
 INFO: Process has 1240 diagrams
-1 processes with 1240 diagrams generated in 1.886 s
+1 processes with 1240 diagrams generated in 1.890 s
 Total: 1 processes with 1240 diagrams
-output standalone_cudacpp CODEGEN_cudacpp_gg_ttggg
-Load PLUGIN.CUDACPP_SA_OUTPUT
-Output will be done with PLUGIN: CUDACPP_SA_OUTPUT
+output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg
+Load PLUGIN.CUDACPP_OUTPUT
+Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG: cformat = plugin [export_cpp.py at line 3071]
-DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 152]
-DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 157]
-INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg
+DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]
+DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]
+INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg
 INFO: Organizing processes into subprocess groups
 INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1
 INFO: Processing color information for process: g g > t t~ g g g @1
-DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 186]
-DEBUG: type(subproc_group)= [output.py at line 187]
-DEBUG: type(fortran_model)= [output.py at line 188]
-DEBUG: type(me)= me=0 [output.py at line 189]
-DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]
-DEBUG: proc_id = 0 [model_handling.py at line 1045]
-INFO: Creating files in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg
-DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]
-DEBUG: self.include_multi_channel is not yet defined: this is standalone_cudacpp mode [model_handling.py at line 1301]
-FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h
-DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1453]
-FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc
-DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1475]
-DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1143]
-DEBUG: self.include_multi_channel = False [model_handling.py at line 1144]
-DEBUG: self.support_multichannel = True [model_handling.py at line 1145]
-DEBUG: type(self.helas_call_writer) = [model_handling.py at line 1162]
-DEBUG: self.support_multichannel, self.include_multi_channel = True False [model_handling.py at line 1163]
-DEBUG: multi_channel_map = None [model_handling.py at line 1654]
-DEBUG: diag_to_config = {} [model_handling.py at line 1711]
-DEBUG: call = vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]
-DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1824]
-DEBUG: call = vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]
-DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1824]
-DEBUG: call = vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]
-DEBUG: ('ZERO', 4, 1, 4, 4) [model_handling.py at line 1824]
-DEBUG: call = vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]
-DEBUG: ('ZERO', 5, 1, 5, 5) [model_handling.py at line 1824]
-DEBUG: call = vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1823]
-DEBUG: ('ZERO', 6, 1, 6, 6) [model_handling.py at line 1824]
-INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/.
-DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1343]
-DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1352]
-DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1369]
-DEBUG: Entering PLUGIN_OneProcessExporter.edit_processidfile [model_handling.py at line 1389]
-DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1419]
-DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]
-DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]
-DEBUG: 'Copying test reference file: ', template_ref = Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxggg.txt [model_handling.py at line 1335]
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.533 s
-DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]
+DEBUG: type(subproc_group)= [output.py at line 195]
+DEBUG: type(fortran_model)= [output.py at line 196]
+DEBUG: type(me)= me=0 [output.py at line 197]
+INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg
+FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h
+FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc
+INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/.
+Generated helas calls for 1 subprocesses (1240 diagrams) in 6.499 s
+DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 5 routines in 0.342 s
+ALOHA: aloha creates 5 routines in 0.347 s
 VVV1
 VVV1
 FFV1
@@ -227,23 +196,17 @@ ALOHA: aloha creates 5 routines in 0.342 s
 VVVV3
 VVVV4
 VVVV4
-FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h
-INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/.
+FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h
+INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/.
 super_write_set_parameters_onlyfixMajorana (hardcoded=False)
-DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) = pardef_lines size = 48 , keys size = 48 [model_handling.py at line 729]
 super_write_set_parameters_onlyfixMajorana (hardcoded=True)
-DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) = pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]
-DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) = pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]
-DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) = pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]
-DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) = pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]
-DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) = pardef_lines size = 3 , keys size = 3 [model_handling.py at line 729]
-FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h
-FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc
+FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h
+FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
-INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/.
-DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]
+INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/.
 quit
-real 0m13.003s
-user 0m12.866s
-sys 0m0.085s
+real 0m12.939s
+user 0m12.788s
+sys 0m0.093s
+Code generation completed in 13 seconds
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h
index f37c972b24..89437b4c42 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <filesystem>
 #include
 #include
 #include
@@ -244,14 +245,21 @@ namespace mg5amcCpu
     }
     std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl;
-    CPPProcess process( /*verbose=*/false );
     m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) );
 #else
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
-    CPPProcess process( /*verbose=*/false );
     m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
 #endif // MGONGPUCPP_GPUIMPL
-    process.initProc( "../../Cards/param_card.dat" );
+    // Create a process object, read param card and set parameters
+    // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
+    // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads?
+    CPPProcess process( /*verbose=*/false );
+    std::string paramCard = "../../Cards/param_card.dat";
+    if( !std::filesystem::exists( paramCard ) )
+    {
+      paramCard = "../" + paramCard;
+    }
+    process.initProc( paramCard );
   }
 #ifdef MGONGPUCPP_GPUIMPL
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h
index 9c467b1e04..6a7d9c05c0 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h
@@ -39,6 +39,8 @@
 #elif defined __HIPCC__
+#include "hip/hip_runtime.h"
+
 #define gpuError_t hipError_t
 #define gpuPeekAtLastError hipPeekAtLastError
 #define gpuGetErrorString hipGetErrorString
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h
index 176338151a..a64c05c26a 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h
@@ -14,6 +14,7 @@
 #include
 #include
+#include <filesystem>
 #include
 #include
 #include
@@ -215,19 +216,16 @@ TEST_P( MadgraphTest, CompareMomentaAndME )
 #endif
   constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
   // Dump events to a new reference file?
-  constexpr bool dumpEvents = false;
-  std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt";
-  while( dumpFileName.find( '/' ) != std::string::npos )
-  {
-    dumpFileName.replace( dumpFileName.find( '/' ), 1, "_" );
-  }
+  const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" );
+  const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" );
+  const std::string refFileName = testDriver->getRefFileName();
+  const std::string dumpFileName = std::filesystem::path( refFileName ).filename();
   std::ofstream dumpFile;
   if( dumpEvents )
   {
     dumpFile.open( dumpFileName, std::ios::trunc );
   }
   // Read reference data
-  const std::string refFileName = testDriver->getRefFileName();
   std::map referenceData;
   if( !dumpEvents )
   {
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc
index d6d6c4f179..81699dfea9 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc
@@ -112,10 +112,17 @@ namespace mg5amcCpu
   // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu
   bool ok = true; // this is just an assumption!
   const std::string tag = "arm neon (128bit as in SSE4.2)";
-#else
+#elif defined( __x86_64__ ) || defined( __i386__ )
   bool known = true;
   bool ok = __builtin_cpu_supports( "sse4.2" );
   const std::string tag = "nehalem (SSE4.2)";
+#else // AV FIXME!
Added by OM for Mac, should identify the correct __xxx__ flag that should be targeted + bool known = false; // __builtin_cpu_supports is not supported + // See https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions-Available-on-all-Configurations.html + // See https://stackoverflow.com/q/62783908 + // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu + bool ok = true; // this is just an assumption! + const std::string tag = "arm neon (128bit as in SSE4.2)"; #endif #else bool known = true; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index 11def8bb25..fa23301c50 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -253,13 +253,13 @@ namespace mg5amcCpu vxxxxx( momenta, 0., cHel[ihel][6], +1, w_fp[6], 6 ); - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 0., 0., w_fp[7] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 0., 0., w_fp[8] ); - VVV1P0_1( w_fp[7], w_fp[4], COUPs[0], 0., 0., w_fp[9] ); - VVV1P0_1( w_fp[8], w_fp[5], COUPs[0], 0., 0., w_fp[10] ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[7] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); + VVV1P0_1( w_fp[7], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[9] ); + VVV1P0_1( w_fp[8], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); // Amplitude(s) for diagram number 1 - VVV1_0( w_fp[9], w_fp[10], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[9], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -283,10 +283,10 @@ namespace mg5amcCpu // *** DIAGRAM 2 OF 1240 *** // Wavefunction(s) for diagram number 2 - VVV1P0_1( w_fp[8], w_fp[6], COUPs[0], 0., 0., w_fp[11] ); + VVV1P0_1( w_fp[8], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[11] ); // Amplitude(s) for diagram number 2 - VVV1_0( w_fp[9], w_fp[11], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[9], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -313,7 +313,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 3 - VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -333,7 +333,7 @@ namespace mg5amcCpu jamp_sv[102] -= amp_sv[0]; jamp_sv[108] -= amp_sv[0]; jamp_sv[110] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 
&_fp[0] ); + VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -353,7 +353,7 @@ namespace mg5amcCpu jamp_sv[116] += amp_sv[0]; jamp_sv[118] += amp_sv[0]; jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -377,11 +377,11 @@ namespace mg5amcCpu // *** DIAGRAM 4 OF 1240 *** // Wavefunction(s) for diagram number 4 - VVV1P0_1( w_fp[7], w_fp[5], COUPs[0], 0., 0., w_fp[12] ); - VVV1P0_1( w_fp[8], w_fp[4], COUPs[0], 0., 0., w_fp[13] ); + VVV1P0_1( w_fp[7], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[12] ); + VVV1P0_1( w_fp[8], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[13] ); // Amplitude(s) for diagram number 4 - VVV1_0( w_fp[12], w_fp[13], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[12], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -408,7 +408,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 5 - VVV1_0( w_fp[12], w_fp[11], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[12], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -435,7 +435,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 6 - VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -455,7 +455,7 @@ namespace mg5amcCpu jamp_sv[103] -= amp_sv[0]; jamp_sv[114] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -475,7 +475,7 @@ namespace mg5amcCpu jamp_sv[113] -= amp_sv[0]; jamp_sv[114] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -499,10 +499,10 @@ namespace mg5amcCpu // *** DIAGRAM 7 OF 1240 *** // Wavefunction(s) for diagram number 7 - VVV1P0_1( w_fp[7], w_fp[6], COUPs[0], 0., 0., w_fp[14] ); + VVV1P0_1( w_fp[7], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[14] ); // Amplitude(s) for diagram number 7 - VVV1_0( w_fp[14], w_fp[13], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[14], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -529,7 +529,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 8 - VVV1_0( w_fp[14], 
w_fp[10], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[14], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -556,7 +556,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 9 - VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -576,7 +576,7 @@ namespace mg5amcCpu jamp_sv[92] += amp_sv[0]; jamp_sv[97] += amp_sv[0]; jamp_sv[103] -= amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -596,7 +596,7 @@ namespace mg5amcCpu jamp_sv[92] += amp_sv[0]; jamp_sv[96] += amp_sv[0]; jamp_sv[102] -= amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -620,12 +620,12 @@ namespace mg5amcCpu // *** DIAGRAM 10 OF 1240 *** // Wavefunction(s) for diagram number 10 - VVVV1P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[15] ); - VVVV3P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[16] ); - VVVV4P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[17] ); + VVVV1P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] ); + VVVV3P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[16] ); + VVVV4P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] ); // Amplitude(s) for diagram number 10 - VVV1_0( w_fp[8], w_fp[6], w_fp[15], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[15], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -645,7 +645,7 @@ namespace mg5amcCpu jamp_sv[113] -= amp_sv[0]; jamp_sv[118] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -665,7 +665,7 @@ namespace mg5amcCpu jamp_sv[113] -= amp_sv[0]; jamp_sv[114] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[17], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -689,12 +689,12 @@ namespace mg5amcCpu // *** DIAGRAM 11 OF 1240 *** // Wavefunction(s) for diagram number 11 - VVVV1P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[18] ); - VVVV3P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[19] ); - VVVV4P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[20] ); + VVVV1P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[18] ); + VVVV3P0_1( w_fp[7], w_fp[4], w_fp[6], 
COUPs[2], 1.0, 0., 0., w_fp[19] ); + VVVV4P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[20] ); // Amplitude(s) for diagram number 11 - VVV1_0( w_fp[8], w_fp[5], w_fp[18], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[18], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -714,7 +714,7 @@ namespace mg5amcCpu jamp_sv[95] += amp_sv[0]; jamp_sv[108] += amp_sv[0]; jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[19], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -734,7 +734,7 @@ namespace mg5amcCpu jamp_sv[92] += amp_sv[0]; jamp_sv[96] += amp_sv[0]; jamp_sv[102] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[20], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -758,12 +758,12 @@ namespace mg5amcCpu // *** DIAGRAM 12 OF 1240 *** // Wavefunction(s) for diagram number 12 - VVVV1P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[21] ); - VVVV3P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[22] ); - VVVV4P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[23] ); + VVVV1P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] ); + VVVV4P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] ); // Amplitude(s) for diagram number 12 - VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -783,7 +783,7 @@ namespace mg5amcCpu jamp_sv[92] += amp_sv[0]; jamp_sv[114] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -803,7 +803,7 @@ namespace mg5amcCpu jamp_sv[92] += amp_sv[0]; jamp_sv[97] += amp_sv[0]; jamp_sv[103] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -827,10 +827,10 @@ namespace mg5amcCpu // *** DIAGRAM 13 OF 1240 *** // Wavefunction(s) for diagram number 13 - VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 0., 0., w_fp[24] ); + VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[24] ); // Amplitude(s) for diagram number 13 - VVVV1_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -850,7 +850,7 @@ namespace mg5amcCpu jamp_sv[113] -= amp_sv[0]; jamp_sv[118] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - 
VVVV3_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -870,7 +870,7 @@ namespace mg5amcCpu jamp_sv[113] -= amp_sv[0]; jamp_sv[118] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -894,10 +894,10 @@ namespace mg5amcCpu // *** DIAGRAM 14 OF 1240 *** // Wavefunction(s) for diagram number 14 - VVV1P0_1( w_fp[7], w_fp[8], COUPs[0], 0., 0., w_fp[25] ); + VVV1P0_1( w_fp[7], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[25] ); // Amplitude(s) for diagram number 14 - VVV1_0( w_fp[24], w_fp[6], w_fp[25], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[24], w_fp[6], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -921,10 +921,10 @@ namespace mg5amcCpu // *** DIAGRAM 15 OF 1240 *** // Wavefunction(s) for diagram number 15 - VVV1P0_1( w_fp[7], w_fp[24], COUPs[0], 0., 0., w_fp[26] ); + VVV1P0_1( w_fp[7], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[26] ); // Amplitude(s) for diagram number 15 - VVV1_0( w_fp[8], w_fp[6], w_fp[26], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[26], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -951,7 +951,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 16 - VVV1_0( w_fp[8], w_fp[24], w_fp[14], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[24], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -975,10 +975,10 @@ namespace mg5amcCpu // *** DIAGRAM 17 OF 1240 *** // Wavefunction(s) for diagram number 17 - VVV1P0_1( w_fp[4], w_fp[6], COUPs[0], 0., 0., w_fp[27] ); + VVV1P0_1( w_fp[4], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[27] ); // Amplitude(s) for diagram number 17 - VVVV1_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -998,7 +998,7 @@ namespace mg5amcCpu jamp_sv[95] += amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; - VVVV3_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1018,7 +1018,7 @@ namespace mg5amcCpu jamp_sv[95] += amp_sv[0]; jamp_sv[108] += amp_sv[0]; jamp_sv[110] -= amp_sv[0]; - VVVV4_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) #endif @@ -1045,7 +1045,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 18 - VVV1_0( w_fp[27], w_fp[5], w_fp[25], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[27], w_fp[5], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1069,10 +1069,10 @@ namespace mg5amcCpu // *** DIAGRAM 19 OF 1240 *** // Wavefunction(s) for diagram number 19 - VVV1P0_1( w_fp[7], w_fp[27], COUPs[0], 0., 0., w_fp[28] ); + VVV1P0_1( w_fp[7], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[28] ); // Amplitude(s) for diagram number 19 - VVV1_0( w_fp[8], w_fp[5], w_fp[28], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[28], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1099,7 +1099,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 20 - VVV1_0( w_fp[8], w_fp[27], w_fp[12], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[27], w_fp[12], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1123,10 +1123,10 @@ namespace mg5amcCpu // *** DIAGRAM 21 OF 1240 *** // Wavefunction(s) for diagram number 21 - VVV1P0_1( w_fp[5], w_fp[6], COUPs[0], 0., 0., w_fp[29] ); + VVV1P0_1( w_fp[5], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] ); // Amplitude(s) for diagram number 21 - VVVV1_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1146,7 +1146,7 @@ namespace mg5amcCpu jamp_sv[95] -= amp_sv[0]; jamp_sv[118] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1166,7 +1166,7 @@ namespace mg5amcCpu jamp_sv[116] -= amp_sv[0]; jamp_sv[118] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1193,7 +1193,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 22 - VVV1_0( w_fp[4], w_fp[29], w_fp[25], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[4], w_fp[29], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1220,7 +1220,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 23 - VVV1_0( w_fp[8], w_fp[29], w_fp[9], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[29], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1244,10 +1244,10 @@ namespace mg5amcCpu // *** DIAGRAM 24 OF 1240 *** // Wavefunction(s) for 
diagram number 24 - VVV1P0_1( w_fp[7], w_fp[29], COUPs[0], 0., 0., w_fp[25] ); + VVV1P0_1( w_fp[7], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[25] ); // Amplitude(s) for diagram number 24 - VVV1_0( w_fp[8], w_fp[4], w_fp[25], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1271,12 +1271,12 @@ namespace mg5amcCpu // *** DIAGRAM 25 OF 1240 *** // Wavefunction(s) for diagram number 25 - VVVV1P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[30] ); - VVVV3P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[31] ); - VVVV4P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[32] ); + VVVV1P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[30] ); + VVVV3P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[31] ); + VVVV4P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[32] ); // Amplitude(s) for diagram number 25 - VVV1_0( w_fp[7], w_fp[8], w_fp[30], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[7], w_fp[8], w_fp[30], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1296,7 +1296,7 @@ namespace mg5amcCpu jamp_sv[95] -= amp_sv[0]; jamp_sv[118] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[7], w_fp[8], w_fp[31], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[7], w_fp[8], w_fp[31], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1316,7 +1316,7 @@ namespace mg5amcCpu jamp_sv[95] -= amp_sv[0]; jamp_sv[112] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; - VVV1_0( w_fp[7], w_fp[8], w_fp[32], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[7], w_fp[8], w_fp[32], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1340,12 +1340,12 @@ namespace mg5amcCpu // *** DIAGRAM 26 OF 1240 *** // Wavefunction(s) for diagram number 26 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[33] ); - FFV1_2( w_fp[3], w_fp[7], COUPs[1], cIPD[0], cIPD[1], w_fp[34] ); - FFV1_1( w_fp[33], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[35] ); + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[33] ); + FFV1_2( w_fp[3], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] ); + FFV1_1( w_fp[33], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[35] ); // Amplitude(s) for diagram number 26 - FFV1_0( w_fp[34], w_fp[35], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1355,10 +1355,10 @@ namespace mg5amcCpu // *** DIAGRAM 27 OF 1240 *** // Wavefunction(s) for diagram number 27 - FFV1_1( w_fp[33], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[36] ); + FFV1_1( w_fp[33], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[36] ); // Amplitude(s) for diagram number 27 - FFV1_0( w_fp[34], w_fp[36], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[36], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1368,10 
+1368,10 @@ namespace mg5amcCpu // *** DIAGRAM 28 OF 1240 *** // Wavefunction(s) for diagram number 28 - FFV1P0_3( w_fp[3], w_fp[33], COUPs[1], 0., 0., w_fp[37] ); + FFV1P0_3( w_fp[3], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[37] ); // Amplitude(s) for diagram number 28 - VVV1_0( w_fp[12], w_fp[37], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[12], w_fp[37], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1390,7 +1390,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 29 - FFV1_0( w_fp[3], w_fp[36], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[36], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1405,7 +1405,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 30 - VVV1_0( w_fp[14], w_fp[37], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[14], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1424,7 +1424,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 31 - FFV1_0( w_fp[3], w_fp[35], w_fp[14], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[35], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1439,7 +1439,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 32 - FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1451,7 +1451,7 @@ namespace mg5amcCpu jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1463,7 +1463,7 @@ namespace mg5amcCpu jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1479,11 +1479,11 @@ namespace mg5amcCpu // *** DIAGRAM 33 OF 1240 *** // Wavefunction(s) for diagram number 33 - FFV1_2( w_fp[3], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[38] ); - FFV1_1( w_fp[33], w_fp[7], COUPs[1], cIPD[0], cIPD[1], w_fp[39] ); + FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[38] ); + FFV1_1( w_fp[33], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] ); // Amplitude(s) for diagram number 33 - FFV1_0( w_fp[38], w_fp[39], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel 
support updates numerators_sv and denominators_sv (#473) #endif @@ -1493,10 +1493,10 @@ namespace mg5amcCpu // *** DIAGRAM 34 OF 1240 *** // Wavefunction(s) for diagram number 34 - FFV1_2( w_fp[38], w_fp[7], COUPs[1], cIPD[0], cIPD[1], w_fp[40] ); + FFV1_2( w_fp[38], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] ); // Amplitude(s) for diagram number 34 - FFV1_0( w_fp[40], w_fp[33], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[40], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1509,7 +1509,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 35 - FFV1_0( w_fp[38], w_fp[33], w_fp[14], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[33], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1521,10 +1521,10 @@ namespace mg5amcCpu // *** DIAGRAM 36 OF 1240 *** // Wavefunction(s) for diagram number 36 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[41] ); + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[41] ); // Amplitude(s) for diagram number 36 - FFV1_0( w_fp[41], w_fp[39], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[39], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1534,10 +1534,10 @@ namespace mg5amcCpu // *** DIAGRAM 37 OF 1240 *** // Wavefunction(s) for diagram number 37 - FFV1_2( w_fp[41], w_fp[7], COUPs[1], cIPD[0], cIPD[1], w_fp[42] ); + FFV1_2( w_fp[41], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[42] ); // Amplitude(s) for diagram number 37 - FFV1_0( w_fp[42], w_fp[33], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[42], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1550,7 +1550,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 38 - FFV1_0( w_fp[41], w_fp[33], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[33], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1565,7 +1565,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 39 - FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1580,7 +1580,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 40 - FFV1_0( w_fp[34], w_fp[33], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1595,7 +1595,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 41 - FFV1_0( w_fp[3], w_fp[33], w_fp[25], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[25], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel 
support updates numerators_sv and denominators_sv (#473) #endif @@ -1611,11 +1611,11 @@ namespace mg5amcCpu // *** DIAGRAM 42 OF 1240 *** // Wavefunction(s) for diagram number 42 - FFV1_1( w_fp[2], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[39] ); - FFV1_1( w_fp[39], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[43] ); + FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] ); + FFV1_1( w_fp[39], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[43] ); // Amplitude(s) for diagram number 42 - FFV1_0( w_fp[34], w_fp[43], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[43], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1625,10 +1625,10 @@ namespace mg5amcCpu // *** DIAGRAM 43 OF 1240 *** // Wavefunction(s) for diagram number 43 - FFV1_1( w_fp[39], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[44] ); + FFV1_1( w_fp[39], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[44] ); // Amplitude(s) for diagram number 43 - FFV1_0( w_fp[34], w_fp[44], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[44], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1638,10 +1638,10 @@ namespace mg5amcCpu // *** DIAGRAM 44 OF 1240 *** // Wavefunction(s) for diagram number 44 - FFV1P0_3( w_fp[3], w_fp[39], COUPs[1], 0., 0., w_fp[45] ); + FFV1P0_3( w_fp[3], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[45] ); // Amplitude(s) for diagram number 44 - VVV1_0( w_fp[9], w_fp[45], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[9], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1660,7 +1660,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 45 - FFV1_0( w_fp[3], w_fp[44], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[44], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1675,7 +1675,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 46 - VVV1_0( w_fp[14], w_fp[45], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[14], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1694,7 +1694,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 47 - FFV1_0( w_fp[3], w_fp[43], w_fp[14], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[43], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1709,7 +1709,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 48 - FFV1_0( w_fp[3], w_fp[39], w_fp[18], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1721,7 +1721,7 @@ namespace mg5amcCpu jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], 
w_fp[39], w_fp[19], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1733,7 +1733,7 @@ namespace mg5amcCpu jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[20], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[20], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1749,11 +1749,11 @@ namespace mg5amcCpu // *** DIAGRAM 49 OF 1240 *** // Wavefunction(s) for diagram number 49 - FFV1_2( w_fp[3], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[46] ); - FFV1_1( w_fp[39], w_fp[7], COUPs[1], cIPD[0], cIPD[1], w_fp[47] ); + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[46] ); + FFV1_1( w_fp[39], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] ); // Amplitude(s) for diagram number 49 - FFV1_0( w_fp[46], w_fp[47], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[47], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1763,10 +1763,10 @@ namespace mg5amcCpu // *** DIAGRAM 50 OF 1240 *** // Wavefunction(s) for diagram number 50 - FFV1_2( w_fp[46], w_fp[7], COUPs[1], cIPD[0], cIPD[1], w_fp[48] ); + FFV1_2( w_fp[46], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] ); // Amplitude(s) for diagram number 50 - FFV1_0( w_fp[48], w_fp[39], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[48], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1779,7 +1779,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 51 - FFV1_0( w_fp[46], w_fp[39], w_fp[14], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[39], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1794,7 +1794,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 52 - FFV1_0( w_fp[41], w_fp[47], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1807,7 +1807,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 53 - FFV1_0( w_fp[42], w_fp[39], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[42], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1820,7 +1820,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 54 - FFV1_0( w_fp[41], w_fp[39], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[39], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1835,7 +1835,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 55 - FFV1_0( 
w_fp[3], w_fp[47], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1850,7 +1850,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 56 - FFV1_0( w_fp[34], w_fp[39], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1865,7 +1865,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 57 - FFV1_0( w_fp[3], w_fp[39], w_fp[28], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[28], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1881,11 +1881,11 @@ namespace mg5amcCpu // *** DIAGRAM 58 OF 1240 *** // Wavefunction(s) for diagram number 58 - FFV1_1( w_fp[2], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[47] ); - FFV1_1( w_fp[47], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[49] ); + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] ); + FFV1_1( w_fp[47], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[49] ); // Amplitude(s) for diagram number 58 - FFV1_0( w_fp[34], w_fp[49], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[49], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1895,10 +1895,10 @@ namespace mg5amcCpu // *** DIAGRAM 59 OF 1240 *** // Wavefunction(s) for diagram number 59 - FFV1_1( w_fp[47], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[50] ); + FFV1_1( w_fp[47], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[50] ); // Amplitude(s) for diagram number 59 - FFV1_0( w_fp[34], w_fp[50], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[50], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1908,10 +1908,10 @@ namespace mg5amcCpu // *** DIAGRAM 60 OF 1240 *** // Wavefunction(s) for diagram number 60 - FFV1P0_3( w_fp[3], w_fp[47], COUPs[1], 0., 0., w_fp[51] ); + FFV1P0_3( w_fp[3], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[51] ); // Amplitude(s) for diagram number 60 - VVV1_0( w_fp[9], w_fp[51], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[9], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1930,7 +1930,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 61 - FFV1_0( w_fp[3], w_fp[50], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[50], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1945,7 +1945,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 62 - VVV1_0( w_fp[12], w_fp[51], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[12], w_fp[51], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv 
(#473) #endif @@ -1964,7 +1964,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 63 - FFV1_0( w_fp[3], w_fp[49], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[49], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1979,7 +1979,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 64 - FFV1_0( w_fp[3], w_fp[47], w_fp[15], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[15], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -1991,7 +1991,7 @@ namespace mg5amcCpu jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2003,7 +2003,7 @@ namespace mg5amcCpu jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[17], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[17], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2019,10 +2019,10 @@ namespace mg5amcCpu // *** DIAGRAM 65 OF 1240 *** // Wavefunction(s) for diagram number 65 - FFV1_1( w_fp[47], w_fp[7], COUPs[1], cIPD[0], cIPD[1], w_fp[52] ); + FFV1_1( w_fp[47], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] ); // Amplitude(s) for diagram number 65 - FFV1_0( w_fp[46], w_fp[52], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2035,7 +2035,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 66 - FFV1_0( w_fp[48], w_fp[47], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[48], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2048,7 +2048,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 67 - FFV1_0( w_fp[46], w_fp[47], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[47], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2063,7 +2063,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 68 - FFV1_0( w_fp[38], w_fp[52], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2076,7 +2076,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 69 - FFV1_0( w_fp[40], w_fp[47], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[40], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2089,7 +2089,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 70 - FFV1_0( w_fp[38], w_fp[47], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[47], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2104,7 +2104,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 71 - FFV1_0( w_fp[3], w_fp[52], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[52], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2119,7 +2119,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 72 - FFV1_0( w_fp[34], w_fp[47], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2134,7 +2134,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 73 - FFV1_0( w_fp[3], w_fp[47], w_fp[26], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[26], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2150,11 +2150,11 @@ namespace mg5amcCpu // *** DIAGRAM 74 OF 1240 *** // Wavefunction(s) for diagram number 74 - FFV1_1( w_fp[2], w_fp[7], COUPs[1], cIPD[0], cIPD[1], w_fp[52] ); - FFV1_2( w_fp[46], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[7] ); + FFV1_1( w_fp[2], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] ); + FFV1_2( w_fp[46], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); // Amplitude(s) for diagram number 74 - FFV1_0( w_fp[7], w_fp[52], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[7], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2164,10 +2164,10 @@ namespace mg5amcCpu // *** DIAGRAM 75 OF 1240 *** // Wavefunction(s) for diagram number 75 - FFV1_2( w_fp[46], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[53] ); + FFV1_2( w_fp[46], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[53] ); // Amplitude(s) for diagram number 75 - FFV1_0( w_fp[53], w_fp[52], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[53], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2177,10 +2177,10 @@ namespace mg5amcCpu // *** DIAGRAM 76 OF 1240 *** // Wavefunction(s) for diagram number 76 - FFV1P0_3( w_fp[46], w_fp[2], COUPs[1], 0., 0., w_fp[54] ); + FFV1P0_3( w_fp[46], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[54] ); // Amplitude(s) for diagram number 76 - VVV1_0( w_fp[12], w_fp[54], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[12], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2199,7 +2199,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 77 - FFV1_0( 
w_fp[53], w_fp[2], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[53], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2214,7 +2214,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 78 - VVV1_0( w_fp[14], w_fp[54], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[14], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2233,7 +2233,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 79 - FFV1_0( w_fp[7], w_fp[2], w_fp[14], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[7], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2248,7 +2248,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 80 - FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2260,7 +2260,7 @@ namespace mg5amcCpu jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2272,7 +2272,7 @@ namespace mg5amcCpu jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2291,7 +2291,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 81 - FFV1_0( w_fp[46], w_fp[52], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[52], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2306,7 +2306,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 82 - FFV1_0( w_fp[48], w_fp[2], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[48], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2321,7 +2321,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 83 - FFV1_0( w_fp[46], w_fp[2], w_fp[25], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[25], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2337,10 +2337,10 @@ namespace mg5amcCpu // *** DIAGRAM 84 OF 1240 *** // Wavefunction(s) for diagram number 84 - FFV1_2( w_fp[38], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[25] ); + FFV1_2( 
w_fp[38], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[25] ); // Amplitude(s) for diagram number 84 - FFV1_0( w_fp[25], w_fp[52], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[25], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2350,10 +2350,10 @@ namespace mg5amcCpu // *** DIAGRAM 85 OF 1240 *** // Wavefunction(s) for diagram number 85 - FFV1_2( w_fp[38], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[48] ); + FFV1_2( w_fp[38], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] ); // Amplitude(s) for diagram number 85 - FFV1_0( w_fp[48], w_fp[52], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[48], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2363,10 +2363,10 @@ namespace mg5amcCpu // *** DIAGRAM 86 OF 1240 *** // Wavefunction(s) for diagram number 86 - FFV1P0_3( w_fp[38], w_fp[2], COUPs[1], 0., 0., w_fp[23] ); + FFV1P0_3( w_fp[38], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[23] ); // Amplitude(s) for diagram number 86 - VVV1_0( w_fp[9], w_fp[23], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[9], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2385,7 +2385,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 87 - FFV1_0( w_fp[48], w_fp[2], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[48], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2400,7 +2400,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 88 - VVV1_0( w_fp[14], w_fp[23], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[14], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2419,7 +2419,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 89 - FFV1_0( w_fp[25], w_fp[2], w_fp[14], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[25], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2434,7 +2434,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 90 - FFV1_0( w_fp[38], w_fp[2], w_fp[18], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2446,7 +2446,7 @@ namespace mg5amcCpu jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[19], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2458,7 +2458,7 @@ namespace mg5amcCpu jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; 
jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[20], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2477,7 +2477,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 91 - FFV1_0( w_fp[38], w_fp[52], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[52], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2492,7 +2492,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 92 - FFV1_0( w_fp[40], w_fp[2], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[40], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2507,7 +2507,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 93 - FFV1_0( w_fp[38], w_fp[2], w_fp[28], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[28], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2523,10 +2523,10 @@ namespace mg5amcCpu // *** DIAGRAM 94 OF 1240 *** // Wavefunction(s) for diagram number 94 - FFV1_2( w_fp[41], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[28] ); + FFV1_2( w_fp[41], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[28] ); // Amplitude(s) for diagram number 94 - FFV1_0( w_fp[28], w_fp[52], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[28], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2536,10 +2536,10 @@ namespace mg5amcCpu // *** DIAGRAM 95 OF 1240 *** // Wavefunction(s) for diagram number 95 - FFV1_2( w_fp[41], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[40] ); + FFV1_2( w_fp[41], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] ); // Amplitude(s) for diagram number 95 - FFV1_0( w_fp[40], w_fp[52], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[40], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2549,10 +2549,10 @@ namespace mg5amcCpu // *** DIAGRAM 96 OF 1240 *** // Wavefunction(s) for diagram number 96 - FFV1P0_3( w_fp[41], w_fp[2], COUPs[1], 0., 0., w_fp[20] ); + FFV1P0_3( w_fp[41], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[20] ); // Amplitude(s) for diagram number 96 - VVV1_0( w_fp[9], w_fp[20], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[9], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2571,7 +2571,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 97 - FFV1_0( w_fp[40], w_fp[2], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[40], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2586,7 +2586,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for 
diagram number 98 - VVV1_0( w_fp[12], w_fp[20], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[12], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2605,7 +2605,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 99 - FFV1_0( w_fp[28], w_fp[2], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[28], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2620,7 +2620,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 100 - FFV1_0( w_fp[41], w_fp[2], w_fp[15], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[15], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2632,7 +2632,7 @@ namespace mg5amcCpu jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2644,7 +2644,7 @@ namespace mg5amcCpu jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[17], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2663,7 +2663,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 101 - FFV1_0( w_fp[41], w_fp[52], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[52], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2678,7 +2678,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 102 - FFV1_0( w_fp[42], w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[42], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2693,7 +2693,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 103 - FFV1_0( w_fp[41], w_fp[2], w_fp[26], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[26], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2709,10 +2709,10 @@ namespace mg5amcCpu // *** DIAGRAM 104 OF 1240 *** // Wavefunction(s) for diagram number 104 - FFV1_2( w_fp[3], w_fp[24], COUPs[1], cIPD[0], cIPD[1], w_fp[26] ); + FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[26] ); // Amplitude(s) for diagram number 104 - FFV1_0( w_fp[26], w_fp[52], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[26], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) #endif @@ -2724,10 +2724,10 @@ namespace mg5amcCpu // *** DIAGRAM 105 OF 1240 *** // Wavefunction(s) for diagram number 105 - VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 0., 0., w_fp[42] ); + VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[42] ); // Amplitude(s) for diagram number 105 - FFV1_0( w_fp[3], w_fp[52], w_fp[42], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[52], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2743,10 +2743,10 @@ namespace mg5amcCpu // *** DIAGRAM 106 OF 1240 *** // Wavefunction(s) for diagram number 106 - FFV1_1( w_fp[2], w_fp[24], COUPs[1], cIPD[0], cIPD[1], w_fp[17] ); + FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); // Amplitude(s) for diagram number 106 - FFV1_0( w_fp[34], w_fp[17], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2761,7 +2761,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 107 - FFV1_0( w_fp[34], w_fp[2], w_fp[42], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2780,7 +2780,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 108 - FFV1_0( w_fp[3], w_fp[17], w_fp[14], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[17], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2799,7 +2799,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 109 - FFV1_0( w_fp[26], w_fp[2], w_fp[14], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[26], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2815,10 +2815,10 @@ namespace mg5amcCpu // *** DIAGRAM 110 OF 1240 *** // Wavefunction(s) for diagram number 110 - FFV1_2( w_fp[3], w_fp[27], COUPs[1], cIPD[0], cIPD[1], w_fp[14] ); + FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); // Amplitude(s) for diagram number 110 - FFV1_0( w_fp[14], w_fp[52], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[14], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2830,10 +2830,10 @@ namespace mg5amcCpu // *** DIAGRAM 111 OF 1240 *** // Wavefunction(s) for diagram number 111 - VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 0., 0., w_fp[16] ); + VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] ); // Amplitude(s) for diagram number 111 - FFV1_0( w_fp[3], w_fp[52], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[52], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2849,10 +2849,10 @@ namespace mg5amcCpu // *** DIAGRAM 112 OF 1240 *** // Wavefunction(s) for diagram number 112 - FFV1_1( w_fp[2], 
w_fp[27], COUPs[1], cIPD[0], cIPD[1], w_fp[15] ); + FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); // Amplitude(s) for diagram number 112 - FFV1_0( w_fp[34], w_fp[15], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[15], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2867,7 +2867,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 113 - FFV1_0( w_fp[34], w_fp[2], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2886,7 +2886,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 114 - FFV1_0( w_fp[3], w_fp[15], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[15], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2905,7 +2905,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 115 - FFV1_0( w_fp[14], w_fp[2], w_fp[12], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[14], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2921,10 +2921,10 @@ namespace mg5amcCpu // *** DIAGRAM 116 OF 1240 *** // Wavefunction(s) for diagram number 116 - FFV1_2( w_fp[3], w_fp[29], COUPs[1], cIPD[0], cIPD[1], w_fp[12] ); + FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); // Amplitude(s) for diagram number 116 - FFV1_0( w_fp[12], w_fp[52], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2936,10 +2936,10 @@ namespace mg5amcCpu // *** DIAGRAM 117 OF 1240 *** // Wavefunction(s) for diagram number 117 - VVV1P0_1( w_fp[4], w_fp[29], COUPs[0], 0., 0., w_fp[19] ); + VVV1P0_1( w_fp[4], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[19] ); // Amplitude(s) for diagram number 117 - FFV1_0( w_fp[3], w_fp[52], w_fp[19], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[52], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2955,10 +2955,10 @@ namespace mg5amcCpu // *** DIAGRAM 118 OF 1240 *** // Wavefunction(s) for diagram number 118 - FFV1_1( w_fp[2], w_fp[29], COUPs[1], cIPD[0], cIPD[1], w_fp[18] ); + FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[18] ); // Amplitude(s) for diagram number 118 - FFV1_0( w_fp[34], w_fp[18], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[18], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -2973,7 +2973,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 119 - FFV1_0( w_fp[34], w_fp[2], w_fp[19], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel 
support updates numerators_sv and denominators_sv (#473) #endif @@ -2992,7 +2992,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 120 - FFV1_0( w_fp[3], w_fp[18], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[18], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3011,7 +3011,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 121 - FFV1_0( w_fp[12], w_fp[2], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3030,7 +3030,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 122 - FFV1_0( w_fp[3], w_fp[52], w_fp[30], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[52], w_fp[30], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3042,7 +3042,7 @@ namespace mg5amcCpu jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[52], w_fp[31], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[52], w_fp[31], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3054,7 +3054,7 @@ namespace mg5amcCpu jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[52], w_fp[32], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[52], w_fp[32], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3073,7 +3073,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 123 - FFV1_0( w_fp[34], w_fp[2], w_fp[30], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[2], w_fp[30], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3085,7 +3085,7 @@ namespace mg5amcCpu jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[34], w_fp[2], w_fp[31], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[2], w_fp[31], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3097,7 +3097,7 @@ namespace mg5amcCpu jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[34], w_fp[2], w_fp[32], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[2], w_fp[32], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3113,13 +3113,13 @@ namespace mg5amcCpu // *** DIAGRAM 124 OF 1240 *** // Wavefunction(s) for diagram number 124 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[34] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], cIPD[0], 
cIPD[1], w_fp[52] ); - FFV1_1( w_fp[34], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[9] ); - FFV1_2( w_fp[52], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[22] ); + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] ); + FFV1_1( w_fp[34], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + FFV1_2( w_fp[52], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] ); // Amplitude(s) for diagram number 124 - FFV1_0( w_fp[22], w_fp[9], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[22], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3128,10 +3128,10 @@ namespace mg5amcCpu // *** DIAGRAM 125 OF 1240 *** // Wavefunction(s) for diagram number 125 - FFV1_2( w_fp[52], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[21] ); + FFV1_2( w_fp[52], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); // Amplitude(s) for diagram number 125 - FFV1_0( w_fp[21], w_fp[9], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3140,11 +3140,11 @@ namespace mg5amcCpu // *** DIAGRAM 126 OF 1240 *** // Wavefunction(s) for diagram number 126 - FFV1_1( w_fp[34], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[55] ); - FFV1_2( w_fp[52], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[56] ); + FFV1_1( w_fp[34], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[55] ); + FFV1_2( w_fp[52], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[56] ); // Amplitude(s) for diagram number 126 - FFV1_0( w_fp[56], w_fp[55], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[56], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3156,7 +3156,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 127 - FFV1_0( w_fp[21], w_fp[55], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3165,10 +3165,10 @@ namespace mg5amcCpu // *** DIAGRAM 128 OF 1240 *** // Wavefunction(s) for diagram number 128 - FFV1_1( w_fp[34], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[57] ); + FFV1_1( w_fp[34], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[57] ); // Amplitude(s) for diagram number 128 - FFV1_0( w_fp[56], w_fp[57], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[56], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3180,7 +3180,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 129 - FFV1_0( w_fp[22], w_fp[57], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[22], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3189,10 +3189,10 @@ namespace mg5amcCpu // *** DIAGRAM 130 OF 1240 *** // Wavefunction(s) for diagram number 130 - FFV1P0_3( w_fp[52], w_fp[34], COUPs[1], 0., 0., w_fp[58] ); + FFV1P0_3( 
w_fp[52], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[58] ); // Amplitude(s) for diagram number 130 - VVV1_0( w_fp[24], w_fp[6], w_fp[58], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[24], w_fp[6], w_fp[58], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3204,10 +3204,10 @@ namespace mg5amcCpu // *** DIAGRAM 131 OF 1240 *** // Wavefunction(s) for diagram number 131 - FFV1_1( w_fp[34], w_fp[24], COUPs[1], cIPD[0], cIPD[1], w_fp[59] ); + FFV1_1( w_fp[34], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] ); // Amplitude(s) for diagram number 131 - FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3220,7 +3220,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 132 - FFV1_0( w_fp[52], w_fp[57], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[57], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3233,7 +3233,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 133 - VVV1_0( w_fp[27], w_fp[5], w_fp[58], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[27], w_fp[5], w_fp[58], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3245,10 +3245,10 @@ namespace mg5amcCpu // *** DIAGRAM 134 OF 1240 *** // Wavefunction(s) for diagram number 134 - FFV1_1( w_fp[34], w_fp[27], COUPs[1], cIPD[0], cIPD[1], w_fp[60] ); + FFV1_1( w_fp[34], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); // Amplitude(s) for diagram number 134 - FFV1_0( w_fp[52], w_fp[60], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[60], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3261,7 +3261,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 135 - FFV1_0( w_fp[52], w_fp[55], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[55], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3274,7 +3274,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 136 - VVV1_0( w_fp[4], w_fp[29], w_fp[58], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[4], w_fp[29], w_fp[58], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3289,7 +3289,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 137 - FFV1_0( w_fp[52], w_fp[9], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[9], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3299,10 +3299,10 @@ namespace mg5amcCpu // *** DIAGRAM 138 OF 1240 *** // Wavefunction(s) for diagram number 138 - FFV1_1( w_fp[34], w_fp[29], COUPs[1], cIPD[0], cIPD[1], w_fp[58] ); + 
FFV1_1( w_fp[34], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] ); // Amplitude(s) for diagram number 138 - FFV1_0( w_fp[52], w_fp[58], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[58], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3315,7 +3315,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 139 - FFV1_0( w_fp[52], w_fp[34], w_fp[30], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[34], w_fp[30], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3323,7 +3323,7 @@ namespace mg5amcCpu jamp_sv[11] -= amp_sv[0]; jamp_sv[17] -= amp_sv[0]; jamp_sv[23] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[34], w_fp[31], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[34], w_fp[31], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3331,7 +3331,7 @@ namespace mg5amcCpu jamp_sv[15] += amp_sv[0]; jamp_sv[17] -= amp_sv[0]; jamp_sv[21] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[34], w_fp[32], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[34], w_fp[32], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3343,12 +3343,12 @@ namespace mg5amcCpu // *** DIAGRAM 140 OF 1240 *** // Wavefunction(s) for diagram number 140 - VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 0., 0., w_fp[61] ); - FFV1P0_3( w_fp[3], w_fp[34], COUPs[1], 0., 0., w_fp[62] ); - VVV1P0_1( w_fp[61], w_fp[5], COUPs[0], 0., 0., w_fp[63] ); + VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[61] ); + FFV1P0_3( w_fp[3], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[62] ); + VVV1P0_1( w_fp[61], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[63] ); // Amplitude(s) for diagram number 140 - VVV1_0( w_fp[62], w_fp[63], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[62], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3364,10 +3364,10 @@ namespace mg5amcCpu // *** DIAGRAM 141 OF 1240 *** // Wavefunction(s) for diagram number 141 - VVV1P0_1( w_fp[61], w_fp[6], COUPs[0], 0., 0., w_fp[64] ); + VVV1P0_1( w_fp[61], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[64] ); // Amplitude(s) for diagram number 141 - VVV1_0( w_fp[62], w_fp[64], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[62], w_fp[64], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3386,7 +3386,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 142 - VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3398,7 +3398,7 @@ namespace mg5amcCpu jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], &_fp[0] ); + 
VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3410,7 +3410,7 @@ namespace mg5amcCpu jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3426,10 +3426,10 @@ namespace mg5amcCpu // *** DIAGRAM 143 OF 1240 *** // Wavefunction(s) for diagram number 143 - FFV1_2( w_fp[3], w_fp[61], COUPs[1], cIPD[0], cIPD[1], w_fp[65] ); + FFV1_2( w_fp[3], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[65] ); // Amplitude(s) for diagram number 143 - FFV1_0( w_fp[65], w_fp[55], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3442,7 +3442,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 144 - FFV1_0( w_fp[3], w_fp[55], w_fp[64], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[55], w_fp[64], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3457,7 +3457,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 145 - FFV1_0( w_fp[65], w_fp[57], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3470,7 +3470,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 146 - FFV1_0( w_fp[3], w_fp[57], w_fp[63], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[57], w_fp[63], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3482,10 +3482,10 @@ namespace mg5amcCpu // *** DIAGRAM 147 OF 1240 *** // Wavefunction(s) for diagram number 147 - FFV1_1( w_fp[34], w_fp[61], COUPs[1], cIPD[0], cIPD[1], w_fp[66] ); + FFV1_1( w_fp[34], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] ); // Amplitude(s) for diagram number 147 - FFV1_0( w_fp[38], w_fp[66], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[66], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3495,10 +3495,10 @@ namespace mg5amcCpu // *** DIAGRAM 148 OF 1240 *** // Wavefunction(s) for diagram number 148 - FFV1P0_3( w_fp[38], w_fp[34], COUPs[1], 0., 0., w_fp[67] ); + FFV1P0_3( w_fp[38], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[67] ); // Amplitude(s) for diagram number 148 - VVV1_0( w_fp[61], w_fp[6], w_fp[67], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[6], w_fp[67], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3513,7 +3513,7 @@ namespace mg5amcCpu // 
(none) // Amplitude(s) for diagram number 149 - FFV1_0( w_fp[38], w_fp[57], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[57], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3526,7 +3526,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 150 - FFV1_0( w_fp[41], w_fp[66], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[66], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3536,10 +3536,10 @@ namespace mg5amcCpu // *** DIAGRAM 151 OF 1240 *** // Wavefunction(s) for diagram number 151 - FFV1P0_3( w_fp[41], w_fp[34], COUPs[1], 0., 0., w_fp[68] ); + FFV1P0_3( w_fp[41], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[68] ); // Amplitude(s) for diagram number 151 - VVV1_0( w_fp[61], w_fp[5], w_fp[68], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[5], w_fp[68], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3554,7 +3554,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 152 - FFV1_0( w_fp[41], w_fp[55], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[55], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3567,7 +3567,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 153 - FFV1_0( w_fp[3], w_fp[66], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[66], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3582,7 +3582,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 154 - VVV1_0( w_fp[61], w_fp[29], w_fp[62], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[29], w_fp[62], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3601,7 +3601,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 155 - FFV1_0( w_fp[3], w_fp[58], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[58], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3613,11 +3613,11 @@ namespace mg5amcCpu // *** DIAGRAM 156 OF 1240 *** // Wavefunction(s) for diagram number 156 - VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 0., 0., w_fp[66] ); - VVV1P0_1( w_fp[66], w_fp[4], COUPs[0], 0., 0., w_fp[69] ); + VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[66] ); + VVV1P0_1( w_fp[66], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[69] ); // Amplitude(s) for diagram number 156 - VVV1_0( w_fp[62], w_fp[69], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[62], w_fp[69], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -3633,10 +3633,10 @@ namespace mg5amcCpu // *** DIAGRAM 157 OF 1240 *** // Wavefunction(s) for diagram number 157 - VVV1P0_1( w_fp[66], 
w_fp[6], COUPs[0], 0., 0., w_fp[70] );
+    VVV1P0_1( w_fp[66], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[70] );
 
     // Amplitude(s) for diagram number 157
-    VVV1_0( w_fp[62], w_fp[70], w_fp[4], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[62], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3655,7 +3655,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 158
-    VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], &amp_fp[0] );
+    VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3667,7 +3667,7 @@ namespace mg5amcCpu
     jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], &amp_fp[0] );
+    VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3679,7 +3679,7 @@ namespace mg5amcCpu
     jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], &amp_fp[0] );
+    VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3695,10 +3695,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 159 OF 1240 ***
 
     // Wavefunction(s) for diagram number 159
-    FFV1_2( w_fp[3], w_fp[66], COUPs[1], cIPD[0], cIPD[1], w_fp[71] );
+    FFV1_2( w_fp[3], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] );
 
     // Amplitude(s) for diagram number 159
-    FFV1_0( w_fp[71], w_fp[9], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[71], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3711,7 +3711,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 160
-    FFV1_0( w_fp[3], w_fp[9], w_fp[70], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[9], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3726,7 +3726,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 161
-    FFV1_0( w_fp[71], w_fp[57], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[71], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3739,7 +3739,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 162
-    FFV1_0( w_fp[3], w_fp[57], w_fp[69], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[57], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3751,10 +3751,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 163 OF 1240 ***
 
     // Wavefunction(s) for diagram number 163
-    FFV1_1( w_fp[34], w_fp[66], COUPs[1], cIPD[0], cIPD[1], w_fp[72] );
+    FFV1_1( w_fp[34], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] );
 
     // Amplitude(s) for diagram number 163
-    FFV1_0( w_fp[46], w_fp[72], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[46], w_fp[72], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3764,10 +3764,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 164 OF 1240 ***
 
     // Wavefunction(s) for diagram number 164
-    FFV1P0_3( w_fp[46], w_fp[34], COUPs[1], 0., 0., w_fp[73] );
+    FFV1P0_3( w_fp[46], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[73] );
 
     // Amplitude(s) for diagram number 164
-    VVV1_0( w_fp[66], w_fp[6], w_fp[73], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[66], w_fp[6], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3782,7 +3782,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 165
-    FFV1_0( w_fp[46], w_fp[57], w_fp[66], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[46], w_fp[57], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3795,7 +3795,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 166
-    FFV1_0( w_fp[41], w_fp[72], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[41], w_fp[72], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3808,7 +3808,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 167
-    VVV1_0( w_fp[66], w_fp[4], w_fp[68], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[66], w_fp[4], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3823,7 +3823,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 168
-    FFV1_0( w_fp[41], w_fp[9], w_fp[66], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[41], w_fp[9], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3836,7 +3836,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 169
-    FFV1_0( w_fp[3], w_fp[72], w_fp[27], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[72], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3851,7 +3851,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 170
-    VVV1_0( w_fp[66], w_fp[27], w_fp[62], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[66], w_fp[27], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3870,7 +3870,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 171
-    FFV1_0( w_fp[3], w_fp[60], w_fp[66], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[60], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3882,11 +3882,11 @@ namespace mg5amcCpu
     // *** DIAGRAM 172 OF 1240 ***
 
     // Wavefunction(s) for diagram number 172
-    VVV1P0_1( w_fp[1], w_fp[6], COUPs[0], 0., 0., w_fp[72] );
-    VVV1P0_1( w_fp[72], w_fp[4], COUPs[0], 0., 0., w_fp[74] );
+    VVV1P0_1( w_fp[1], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[72] );
+    VVV1P0_1( w_fp[72], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[74] );
 
     // Amplitude(s) for diagram number 172
-    VVV1_0( w_fp[62], w_fp[74], w_fp[5], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[62], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3902,10 +3902,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 173 OF 1240 ***
 
     // Wavefunction(s) for diagram number 173
-    VVV1P0_1( w_fp[72], w_fp[5], COUPs[0], 0., 0., w_fp[75] );
+    VVV1P0_1( w_fp[72], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[75] );
 
     // Amplitude(s) for diagram number 173
-    VVV1_0( w_fp[62], w_fp[75], w_fp[4], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[62], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3924,7 +3924,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 174
-    VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], &amp_fp[0] );
+    VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3936,7 +3936,7 @@ namespace mg5amcCpu
     jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], &amp_fp[0] );
+    VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3948,7 +3948,7 @@ namespace mg5amcCpu
     jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], &amp_fp[0] );
+    VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3964,10 +3964,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 175 OF 1240 ***
 
     // Wavefunction(s) for diagram number 175
-    FFV1_2( w_fp[3], w_fp[72], COUPs[1], cIPD[0], cIPD[1], w_fp[76] );
+    FFV1_2( w_fp[3], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[76] );
 
     // Amplitude(s) for diagram number 175
-    FFV1_0( w_fp[76], w_fp[9], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[76], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3980,7 +3980,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 176
-    FFV1_0( w_fp[3], w_fp[9], w_fp[75], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[9], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -3995,7 +3995,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 177
-    FFV1_0( w_fp[76], w_fp[55], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[76], w_fp[55], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4008,7 +4008,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 178
-    FFV1_0( w_fp[3], w_fp[55], w_fp[74], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[55], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4020,10 +4020,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 179 OF 1240 ***
 
     // Wavefunction(s) for diagram number 179
-    FFV1_1( w_fp[34], w_fp[72], COUPs[1], cIPD[0], cIPD[1], w_fp[77] );
+    FFV1_1( w_fp[34], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
 
     // Amplitude(s) for diagram number 179
-    FFV1_0( w_fp[46], w_fp[77], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[46], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4036,7 +4036,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 180
-    VVV1_0( w_fp[72], w_fp[5], w_fp[73], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[72], w_fp[5], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4051,7 +4051,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 181
-    FFV1_0( w_fp[46], w_fp[55], w_fp[72], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[46], w_fp[55], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4064,7 +4064,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 182
-    FFV1_0( w_fp[38], w_fp[77], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[38], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4077,7 +4077,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 183
-    VVV1_0( w_fp[72], w_fp[4], w_fp[67], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[72], w_fp[4], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4092,7 +4092,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 184
-    FFV1_0( w_fp[38], w_fp[9], w_fp[72], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[38], w_fp[9], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4105,7 +4105,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 185
-    FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4120,7 +4120,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 186
-    VVV1_0( w_fp[72], w_fp[24], w_fp[62], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[72], w_fp[24], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4139,7 +4139,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 187
-    FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4151,10 +4151,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 188 OF 1240 ***
 
     // Wavefunction(s) for diagram number 188
-    FFV1_1( w_fp[34], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[77] );
+    FFV1_1( w_fp[34], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
 
     // Amplitude(s) for diagram number 188
-    FFV1_0( w_fp[7], w_fp[77], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[7], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4166,7 +4166,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 189
-    FFV1_0( w_fp[53], w_fp[77], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[53], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4175,10 +4175,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 190 OF 1240 ***
 
     // Wavefunction(s) for diagram number 190
-    FFV1_2( w_fp[46], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[78] );
+    FFV1_2( w_fp[46], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[78] );
 
     // Amplitude(s) for diagram number 190
-    FFV1_0( w_fp[78], w_fp[55], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[78], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4190,7 +4190,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 191
-    FFV1_0( w_fp[53], w_fp[55], w_fp[1], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[53], w_fp[55], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4202,7 +4202,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 192
-    FFV1_0( w_fp[78], w_fp[57], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[78], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4214,7 +4214,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 193
-    FFV1_0( w_fp[7], w_fp[57], w_fp[1], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[7], w_fp[57], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4226,7 +4226,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 194
-    FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4239,7 +4239,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 195
-    VVV1_0( w_fp[1], w_fp[29], w_fp[73], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[1], w_fp[29], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4254,7 +4254,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 196
-    FFV1_0( w_fp[46], w_fp[58], w_fp[1], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[46], w_fp[58], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4267,7 +4267,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 197
-    FFV1_0( w_fp[25], w_fp[77], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[25], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4279,7 +4279,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 198
-    FFV1_0( w_fp[48], w_fp[77], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[48], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4288,10 +4288,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 199 OF 1240 ***
 
     // Wavefunction(s) for diagram number 199
-    FFV1_2( w_fp[38], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[58] );
+    FFV1_2( w_fp[38], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] );
 
     // Amplitude(s) for diagram number 199
-    FFV1_0( w_fp[58], w_fp[9], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[58], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4303,7 +4303,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 200
-    FFV1_0( w_fp[48], w_fp[9], w_fp[1], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[48], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4315,7 +4315,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 201
-    FFV1_0( w_fp[58], w_fp[57], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[58], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4327,7 +4327,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 202
-    FFV1_0( w_fp[25], w_fp[57], w_fp[1], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[25], w_fp[57], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4339,7 +4339,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 203
-    FFV1_0( w_fp[38], w_fp[77], w_fp[27], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[38], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4352,7 +4352,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 204
-    VVV1_0( w_fp[1], w_fp[27], w_fp[67], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[1], w_fp[27], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4367,7 +4367,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 205
-    FFV1_0( w_fp[38], w_fp[60], w_fp[1], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[38], w_fp[60], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4380,7 +4380,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 206
-    FFV1_0( w_fp[28], w_fp[77], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[28], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4392,7 +4392,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 207
-    FFV1_0( w_fp[40], w_fp[77], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[40], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4401,10 +4401,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 208 OF 1240 ***
 
     // Wavefunction(s) for diagram number 208
-    FFV1_2( w_fp[41], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[60] );
+    FFV1_2( w_fp[41], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] );
 
     // Amplitude(s) for diagram number 208
-    FFV1_0( w_fp[60], w_fp[9], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[60], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4416,7 +4416,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 209
-    FFV1_0( w_fp[40], w_fp[9], w_fp[1], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[40], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4428,7 +4428,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 210
-    FFV1_0( w_fp[60], w_fp[55], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[60], w_fp[55], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4440,7 +4440,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 211
-    FFV1_0( w_fp[28], w_fp[55], w_fp[1], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[28], w_fp[55], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4452,7 +4452,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 212
-    FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4465,7 +4465,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 213
-    VVV1_0( w_fp[1], w_fp[24], w_fp[68], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[1], w_fp[24], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4480,7 +4480,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 214
-    FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4493,7 +4493,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 215
-    FFV1_0( w_fp[26], w_fp[77], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[26], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4506,7 +4506,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 216
-    FFV1_0( w_fp[3], w_fp[77], w_fp[42], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4518,10 +4518,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 217 OF 1240 ***
 
     // Wavefunction(s) for diagram number 217
-    VVV1P0_1( w_fp[1], w_fp[24], COUPs[0], 0., 0., w_fp[59] );
+    VVV1P0_1( w_fp[1], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[59] );
 
     // Amplitude(s) for diagram number 217
-    VVV1_0( w_fp[62], w_fp[59], w_fp[6], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[62], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4540,7 +4540,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 218
-    VVV1_0( w_fp[62], w_fp[1], w_fp[42], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[62], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4559,7 +4559,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 219
-    VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], &amp_fp[0] );
+    VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4571,7 +4571,7 @@ namespace mg5amcCpu
     jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], &amp_fp[0] );
+    VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4583,7 +4583,7 @@ namespace mg5amcCpu
     jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], &amp_fp[0] );
+    VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4602,7 +4602,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 220
-    FFV1_0( w_fp[3], w_fp[57], w_fp[59], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[57], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4617,7 +4617,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 221
-    FFV1_0( w_fp[26], w_fp[57], w_fp[1], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[26], w_fp[57], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4630,7 +4630,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 222
-    FFV1_0( w_fp[14], w_fp[77], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[14], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4643,7 +4643,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 223
-    FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4655,10 +4655,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 224 OF 1240 ***
 
     // Wavefunction(s) for diagram number 224
-    VVV1P0_1( w_fp[1], w_fp[27], COUPs[0], 0., 0., w_fp[68] );
+    VVV1P0_1( w_fp[1], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[68] );
 
     // Amplitude(s) for diagram number 224
-    VVV1_0( w_fp[62], w_fp[68], w_fp[5], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[62], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4677,7 +4677,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 225
-    VVV1_0( w_fp[62], w_fp[1], w_fp[16], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[62], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4696,7 +4696,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 226
-    VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], &amp_fp[0] );
+    VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4708,7 +4708,7 @@ namespace mg5amcCpu
     jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], &amp_fp[0] );
+    VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4720,7 +4720,7 @@ namespace mg5amcCpu
     jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], &amp_fp[0] );
+    VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4739,7 +4739,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 227
-    FFV1_0( w_fp[3], w_fp[55], w_fp[68], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[55], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4754,7 +4754,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 228
-    FFV1_0( w_fp[14], w_fp[55], w_fp[1], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[14], w_fp[55], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4767,7 +4767,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 229
-    FFV1_0( w_fp[12], w_fp[77], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[12], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4780,7 +4780,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 230
-    FFV1_0( w_fp[3], w_fp[77], w_fp[19], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4792,10 +4792,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 231 OF 1240 ***
 
     // Wavefunction(s) for diagram number 231
-    VVV1P0_1( w_fp[1], w_fp[29], COUPs[0], 0., 0., w_fp[67] );
+    VVV1P0_1( w_fp[1], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[67] );
 
     // Amplitude(s) for diagram number 231
-    VVV1_0( w_fp[62], w_fp[67], w_fp[4], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[62], w_fp[67], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4814,7 +4814,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 232
-    VVV1_0( w_fp[62], w_fp[1], w_fp[19], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[62], w_fp[1], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4833,7 +4833,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 233
-    VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], &amp_fp[0] );
+    VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4845,7 +4845,7 @@ namespace mg5amcCpu
     jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], &amp_fp[0] );
+    VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4857,7 +4857,7 @@ namespace mg5amcCpu
     jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], &amp_fp[0] );
+    VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4876,7 +4876,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 234
-    FFV1_0( w_fp[3], w_fp[9], w_fp[67], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[9], w_fp[67], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4891,7 +4891,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 235
-    FFV1_0( w_fp[12], w_fp[9], w_fp[1], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[12], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4901,12 +4901,12 @@ namespace mg5amcCpu
     // *** DIAGRAM 236 OF 1240 ***
 
     // Wavefunction(s) for diagram number 236
-    VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[73] );
-    VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[79] );
-    VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[80] );
+    VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[73] );
+    VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[79] );
+    VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[80] );
 
     // Amplitude(s) for diagram number 236
-    VVV1_0( w_fp[73], w_fp[6], w_fp[62], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[73], w_fp[6], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4918,7 +4918,7 @@ namespace mg5amcCpu
     jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[79], w_fp[6], w_fp[62], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[79], w_fp[6], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4930,7 +4930,7 @@ namespace mg5amcCpu
     jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[80], w_fp[6], w_fp[62], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[80], w_fp[6], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4949,7 +4949,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 237
-    FFV1_0( w_fp[3], w_fp[57], w_fp[73], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[57], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4957,7 +4957,7 @@ namespace mg5amcCpu
     jamp_sv[19] -= amp_sv[0];
     jamp_sv[21] -= amp_sv[0];
     jamp_sv[23] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[57], w_fp[79], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[57], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4965,7 +4965,7 @@ namespace mg5amcCpu
     jamp_sv[20] += amp_sv[0];
     jamp_sv[21] -= amp_sv[0];
     jamp_sv[22] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[57], w_fp[80], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[57], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4980,7 +4980,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 238
-    FFV1_0( w_fp[41], w_fp[34], w_fp[73], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[41], w_fp[34], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4988,7 +4988,7 @@ namespace mg5amcCpu
     jamp_sv[2] -= amp_sv[0];
     jamp_sv[8] -= amp_sv[0];
     jamp_sv[14] += amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[34], w_fp[79], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[41], w_fp[34], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -4996,7 +4996,7 @@ namespace mg5amcCpu
     jamp_sv[6] += amp_sv[0];
     jamp_sv[8] -= amp_sv[0];
     jamp_sv[12] += amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[34], w_fp[80], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[41], w_fp[34], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5008,12 +5008,12 @@ namespace mg5amcCpu
     // *** DIAGRAM 239 OF 1240 ***
 
     // Wavefunction(s) for diagram number 239
-    VVVV1P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[57] );
-    VVVV3P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[81] );
-    VVVV4P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[82] );
+    VVVV1P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[57] );
+    VVVV3P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[81] );
+    VVVV4P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[82] );
 
     // Amplitude(s) for diagram number 239
-    VVV1_0( w_fp[57], w_fp[5], w_fp[62], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[57], w_fp[5], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5025,7 +5025,7 @@ namespace mg5amcCpu
     jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[81], w_fp[5], w_fp[62], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[81], w_fp[5], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5037,7 +5037,7 @@ namespace mg5amcCpu
     jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[82], w_fp[5], w_fp[62], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[82], w_fp[5], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5056,7 +5056,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 240
-    FFV1_0( w_fp[3], w_fp[55], w_fp[57], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[55], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5064,7 +5064,7 @@ namespace mg5amcCpu
     jamp_sv[13] -= amp_sv[0];
     jamp_sv[15] -= amp_sv[0];
     jamp_sv[17] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[55], w_fp[81], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[55], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5072,7 +5072,7 @@ namespace mg5amcCpu
     jamp_sv[14] += amp_sv[0];
     jamp_sv[15] -= amp_sv[0];
     jamp_sv[16] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[55], w_fp[82], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[55], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5087,7 +5087,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 241
-    FFV1_0( w_fp[38], w_fp[34], w_fp[57], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[38], w_fp[34], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5095,7 +5095,7 @@ namespace mg5amcCpu
     jamp_sv[4] -= amp_sv[0];
     jamp_sv[10] -= amp_sv[0];
     jamp_sv[20] += amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[34], w_fp[81], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[38], w_fp[34], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5103,7 +5103,7 @@ namespace mg5amcCpu
     jamp_sv[7] += amp_sv[0];
     jamp_sv[10] -= amp_sv[0];
     jamp_sv[18] += amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[34], w_fp[82], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[38], w_fp[34], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5115,12 +5115,12 @@ namespace mg5amcCpu
     // *** DIAGRAM 242 OF 1240 ***
 
    // Wavefunction(s) for diagram number 242
-    VVVV1P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[55] );
-    VVVV3P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[83] );
-    VVVV4P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[84] );
+    VVVV1P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[55] );
+    VVVV3P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[83] );
+    VVVV4P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[84] );
 
     // Amplitude(s) for diagram number 242
-    VVV1_0( w_fp[55], w_fp[4], w_fp[62], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[55], w_fp[4], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5132,7 +5132,7 @@ namespace mg5amcCpu
     jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[83], w_fp[4], w_fp[62], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[83], w_fp[4], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5144,7 +5144,7 @@ namespace mg5amcCpu
     jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[84], w_fp[4], w_fp[62], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[84], w_fp[4], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5163,7 +5163,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 243
-    FFV1_0( w_fp[3], w_fp[9], w_fp[55], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[9], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5171,7 +5171,7 @@ namespace mg5amcCpu
     jamp_sv[7] -= amp_sv[0];
     jamp_sv[9] -= amp_sv[0];
     jamp_sv[11] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[9], w_fp[83], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[9], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5179,7 +5179,7 @@ namespace mg5amcCpu
     jamp_sv[8] += amp_sv[0];
     jamp_sv[9] -= amp_sv[0];
     jamp_sv[10] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[9], w_fp[84], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[9], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5194,7 +5194,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 244
-    FFV1_0( w_fp[46], w_fp[34], w_fp[55], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[46], w_fp[34], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5202,7 +5202,7 @@ namespace mg5amcCpu
     jamp_sv[5] -= amp_sv[0];
     jamp_sv[16] -= amp_sv[0];
    jamp_sv[22] += amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[34], w_fp[83], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[46], w_fp[34], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5210,7 +5210,7 @@ namespace mg5amcCpu
     jamp_sv[13] += amp_sv[0];
     jamp_sv[16] -= amp_sv[0];
     jamp_sv[19] += amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[34], w_fp[84], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[46], w_fp[34], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5225,7 +5225,7 @@ namespace mg5amcCpu
     // (none)
 
    // Amplitude(s) for diagram number 245
-    FFV1_0( w_fp[3], w_fp[77], w_fp[30], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5233,7 +5233,7 @@ namespace mg5amcCpu
     jamp_sv[1] -= amp_sv[0];
     jamp_sv[3] -= amp_sv[0];
     jamp_sv[5] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[31], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5241,7 +5241,7 @@ namespace mg5amcCpu
     jamp_sv[2] += amp_sv[0];
     jamp_sv[3] -= amp_sv[0];
     jamp_sv[4] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[32], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5256,7 +5256,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 246
-    VVV1_0( w_fp[1], w_fp[30], w_fp[62], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[1], w_fp[30], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5268,7 +5268,7 @@ namespace mg5amcCpu
     jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[1], w_fp[31], w_fp[62], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[1], w_fp[31], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5280,7 +5280,7 @@ namespace mg5amcCpu
     jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[1], w_fp[32], w_fp[62], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[1], w_fp[32], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5296,13 +5296,13 @@ namespace mg5amcCpu
     // *** DIAGRAM 247 OF 1240 ***
 
     // Wavefunction(s) for diagram number 247
-    FFV1_2( w_fp[3], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[62] );
-    FFV1_1( w_fp[2], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[77] );
-    FFV1_2( w_fp[62], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[34] );
-    FFV1_1( w_fp[77], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[9] );
+    FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
+    FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
+    FFV1_2( w_fp[62], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] );
+    FFV1_1( w_fp[77], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
 
     // Amplitude(s) for diagram number 247
-    FFV1_0( w_fp[34], w_fp[9], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[34], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5311,10 +5311,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 248 OF 1240 ***
 
     // Wavefunction(s) for diagram number 248
-    FFV1_1( w_fp[77], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[85] );
+    FFV1_1( w_fp[77], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[85] );
 
     // Amplitude(s) for diagram number 248
-    FFV1_0( w_fp[34], w_fp[85], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[34], w_fp[85], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5323,11 +5323,11 @@ namespace mg5amcCpu
     // *** DIAGRAM 249 OF 1240 ***
 
     // Wavefunction(s) for diagram number 249
-    FFV1_2( w_fp[62], w_fp[5], COUPs[1], cIPD[0], cIPD[1], w_fp[86] );
-    FFV1_1( w_fp[77], w_fp[4], COUPs[1], cIPD[0], cIPD[1], w_fp[87] );
+    FFV1_2( w_fp[62], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] );
+    FFV1_1( w_fp[77], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[87] );
 
     // Amplitude(s) for diagram number 249
-    FFV1_0( w_fp[86], w_fp[87], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[86], w_fp[87], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5339,7 +5339,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 250
-    FFV1_0( w_fp[86], w_fp[85], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[86], w_fp[85], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5348,10 +5348,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 251 OF 1240 ***
 
     // Wavefunction(s) for diagram number 251
-    FFV1_2( w_fp[62], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[88] );
+    FFV1_2( w_fp[62], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] );
 
     // Amplitude(s) for diagram number 251
-    FFV1_0( w_fp[88], w_fp[87], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[88], w_fp[87], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5363,7 +5363,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 252
-    FFV1_0( w_fp[88], w_fp[9], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[88], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5372,10 +5372,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 253 OF 1240 ***
 
     // Wavefunction(s) for diagram number 253
-    FFV1P0_3( w_fp[62], w_fp[77], COUPs[1], 0., 0., w_fp[89] );
+    FFV1P0_3( w_fp[62], w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[89] );
 
     // Amplitude(s) for diagram number 253
-    VVV1_0( w_fp[24], w_fp[6], w_fp[89], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[24], w_fp[6], w_fp[89], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5387,10 +5387,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 254 OF 1240 ***
 
     // Wavefunction(s) for diagram number 254
-    FFV1_2( w_fp[62], w_fp[24], COUPs[1], cIPD[0], cIPD[1], w_fp[90] );
+    FFV1_2( w_fp[62], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] );
 
     // Amplitude(s) for diagram number 254
-    FFV1_0( w_fp[90], w_fp[77], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[90], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5403,7 +5403,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 255
-    FFV1_0( w_fp[88], w_fp[77], w_fp[24], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[88], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5416,7 +5416,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 256
-    VVV1_0( w_fp[27], w_fp[5], w_fp[89], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[27], w_fp[5], w_fp[89], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5428,10 +5428,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 257 OF 1240 ***
 
     // Wavefunction(s) for diagram number 257
-    FFV1_2( w_fp[62], w_fp[27], COUPs[1], cIPD[0], cIPD[1], w_fp[91] );
+    FFV1_2( w_fp[62], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] );
 
     // Amplitude(s) for diagram number 257
-    FFV1_0( w_fp[91], w_fp[77], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[91], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5444,7 +5444,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 258
-    FFV1_0( w_fp[86], w_fp[77], w_fp[27], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[86], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5457,7 +5457,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 259
-    VVV1_0( w_fp[4], w_fp[29], w_fp[89], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[4], w_fp[29], w_fp[89], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5472,7 +5472,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 260
-    FFV1_0( w_fp[34], w_fp[77], w_fp[29], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[34], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5482,10 +5482,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 261 OF 1240 ***
 
     // Wavefunction(s) for diagram number 261
-    FFV1_2( w_fp[62], w_fp[29], COUPs[1], cIPD[0], cIPD[1], w_fp[89] );
+    FFV1_2( w_fp[62], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] );
 
    // Amplitude(s) for diagram number 261
-    FFV1_0( w_fp[89], w_fp[77], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[89], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5498,7 +5498,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 262
-    FFV1_0( w_fp[62], w_fp[77], w_fp[30], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[62], w_fp[77], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5506,7 +5506,7 @@ namespace mg5amcCpu
     jamp_sv[35] -= amp_sv[0];
     jamp_sv[41] -= amp_sv[0];
     jamp_sv[47] += amp_sv[0];
-    FFV1_0( w_fp[62], w_fp[77], w_fp[31], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[62], w_fp[77], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5514,7 +5514,7 @@ namespace mg5amcCpu
     jamp_sv[39] += amp_sv[0];
     jamp_sv[41] -= amp_sv[0];
     jamp_sv[45] += amp_sv[0];
-    FFV1_0( w_fp[62], w_fp[77], w_fp[32], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[62], w_fp[77], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5526,10 +5526,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 263 OF 1240 ***
 
     // Wavefunction(s) for diagram number 263
-    FFV1P0_3( w_fp[62], w_fp[2], COUPs[1], 0., 0., w_fp[92] );
+    FFV1P0_3( w_fp[62], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[92] );
 
     // Amplitude(s) for diagram number 263
-    VVV1_0( w_fp[92], w_fp[63], w_fp[6], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[92], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5548,7 +5548,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 264
-    VVV1_0( w_fp[92], w_fp[64], w_fp[5], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[92], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5567,7 +5567,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 265
-    VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], &amp_fp[0] );
+    VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5579,7 +5579,7 @@ namespace mg5amcCpu
     jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], &amp_fp[0] );
+    VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5591,7 +5591,7 @@ namespace mg5amcCpu
     jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], &amp_fp[0] );
+    VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5607,10 +5607,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 266 OF 1240 ***
 
     // Wavefunction(s) for diagram number 266
-    FFV1_1( w_fp[2], w_fp[61], COUPs[1], cIPD[0], cIPD[1], w_fp[93] );
+    FFV1_1( w_fp[2], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[93] );
 
     // Amplitude(s) for diagram number 266
-    FFV1_0( w_fp[86], w_fp[93], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[86], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5623,7 +5623,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 267
-    FFV1_0( w_fp[86], w_fp[2], w_fp[64], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[86], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5638,7 +5638,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 268
-    FFV1_0( w_fp[88], w_fp[93], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[88], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5651,7 +5651,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 269
-    FFV1_0( w_fp[88], w_fp[2], w_fp[63], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[88], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5663,10 +5663,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 270 OF 1240 ***
 
     // Wavefunction(s) for diagram number 270
-    FFV1_2( w_fp[62], w_fp[61], COUPs[1], cIPD[0], cIPD[1], w_fp[94] );
+    FFV1_2( w_fp[62], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] );
 
     // Amplitude(s) for diagram number 270
-    FFV1_0( w_fp[94], w_fp[39], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[94], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5676,10 +5676,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 271 OF 1240 ***
 
     // Wavefunction(s) for diagram number 271
-    FFV1P0_3( w_fp[62], w_fp[39], COUPs[1], 0., 0., w_fp[95] );
+    FFV1P0_3( w_fp[62], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[95] );
 
     // Amplitude(s) for diagram number 271
-    VVV1_0( w_fp[61], w_fp[6], w_fp[95], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[61], w_fp[6], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5694,7 +5694,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 272
-    FFV1_0( w_fp[88], w_fp[39], w_fp[61], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[88], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5707,7 +5707,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 273
-    FFV1_0( w_fp[94], w_fp[47], w_fp[5], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[94], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5717,10 +5717,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 274 OF 1240 ***
 
     // Wavefunction(s) for diagram number 274
-    FFV1P0_3( w_fp[62], w_fp[47], COUPs[1], 0., 0., w_fp[96] );
+    FFV1P0_3( w_fp[62], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[96] );
 
    // Amplitude(s) for diagram number 274
-    VVV1_0( w_fp[61], w_fp[5], w_fp[96], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[61], w_fp[5], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5735,7 +5735,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 275
-    FFV1_0( w_fp[86], w_fp[47], w_fp[61], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[86], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5748,7 +5748,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 276
-    FFV1_0( w_fp[94], w_fp[2], w_fp[29], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[94], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5763,7 +5763,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 277
-    VVV1_0( w_fp[61], w_fp[29], w_fp[92], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[61], w_fp[29], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5782,7 +5782,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 278
-    FFV1_0( w_fp[89], w_fp[2], w_fp[61], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[89], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5797,7 +5797,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 279
-    VVV1_0( w_fp[92], w_fp[69], w_fp[6], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[92], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5816,7 +5816,7 @@ namespace mg5amcCpu
     // (none)

     // Amplitude(s) for diagram number 280
-    VVV1_0( w_fp[92], w_fp[70], w_fp[4], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[92], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5835,7 +5835,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 281
-    VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], &amp_fp[0] );
+    VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5847,7 +5847,7 @@ namespace mg5amcCpu
     jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], &amp_fp[0] );
+    VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5859,7 +5859,7 @@ namespace mg5amcCpu
     jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], &amp_fp[0] );
+    VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5875,10 +5875,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 282 OF 1240 ***
 
     // Wavefunction(s) for diagram number 282
-    FFV1_1( w_fp[2], w_fp[66], COUPs[1], cIPD[0], cIPD[1], w_fp[94] );
+    FFV1_1( w_fp[2], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] );
 
     // Amplitude(s) for diagram number 282
-    FFV1_0( w_fp[34], w_fp[94], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[34], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5891,7 +5891,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 283
-    FFV1_0( w_fp[34], w_fp[2], w_fp[70], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[34], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5906,7 +5906,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 284
-    FFV1_0( w_fp[88], w_fp[94], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[88], w_fp[94], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5919,7 +5919,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 285
-    FFV1_0( w_fp[88], w_fp[2], w_fp[69], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[88], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5931,10 +5931,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 286 OF 1240 ***
 
     // Wavefunction(s) for diagram number 286
-    FFV1_2( w_fp[62], w_fp[66], COUPs[1], cIPD[0], cIPD[1], w_fp[97] );
+    FFV1_2( w_fp[62], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] );
 
     // Amplitude(s) for diagram number 286
-    FFV1_0( w_fp[97], w_fp[33], w_fp[6], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[97], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5944,10 +5944,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 287 OF 1240 ***
 
     // Wavefunction(s) for diagram number 287
-    FFV1P0_3( w_fp[62], w_fp[33], COUPs[1], 0., 0., w_fp[98] );
+    FFV1P0_3( w_fp[62], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[98] );
 
     // Amplitude(s) for diagram number 287
-    VVV1_0( w_fp[66], w_fp[6], w_fp[98], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[66], w_fp[6], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5962,7 +5962,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 288
-    FFV1_0( w_fp[88], w_fp[33], w_fp[66], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[88], w_fp[33], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5975,7 +5975,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 289
-    FFV1_0( w_fp[97], w_fp[47], w_fp[4], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[97], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -5988,7 +5988,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 290
-    VVV1_0( w_fp[66], w_fp[4], w_fp[96], COUPs[0], &amp_fp[0] );
+    VVV1_0( w_fp[66], w_fp[4], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -6003,7 +6003,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 291
-    FFV1_0( w_fp[34], w_fp[47], w_fp[66], COUPs[1], &amp_fp[0] );
+    FFV1_0( w_fp[34], w_fp[47], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -6016,7 +6016,7 @@ namespace mg5amcCpu
     // (none)
 
     //
Amplitude(s) for diagram number 292 - FFV1_0( w_fp[97], w_fp[2], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[97], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6031,7 +6031,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 293 - VVV1_0( w_fp[66], w_fp[27], w_fp[92], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[66], w_fp[27], w_fp[92], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6050,7 +6050,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 294 - FFV1_0( w_fp[91], w_fp[2], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[91], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6065,7 +6065,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 295 - VVV1_0( w_fp[92], w_fp[74], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[92], w_fp[74], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6084,7 +6084,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 296 - VVV1_0( w_fp[92], w_fp[75], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[92], w_fp[75], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6103,7 +6103,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 297 - VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6115,7 +6115,7 @@ namespace mg5amcCpu jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6127,7 +6127,7 @@ namespace mg5amcCpu jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6143,10 +6143,10 @@ namespace mg5amcCpu // *** DIAGRAM 298 OF 1240 *** // Wavefunction(s) for diagram number 298 - FFV1_1( w_fp[2], w_fp[72], COUPs[1], cIPD[0], cIPD[1], w_fp[97] ); + FFV1_1( w_fp[2], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] ); // Amplitude(s) for diagram number 298 - FFV1_0( w_fp[34], w_fp[97], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6159,7 +6159,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 299 - FFV1_0( w_fp[34], w_fp[2], w_fp[75], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6174,7 +6174,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 300 - FFV1_0( w_fp[86], w_fp[97], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6187,7 +6187,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 301 - FFV1_0( w_fp[86], w_fp[2], w_fp[74], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6199,10 +6199,10 @@ namespace mg5amcCpu // *** DIAGRAM 302 OF 1240 *** // Wavefunction(s) for diagram number 302 - FFV1_2( w_fp[62], w_fp[72], COUPs[1], cIPD[0], cIPD[1], w_fp[99] ); + FFV1_2( w_fp[62], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); // Amplitude(s) for diagram number 302 - FFV1_0( w_fp[99], w_fp[33], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6215,7 +6215,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 303 - VVV1_0( w_fp[72], w_fp[5], w_fp[98], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[72], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6230,7 +6230,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 304 - FFV1_0( w_fp[86], w_fp[33], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6243,7 +6243,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 305 - FFV1_0( w_fp[99], w_fp[39], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6256,7 +6256,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 306 - VVV1_0( w_fp[72], w_fp[4], w_fp[95], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[72], w_fp[4], w_fp[95], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6271,7 +6271,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 307 - FFV1_0( w_fp[34], w_fp[39], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here 
the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6284,7 +6284,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 308 - FFV1_0( w_fp[99], w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6299,7 +6299,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 309 - VVV1_0( w_fp[72], w_fp[24], w_fp[92], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[72], w_fp[24], w_fp[92], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6318,7 +6318,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 310 - FFV1_0( w_fp[90], w_fp[2], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6330,10 +6330,10 @@ namespace mg5amcCpu // *** DIAGRAM 311 OF 1240 *** // Wavefunction(s) for diagram number 311 - FFV1_2( w_fp[62], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[99] ); + FFV1_2( w_fp[62], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); // Amplitude(s) for diagram number 311 - FFV1_0( w_fp[99], w_fp[35], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6345,7 +6345,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 312 - FFV1_0( w_fp[99], w_fp[36], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[36], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6354,10 +6354,10 @@ namespace mg5amcCpu // *** DIAGRAM 313 OF 1240 *** // Wavefunction(s) for diagram number 313 - FFV1_1( w_fp[33], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[100] ); + FFV1_1( w_fp[33], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[100] ); // Amplitude(s) for diagram number 313 - FFV1_0( w_fp[86], w_fp[100], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6369,7 +6369,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 314 - FFV1_0( w_fp[86], w_fp[36], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6381,7 +6381,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 315 - FFV1_0( w_fp[88], w_fp[100], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[88], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6393,7 +6393,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 
316 - FFV1_0( w_fp[88], w_fp[35], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[88], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6405,7 +6405,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 317 - FFV1_0( w_fp[99], w_fp[33], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6418,7 +6418,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 318 - VVV1_0( w_fp[1], w_fp[29], w_fp[98], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[29], w_fp[98], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6433,7 +6433,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 319 - FFV1_0( w_fp[89], w_fp[33], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[89], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6446,7 +6446,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 320 - FFV1_0( w_fp[99], w_fp[43], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[43], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6458,7 +6458,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 321 - FFV1_0( w_fp[99], w_fp[44], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[44], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6467,10 +6467,10 @@ namespace mg5amcCpu // *** DIAGRAM 322 OF 1240 *** // Wavefunction(s) for diagram number 322 - FFV1_1( w_fp[39], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[89] ); + FFV1_1( w_fp[39], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] ); // Amplitude(s) for diagram number 322 - FFV1_0( w_fp[34], w_fp[89], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6482,7 +6482,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 323 - FFV1_0( w_fp[34], w_fp[44], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6494,7 +6494,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 324 - FFV1_0( w_fp[88], w_fp[89], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[88], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6506,7 +6506,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 325 - FFV1_0( w_fp[88], w_fp[43], 
w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[88], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6518,7 +6518,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 326 - FFV1_0( w_fp[99], w_fp[39], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6531,7 +6531,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 327 - VVV1_0( w_fp[1], w_fp[27], w_fp[95], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[27], w_fp[95], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6546,7 +6546,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 328 - FFV1_0( w_fp[91], w_fp[39], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[91], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6559,7 +6559,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 329 - FFV1_0( w_fp[99], w_fp[49], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[49], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6571,7 +6571,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 330 - FFV1_0( w_fp[99], w_fp[50], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[50], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6580,10 +6580,10 @@ namespace mg5amcCpu // *** DIAGRAM 331 OF 1240 *** // Wavefunction(s) for diagram number 331 - FFV1_1( w_fp[47], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[91] ); + FFV1_1( w_fp[47], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] ); // Amplitude(s) for diagram number 331 - FFV1_0( w_fp[34], w_fp[91], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6595,7 +6595,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 332 - FFV1_0( w_fp[34], w_fp[50], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6607,7 +6607,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 333 - FFV1_0( w_fp[86], w_fp[91], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[91], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6619,7 +6619,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 334 - FFV1_0( w_fp[86], w_fp[49], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( 
w_fp[86], w_fp[49], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6631,7 +6631,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 335 - FFV1_0( w_fp[99], w_fp[47], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6644,7 +6644,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 336 - VVV1_0( w_fp[1], w_fp[24], w_fp[96], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[24], w_fp[96], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6659,7 +6659,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 337 - FFV1_0( w_fp[90], w_fp[47], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6672,7 +6672,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 338 - FFV1_0( w_fp[99], w_fp[17], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6685,7 +6685,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 339 - FFV1_0( w_fp[99], w_fp[2], w_fp[42], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6700,7 +6700,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 340 - VVV1_0( w_fp[92], w_fp[59], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[92], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6719,7 +6719,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 341 - VVV1_0( w_fp[92], w_fp[1], w_fp[42], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[92], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6738,7 +6738,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 342 - VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6750,7 +6750,7 @@ namespace mg5amcCpu jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated 
with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6762,7 +6762,7 @@ namespace mg5amcCpu jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6781,7 +6781,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 343 - FFV1_0( w_fp[88], w_fp[2], w_fp[59], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[88], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6796,7 +6796,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 344 - FFV1_0( w_fp[88], w_fp[17], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[88], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6809,7 +6809,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 345 - FFV1_0( w_fp[99], w_fp[15], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[15], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6822,7 +6822,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 346 - FFV1_0( w_fp[99], w_fp[2], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6837,7 +6837,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 347 - VVV1_0( w_fp[92], w_fp[68], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[92], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6856,7 +6856,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 348 - VVV1_0( w_fp[92], w_fp[1], w_fp[16], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[92], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6875,7 +6875,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 349 - VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6887,7 +6887,7 @@ namespace mg5amcCpu jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support 
updates numerators_sv and denominators_sv (#473) #endif @@ -6899,7 +6899,7 @@ namespace mg5amcCpu jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6918,7 +6918,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 350 - FFV1_0( w_fp[86], w_fp[2], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6933,7 +6933,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 351 - FFV1_0( w_fp[86], w_fp[15], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6946,7 +6946,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 352 - FFV1_0( w_fp[99], w_fp[18], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[18], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6959,7 +6959,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 353 - FFV1_0( w_fp[99], w_fp[2], w_fp[19], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6974,7 +6974,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 354 - VVV1_0( w_fp[92], w_fp[67], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[92], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -6993,7 +6993,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 355 - VVV1_0( w_fp[92], w_fp[1], w_fp[19], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[92], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7012,7 +7012,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 356 - VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7024,7 +7024,7 @@ namespace mg5amcCpu jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) #endif @@ -7036,7 +7036,7 @@ namespace mg5amcCpu jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7055,7 +7055,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 357 - FFV1_0( w_fp[34], w_fp[2], w_fp[67], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7070,7 +7070,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 358 - FFV1_0( w_fp[34], w_fp[18], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7083,7 +7083,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 359 - VVV1_0( w_fp[73], w_fp[6], w_fp[92], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[73], w_fp[6], w_fp[92], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7095,7 +7095,7 @@ namespace mg5amcCpu jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[79], w_fp[6], w_fp[92], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[79], w_fp[6], w_fp[92], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7107,7 +7107,7 @@ namespace mg5amcCpu jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[80], w_fp[6], w_fp[92], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[80], w_fp[6], w_fp[92], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7126,7 +7126,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 360 - FFV1_0( w_fp[88], w_fp[2], w_fp[73], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[88], w_fp[2], w_fp[73], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7134,7 +7134,7 @@ namespace mg5amcCpu jamp_sv[39] -= amp_sv[0]; jamp_sv[63] -= amp_sv[0]; jamp_sv[87] += amp_sv[0]; - FFV1_0( w_fp[88], w_fp[2], w_fp[79], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[88], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7142,7 +7142,7 @@ namespace mg5amcCpu jamp_sv[57] += amp_sv[0]; jamp_sv[63] -= amp_sv[0]; jamp_sv[81] += amp_sv[0]; - FFV1_0( w_fp[88], w_fp[2], w_fp[80], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[88], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here 
the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7157,7 +7157,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 361 - FFV1_0( w_fp[62], w_fp[47], w_fp[73], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[47], w_fp[73], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7165,7 +7165,7 @@ namespace mg5amcCpu jamp_sv[107] -= amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[47], w_fp[79], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[47], w_fp[79], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7173,7 +7173,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[117] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[47], w_fp[80], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[47], w_fp[80], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7188,7 +7188,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 362 - VVV1_0( w_fp[57], w_fp[5], w_fp[92], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[57], w_fp[5], w_fp[92], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7200,7 +7200,7 @@ namespace mg5amcCpu jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[81], w_fp[5], w_fp[92], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[81], w_fp[5], w_fp[92], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7212,7 +7212,7 @@ namespace mg5amcCpu jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[82], w_fp[5], w_fp[92], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[82], w_fp[5], w_fp[92], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7231,7 +7231,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 363 - FFV1_0( w_fp[86], w_fp[2], w_fp[57], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[2], w_fp[57], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7239,7 +7239,7 @@ namespace mg5amcCpu jamp_sv[45] -= amp_sv[0]; jamp_sv[69] -= amp_sv[0]; jamp_sv[111] += amp_sv[0]; - FFV1_0( w_fp[86], w_fp[2], w_fp[81], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[2], w_fp[81], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7247,7 +7247,7 @@ namespace mg5amcCpu jamp_sv[59] += amp_sv[0]; jamp_sv[69] -= amp_sv[0]; jamp_sv[105] += amp_sv[0]; - FFV1_0( w_fp[86], w_fp[2], w_fp[82], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[2], w_fp[82], COUPs[1], 1.0, &_fp[0] ); 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7262,7 +7262,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 364 - FFV1_0( w_fp[62], w_fp[39], w_fp[57], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[39], w_fp[57], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7270,7 +7270,7 @@ namespace mg5amcCpu jamp_sv[83] -= amp_sv[0]; jamp_sv[89] -= amp_sv[0]; jamp_sv[95] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[39], w_fp[81], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[39], w_fp[81], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7278,7 +7278,7 @@ namespace mg5amcCpu jamp_sv[87] += amp_sv[0]; jamp_sv[89] -= amp_sv[0]; jamp_sv[93] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[39], w_fp[82], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[39], w_fp[82], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7293,7 +7293,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 365 - VVV1_0( w_fp[55], w_fp[4], w_fp[92], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[55], w_fp[4], w_fp[92], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7305,7 +7305,7 @@ namespace mg5amcCpu jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[83], w_fp[4], w_fp[92], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[83], w_fp[4], w_fp[92], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7317,7 +7317,7 @@ namespace mg5amcCpu jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[84], w_fp[4], w_fp[92], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[84], w_fp[4], w_fp[92], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7336,7 +7336,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 366 - FFV1_0( w_fp[34], w_fp[2], w_fp[55], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[2], w_fp[55], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7344,7 +7344,7 @@ namespace mg5amcCpu jamp_sv[47] -= amp_sv[0]; jamp_sv[93] -= amp_sv[0]; jamp_sv[117] += amp_sv[0]; - FFV1_0( w_fp[34], w_fp[2], w_fp[83], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[2], w_fp[83], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7352,7 +7352,7 @@ namespace mg5amcCpu jamp_sv[83] += amp_sv[0]; jamp_sv[93] -= amp_sv[0]; jamp_sv[107] += amp_sv[0]; - FFV1_0( w_fp[34], w_fp[2], w_fp[84], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[34], w_fp[2], 
w_fp[84], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7367,7 +7367,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 367 - FFV1_0( w_fp[62], w_fp[33], w_fp[55], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[33], w_fp[55], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7375,7 +7375,7 @@ namespace mg5amcCpu jamp_sv[59] -= amp_sv[0]; jamp_sv[65] -= amp_sv[0]; jamp_sv[71] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[33], w_fp[83], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[33], w_fp[83], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7383,7 +7383,7 @@ namespace mg5amcCpu jamp_sv[63] += amp_sv[0]; jamp_sv[65] -= amp_sv[0]; jamp_sv[69] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[33], w_fp[84], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[33], w_fp[84], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7398,7 +7398,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 368 - FFV1_0( w_fp[99], w_fp[2], w_fp[30], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[30], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7406,7 +7406,7 @@ namespace mg5amcCpu jamp_sv[71] -= amp_sv[0]; jamp_sv[95] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[31], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[31], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7414,7 +7414,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[95] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[32], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[32], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7429,7 +7429,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 369 - VVV1_0( w_fp[1], w_fp[30], w_fp[92], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[30], w_fp[92], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7441,7 +7441,7 @@ namespace mg5amcCpu jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[1], w_fp[31], w_fp[92], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[31], w_fp[92], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7453,7 +7453,7 @@ namespace mg5amcCpu jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[1], w_fp[32], w_fp[92], COUPs[0], 
&_fp[0] ); + VVV1_0( w_fp[1], w_fp[32], w_fp[92], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7469,11 +7469,11 @@ namespace mg5amcCpu // *** DIAGRAM 370 OF 1240 *** // Wavefunction(s) for diagram number 370 - VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 0., 0., w_fp[92] ); - FFV1_2( w_fp[3], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[99] ); + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[92] ); + FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); // Amplitude(s) for diagram number 370 - FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7486,7 +7486,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 371 - FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7496,11 +7496,11 @@ namespace mg5amcCpu // *** DIAGRAM 372 OF 1240 *** // Wavefunction(s) for diagram number 372 - VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 0., 0., w_fp[62] ); - FFV1P0_3( w_fp[3], w_fp[77], COUPs[1], 0., 0., w_fp[34] ); + VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[62] ); + FFV1P0_3( w_fp[3], w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[34] ); // Amplitude(s) for diagram number 372 - VVV1_0( w_fp[62], w_fp[34], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[62], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7519,7 +7519,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 373 - FFV1_0( w_fp[3], w_fp[85], w_fp[62], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[85], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7531,10 +7531,10 @@ namespace mg5amcCpu // *** DIAGRAM 374 OF 1240 *** // Wavefunction(s) for diagram number 374 - VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 0., 0., w_fp[86] ); + VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[86] ); // Amplitude(s) for diagram number 374 - VVV1_0( w_fp[86], w_fp[34], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[86], w_fp[34], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7553,7 +7553,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 375 - FFV1_0( w_fp[3], w_fp[9], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[9], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7565,12 +7565,12 @@ namespace mg5amcCpu // *** DIAGRAM 376 OF 1240 *** // Wavefunction(s) for diagram number 376 - VVVV1P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[88] ); - VVVV3P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[90] ); - VVVV4P0_1( w_fp[92], 
w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[96] ); + VVVV1P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] ); + VVVV3P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] ); + VVVV4P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] ); // Amplitude(s) for diagram number 376 - FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7582,7 +7582,7 @@ namespace mg5amcCpu jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7594,7 +7594,7 @@ namespace mg5amcCpu jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[96], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7610,10 +7610,10 @@ namespace mg5amcCpu // *** DIAGRAM 377 OF 1240 *** // Wavefunction(s) for diagram number 377 - FFV1_1( w_fp[77], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[95] ); + FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[95] ); // Amplitude(s) for diagram number 377 - FFV1_0( w_fp[38], w_fp[95], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[95], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7623,10 +7623,10 @@ namespace mg5amcCpu // *** DIAGRAM 378 OF 1240 *** // Wavefunction(s) for diagram number 378 - FFV1_2( w_fp[38], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[98] ); + FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); // Amplitude(s) for diagram number 378 - FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7639,7 +7639,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 379 - FFV1_0( w_fp[38], w_fp[77], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[77], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7654,7 +7654,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 380 - FFV1_0( w_fp[41], w_fp[95], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[95], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7664,10 +7664,10 @@ namespace mg5amcCpu // *** DIAGRAM 381 OF 1240 *** // Wavefunction(s) for diagram number 381 - FFV1_2( w_fp[41], w_fp[92], COUPs[1], cIPD[0], cIPD[1], 
w_fp[101] ); + FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[101] ); // Amplitude(s) for diagram number 381 - FFV1_0( w_fp[101], w_fp[77], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[101], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7680,7 +7680,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 382 - FFV1_0( w_fp[41], w_fp[77], w_fp[62], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[77], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7695,7 +7695,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 383 - FFV1_0( w_fp[3], w_fp[95], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[95], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7710,7 +7710,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 384 - FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7722,10 +7722,10 @@ namespace mg5amcCpu // *** DIAGRAM 385 OF 1240 *** // Wavefunction(s) for diagram number 385 - VVV1P0_1( w_fp[92], w_fp[29], COUPs[0], 0., 0., w_fp[95] ); + VVV1P0_1( w_fp[92], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[95] ); // Amplitude(s) for diagram number 385 - FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7741,10 +7741,10 @@ namespace mg5amcCpu // *** DIAGRAM 386 OF 1240 *** // Wavefunction(s) for diagram number 386 - FFV1_1( w_fp[2], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[102] ); + FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] ); // Amplitude(s) for diagram number 386 - FFV1_0( w_fp[22], w_fp[102], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[22], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7757,7 +7757,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 387 - FFV1_0( w_fp[21], w_fp[102], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -7767,10 +7767,10 @@ namespace mg5amcCpu // *** DIAGRAM 388 OF 1240 *** // Wavefunction(s) for diagram number 388 - FFV1P0_3( w_fp[52], w_fp[2], COUPs[1], 0., 0., w_fp[103] ); + FFV1P0_3( w_fp[52], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[103] ); // Amplitude(s) for diagram number 388 - VVV1_0( w_fp[62], w_fp[103], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[62], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and 
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -7789,7 +7789,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 389
- FFV1_0( w_fp[21], w_fp[2], w_fp[62], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[21], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -7804,7 +7804,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 390
- VVV1_0( w_fp[86], w_fp[103], w_fp[5], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[86], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -7823,7 +7823,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 391
- FFV1_0( w_fp[22], w_fp[2], w_fp[86], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[22], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -7838,7 +7838,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 392
- FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -7850,7 +7850,7 @@ namespace mg5amcCpu
jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[52], w_fp[2], w_fp[90], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[52], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -7862,7 +7862,7 @@ namespace mg5amcCpu
jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[52], w_fp[2], w_fp[96], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[52], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -7878,10 +7878,10 @@ namespace mg5amcCpu
// *** DIAGRAM 393 OF 1240 ***
// Wavefunction(s) for diagram number 393
- FFV1_2( w_fp[52], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[104] );
+ FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] );
// Amplitude(s) for diagram number 393
- FFV1_0( w_fp[104], w_fp[39], w_fp[6], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[104], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -7891,10 +7891,10 @@ namespace mg5amcCpu
// *** DIAGRAM 394 OF 1240 ***
// Wavefunction(s) for diagram number 394
- FFV1_1( w_fp[39], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[105] );
+ FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[105] );
// Amplitude(s) for diagram number 394
- FFV1_0( w_fp[52], w_fp[105], w_fp[6], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[52], w_fp[105], w_fp[6], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -7907,7 +7907,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 395
- FFV1_0( w_fp[52], w_fp[39], w_fp[86], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[52], w_fp[39], w_fp[86], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -7922,7 +7922,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 396
- FFV1_0( w_fp[104], w_fp[47], w_fp[5], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[104], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -7932,10 +7932,10 @@ namespace mg5amcCpu
// *** DIAGRAM 397 OF 1240 ***
// Wavefunction(s) for diagram number 397
- FFV1_1( w_fp[47], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[106] );
+ FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] );
// Amplitude(s) for diagram number 397
- FFV1_0( w_fp[52], w_fp[106], w_fp[5], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[52], w_fp[106], w_fp[5], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -7948,7 +7948,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 398
- FFV1_0( w_fp[52], w_fp[47], w_fp[62], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[52], w_fp[47], w_fp[62], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -7963,7 +7963,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 399
- FFV1_0( w_fp[104], w_fp[2], w_fp[29], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[104], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -7978,7 +7978,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 400
- FFV1_0( w_fp[52], w_fp[102], w_fp[29], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[52], w_fp[102], w_fp[29], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -7993,7 +7993,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 401
- FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8012,7 +8012,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 402
- FFV1_0( w_fp[71], w_fp[102], w_fp[6], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[71], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8027,7 +8027,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 403
- FFV1_0( w_fp[3], w_fp[102], w_fp[70], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[102], w_fp[70], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8046,7 +8046,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 404
- FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8061,7 +8061,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 405
- FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8080,7 +8080,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 406
- FFV1_0( w_fp[3], w_fp[94], w_fp[86], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[94], w_fp[86], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8099,7 +8099,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 407
- FFV1_0( w_fp[71], w_fp[2], w_fp[86], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[71], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8118,7 +8118,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 408
- VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], &_fp[0] );
+ VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8138,7 +8138,7 @@ namespace mg5amcCpu
jamp_sv[109] -= amp_sv[0];
jamp_sv[116] -= amp_sv[0];
jamp_sv[117] += amp_sv[0];
- VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], &_fp[0] );
+ VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8158,7 +8158,7 @@ namespace mg5amcCpu
jamp_sv[107] -= amp_sv[0];
jamp_sv[116] -= amp_sv[0];
jamp_sv[117] += amp_sv[0];
- VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], &_fp[0] );
+ VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8182,10 +8182,10 @@ namespace mg5amcCpu
// *** DIAGRAM 409 OF 1240 ***
// Wavefunction(s) for diagram number 409
- VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 0., 0., w_fp[104] );
+ VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] );
// Amplitude(s) for diagram number 409
- VVV1_0( w_fp[8], w_fp[6], w_fp[104], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[8], w_fp[6], w_fp[104], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8209,10 +8209,10 @@ namespace mg5amcCpu
// *** DIAGRAM 410 OF 1240 ***
// Wavefunction(s) for diagram number 410
- VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 0., 0., w_fp[107] );
+ VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[107] );
// Amplitude(s) for diagram number 410
- VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8239,7 +8239,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 411
- VVV1_0( w_fp[66], w_fp[8], w_fp[86], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[66], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8266,7 +8266,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 412
- FFV1_0( w_fp[3], w_fp[47], w_fp[104], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8285,7 +8285,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 413
- FFV1_0( w_fp[3], w_fp[106], w_fp[66], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[106], w_fp[66], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8300,7 +8300,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 414
- FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8315,7 +8315,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 415
- FFV1_0( w_fp[41], w_fp[2], w_fp[104], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[41], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8334,7 +8334,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 416
- FFV1_0( w_fp[41], w_fp[102], w_fp[66], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[41], w_fp[102], w_fp[66], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8349,7 +8349,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 417
- FFV1_0( w_fp[101], w_fp[2], w_fp[66], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[101], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8364,7 +8364,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 418
- FFV1_0( w_fp[76], w_fp[102], w_fp[5], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[76], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8379,7 +8379,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 419
- FFV1_0( w_fp[3], w_fp[102], w_fp[75], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[102], w_fp[75], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8398,7 +8398,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 420
- FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8413,7 +8413,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 421
- FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8432,7 +8432,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 422
- FFV1_0( w_fp[3], w_fp[97], w_fp[62], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[97], w_fp[62], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8451,7 +8451,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 423
- FFV1_0( w_fp[76], w_fp[2], w_fp[62], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[76], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8470,7 +8470,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 424
- VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], &_fp[0] );
+ VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8490,7 +8490,7 @@ namespace mg5amcCpu
jamp_sv[93] += amp_sv[0];
jamp_sv[102] += amp_sv[0];
jamp_sv[104] -= amp_sv[0];
- VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], &_fp[0] );
+ VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8510,7 +8510,7 @@ namespace mg5amcCpu
jamp_sv[93] += amp_sv[0];
jamp_sv[106] += amp_sv[0];
jamp_sv[107] -= amp_sv[0];
- VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], &_fp[0] );
+ VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8534,10 +8534,10 @@ namespace mg5amcCpu
// *** DIAGRAM 425 OF 1240 ***
// Wavefunction(s) for diagram number 425
- VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 0., 0., w_fp[104] );
+ VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[104] );
// Amplitude(s) for diagram number 425
- VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8564,7 +8564,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 426
- VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8591,7 +8591,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 427
- VVV1_0( w_fp[72], w_fp[8], w_fp[62], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[72], w_fp[8], w_fp[62], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8618,7 +8618,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 428
- FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8637,7 +8637,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 429
- FFV1_0( w_fp[3], w_fp[105], w_fp[72], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[105], w_fp[72], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8652,7 +8652,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 430
- FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8667,7 +8667,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 431
- FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8686,7 +8686,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 432
- FFV1_0( w_fp[38], w_fp[102], w_fp[72], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[38], w_fp[102], w_fp[72], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8701,7 +8701,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 433
- FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8713,10 +8713,10 @@ namespace mg5amcCpu
// *** DIAGRAM 434 OF 1240 ***
// Wavefunction(s) for diagram number 434
- VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 0., 0., w_fp[104] );
+ VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] );
// Amplitude(s) for diagram number 434
- VVV1_0( w_fp[104], w_fp[10], w_fp[6], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[104], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8743,7 +8743,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 435
- VVV1_0( w_fp[104], w_fp[11], w_fp[5], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[104], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8770,7 +8770,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 436
- VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], &_fp[0] );
+ VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8790,7 +8790,7 @@ namespace mg5amcCpu
jamp_sv[102] -= amp_sv[0];
jamp_sv[104] += amp_sv[0];
jamp_sv[108] -= amp_sv[0];
- VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], &_fp[0] );
+ VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8810,7 +8810,7 @@ namespace mg5amcCpu
jamp_sv[116] += amp_sv[0];
jamp_sv[117] -= amp_sv[0];
jamp_sv[118] += amp_sv[0];
- VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], &_fp[0] );
+ VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8834,10 +8834,10 @@ namespace mg5amcCpu
// *** DIAGRAM 437 OF 1240 ***
// Wavefunction(s) for diagram number 437
- VVV1P0_1( w_fp[1], w_fp[8], COUPs[0], 0., 0., w_fp[108] );
+ VVV1P0_1( w_fp[1], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[108] );
// Amplitude(s) for diagram number 437
- VVV1_0( w_fp[62], w_fp[108], w_fp[6], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[62], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8864,7 +8864,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 438
- VVV1_0( w_fp[62], w_fp[1], w_fp[11], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[62], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8891,7 +8891,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 439
- VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], &_fp[0] );
+ VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8911,7 +8911,7 @@ namespace mg5amcCpu
jamp_sv[109] += amp_sv[0];
jamp_sv[115] += amp_sv[0];
jamp_sv[118] -= amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], &_fp[0] );
+ VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8931,7 +8931,7 @@ namespace mg5amcCpu
jamp_sv[104] += amp_sv[0];
jamp_sv[106] += amp_sv[0];
jamp_sv[107] -= amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], &_fp[0] );
+ VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8958,7 +8958,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 440
- VVV1_0( w_fp[86], w_fp[108], w_fp[5], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[86], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -8985,7 +8985,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 441
- VVV1_0( w_fp[86], w_fp[1], w_fp[10], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[86], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9012,7 +9012,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 442
- VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], &_fp[0] );
+ VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9032,7 +9032,7 @@ namespace mg5amcCpu
jamp_sv[94] -= amp_sv[0];
jamp_sv[99] -= amp_sv[0];
jamp_sv[109] += amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], &_fp[0] );
+ VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9052,7 +9052,7 @@ namespace mg5amcCpu
jamp_sv[99] -= amp_sv[0];
jamp_sv[108] -= amp_sv[0];
jamp_sv[109] += amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], &_fp[0] );
+ VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9076,12 +9076,12 @@ namespace mg5amcCpu
// *** DIAGRAM 443 OF 1240 ***
// Wavefunction(s) for diagram number 443
- VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[109] );
- VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[110] );
- VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[111] );
+ VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
+ VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
+ VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] );
// Amplitude(s) for diagram number 443
- VVV1_0( w_fp[8], w_fp[6], w_fp[109], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[8], w_fp[6], w_fp[109], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9101,7 +9101,7 @@ namespace mg5amcCpu
jamp_sv[109] -= amp_sv[0];
jamp_sv[116] -= amp_sv[0];
jamp_sv[117] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[110], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[8], w_fp[6], w_fp[110], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9121,7 +9121,7 @@ namespace mg5amcCpu
jamp_sv[109] -= amp_sv[0];
jamp_sv[115] -= amp_sv[0];
jamp_sv[118] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[111], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[8], w_fp[6], w_fp[111], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9145,12 +9145,12 @@ namespace mg5amcCpu
// *** DIAGRAM 444 OF 1240 ***
// Wavefunction(s) for diagram number 444
- VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 0., 0., w_fp[112] );
- VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 0., 0., w_fp[113] );
- VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 0., 0., w_fp[114] );
+ VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] );
+ VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[113] );
+ VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[114] );
// Amplitude(s) for diagram number 444
- VVV1_0( w_fp[8], w_fp[5], w_fp[112], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[8], w_fp[5], w_fp[112], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9170,7 +9170,7 @@ namespace mg5amcCpu
jamp_sv[93] += amp_sv[0];
jamp_sv[102] += amp_sv[0];
jamp_sv[104] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[113], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[8], w_fp[5], w_fp[113], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9190,7 +9190,7 @@ namespace mg5amcCpu
jamp_sv[94] += amp_sv[0];
jamp_sv[98] += amp_sv[0];
jamp_sv[108] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[114], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[8], w_fp[5], w_fp[114], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9217,7 +9217,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 445
- VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9237,7 +9237,7 @@ namespace mg5amcCpu
jamp_sv[94] -= amp_sv[0];
jamp_sv[115] -= amp_sv[0];
jamp_sv[118] += amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[90], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[1], w_fp[8], w_fp[90], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9257,7 +9257,7 @@ namespace mg5amcCpu
jamp_sv[94] -= amp_sv[0];
jamp_sv[99] -= amp_sv[0];
jamp_sv[109] += amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[96], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[1], w_fp[8], w_fp[96], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9284,7 +9284,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 446
- VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], &_fp[0] );
+ VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9304,7 +9304,7 @@ namespace mg5amcCpu
jamp_sv[116] -= amp_sv[0];
jamp_sv[117] += amp_sv[0];
jamp_sv[118] -= amp_sv[0];
- VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], &_fp[0] );
+ VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9324,7 +9324,7 @@ namespace mg5amcCpu
jamp_sv[93] -= amp_sv[0];
jamp_sv[116] -= amp_sv[0];
jamp_sv[117] += amp_sv[0];
- VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], &_fp[0] );
+ VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9351,7 +9351,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 447
- VVV1_0( w_fp[8], w_fp[29], w_fp[104], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[8], w_fp[29], w_fp[104], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9378,7 +9378,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 448
- VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9405,7 +9405,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 449
- VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9432,7 +9432,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 450
- VVV1_0( w_fp[104], w_fp[45], w_fp[6], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[104], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9451,7 +9451,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 451
- FFV1_0( w_fp[3], w_fp[44], w_fp[104], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[44], w_fp[104], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9466,7 +9466,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 452
- FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9479,7 +9479,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 453
- FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9492,7 +9492,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 454
- FFV1_0( w_fp[3], w_fp[89], w_fp[86], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[89], w_fp[86], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9507,7 +9507,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 455
- VVV1_0( w_fp[86], w_fp[1], w_fp[45], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[86], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9526,7 +9526,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 456
- FFV1_0( w_fp[3], w_fp[39], w_fp[112], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[39], w_fp[112], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9538,7 +9538,7 @@ namespace mg5amcCpu
jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[113], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[39], w_fp[113], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9550,7 +9550,7 @@ namespace mg5amcCpu
jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[114], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[39], w_fp[114], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9569,7 +9569,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 457
- FFV1_0( w_fp[41], w_fp[39], w_fp[104], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[41], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9584,7 +9584,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 458
- FFV1_0( w_fp[41], w_fp[105], w_fp[1], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[41], w_fp[105], w_fp[1], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9597,7 +9597,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 459
- FFV1_0( w_fp[101], w_fp[39], w_fp[1], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[101], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9610,7 +9610,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 460
- VVV1_0( w_fp[104], w_fp[51], w_fp[5], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[104], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9629,7 +9629,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 461
- FFV1_0( w_fp[3], w_fp[50], w_fp[104], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[50], w_fp[104], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9644,7 +9644,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 462
- FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9657,7 +9657,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 463
- FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9670,7 +9670,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 464
- FFV1_0( w_fp[3], w_fp[91], w_fp[62], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[91], w_fp[62], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9685,7 +9685,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 465
- VVV1_0( w_fp[62], w_fp[1], w_fp[51], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[62], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9704,7 +9704,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 466
- FFV1_0( w_fp[3], w_fp[47], w_fp[109], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[47], w_fp[109], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9716,7 +9716,7 @@ namespace mg5amcCpu
jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[110], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[47], w_fp[110], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9728,7 +9728,7 @@ namespace mg5amcCpu
jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[111], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[47], w_fp[111], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9747,7 +9747,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 467
- FFV1_0( w_fp[38], w_fp[47], w_fp[104], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[38], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9762,7 +9762,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 468
- FFV1_0( w_fp[38], w_fp[106], w_fp[1], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[38], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9775,7 +9775,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 469
- FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9788,7 +9788,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 470
- VVV1_0( w_fp[104], w_fp[23], w_fp[6], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[104], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9807,7 +9807,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 471
- FFV1_0( w_fp[48], w_fp[2], w_fp[104], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[48], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9822,7 +9822,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 472
- FFV1_0( w_fp[58], w_fp[102], w_fp[6], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[58], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9835,7 +9835,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 473
- FFV1_0( w_fp[48], w_fp[102], w_fp[1], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[48], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9848,7 +9848,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 474
- FFV1_0( w_fp[58], w_fp[2], w_fp[86], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[58], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9863,7 +9863,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 475
- VVV1_0( w_fp[86], w_fp[1], w_fp[23], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[86], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9882,7 +9882,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 476
- FFV1_0( w_fp[38], w_fp[2], w_fp[112], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[38], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9894,7 +9894,7 @@ namespace mg5amcCpu
jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[113], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[38], w_fp[2], w_fp[113], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9906,7 +9906,7 @@ namespace mg5amcCpu
jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[114], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[38], w_fp[2], w_fp[114], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9925,7 +9925,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 477
- VVV1_0( w_fp[104], w_fp[20], w_fp[5], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[104], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9944,7 +9944,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 478
- FFV1_0( w_fp[40], w_fp[2], w_fp[104], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[40], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9959,7 +9959,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 479
- FFV1_0( w_fp[60], w_fp[102], w_fp[5], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[60], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9972,7 +9972,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 480
- FFV1_0( w_fp[40], w_fp[102], w_fp[1], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[40], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -9985,7 +9985,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 481
- FFV1_0( w_fp[60], w_fp[2], w_fp[62], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[60], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10000,7 +10000,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 482
- VVV1_0( w_fp[62], w_fp[1], w_fp[20], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[62], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10019,7 +10019,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 483
- FFV1_0( w_fp[41], w_fp[2], w_fp[109], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[41], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10031,7 +10031,7 @@ namespace mg5amcCpu
jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[110], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[41], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10043,7 +10043,7 @@ namespace mg5amcCpu
jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[111], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[41], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10062,7 +10062,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 484
- FFV1_0( w_fp[3], w_fp[18], w_fp[104], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[18], w_fp[104], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10081,7 +10081,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 485
- FFV1_0( w_fp[12], w_fp[2], w_fp[104], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[12], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10100,7 +10100,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 486
- FFV1_0( w_fp[3], w_fp[102], w_fp[67], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[102], w_fp[67], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10119,7 +10119,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 487
- FFV1_0( w_fp[12], w_fp[102], w_fp[1], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[12], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10134,7 +10134,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 488
- FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10153,7 +10153,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 489
- FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10168,7 +10168,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 490
- FFV1_0( w_fp[3], w_fp[102], w_fp[55], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[102], w_fp[55], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10180,7 +10180,7 @@ namespace mg5amcCpu
jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[102], w_fp[83], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[102], w_fp[83], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10192,7 +10192,7 @@ namespace mg5amcCpu
jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[102], w_fp[84], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[102], w_fp[84], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10211,7 +10211,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 491
- FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10223,7 +10223,7 @@ namespace mg5amcCpu
jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10235,7 +10235,7 @@ namespace mg5amcCpu
jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10254,7 +10254,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 492
- VVV1_0( w_fp[92], w_fp[55], w_fp[8], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[92], w_fp[55], w_fp[8], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10274,7 +10274,7 @@ namespace mg5amcCpu
jamp_sv[93] += amp_sv[0];
jamp_sv[116] += amp_sv[0];
jamp_sv[117] -= amp_sv[0];
- VVV1_0( w_fp[92], w_fp[83], w_fp[8], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[92], w_fp[83], w_fp[8], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10294,7 +10294,7 @@ namespace mg5amcCpu
jamp_sv[93] += amp_sv[0];
jamp_sv[106] += amp_sv[0];
jamp_sv[107] -= amp_sv[0];
- VVV1_0( w_fp[92], w_fp[84], w_fp[8], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[92], w_fp[84], w_fp[8], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10318,11 +10318,11 @@ namespace mg5amcCpu
// *** DIAGRAM 493 OF 1240 ***
// Wavefunction(s) for diagram number 493
- VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 0., 0., w_fp[92] );
- FFV1_2( w_fp[3], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[99] );
+ VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[92] );
+ FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
// Amplitude(s) for diagram number 493
- FFV1_0( w_fp[99], w_fp[87], w_fp[6], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[99], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10335,7 +10335,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 494
- FFV1_0( w_fp[99], w_fp[85], w_fp[4], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[99], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10345,10 +10345,10 @@ namespace mg5amcCpu
// *** DIAGRAM 495 OF 1240 ***
// Wavefunction(s) for diagram number 495
- VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 0., 0., w_fp[102] );
+ VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[102] );
// Amplitude(s) for diagram number 495
- VVV1_0( w_fp[102], w_fp[34], w_fp[6], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[102], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10367,7 +10367,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 496
- FFV1_0( w_fp[3], w_fp[85], w_fp[102], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[85], w_fp[102], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10379,10 +10379,10 @@ namespace mg5amcCpu
// *** DIAGRAM 497 OF 1240 ***
// Wavefunction(s) for diagram number 497
- VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 0., 0., w_fp[104] );
+ VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[104] );
// Amplitude(s) for diagram number 497
- VVV1_0( w_fp[104], w_fp[34], w_fp[4], COUPs[0], &_fp[0] );
+ VVV1_0( w_fp[104], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10401,7 +10401,7 @@ namespace mg5amcCpu
// (none)
// Amplitude(s) for diagram number 498
- FFV1_0( w_fp[3], w_fp[87], w_fp[104], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[87], w_fp[104], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10413,12 +10413,12 @@ namespace mg5amcCpu
// *** DIAGRAM 499 OF 1240 ***
// Wavefunction(s) for diagram number 499
- VVVV1P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[111] );
- VVVV3P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[110] );
- VVVV4P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[109] );
+ VVVV1P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
+ VVVV3P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[110] );
+ VVVV4P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[109] );
// Amplitude(s) for diagram number 499
- FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10430,7 +10430,7 @@ namespace mg5amcCpu
jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10442,7 +10442,7 @@ namespace mg5amcCpu
jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], &_fp[0] );
+ FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], 1.0, &_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
#endif
@@ -10458,10 +10458,10 @@ namespace mg5amcCpu
// *** DIAGRAM 500 OF 1240 *** // Wavefunction(s) for diagram number 500 - FFV1_1( w_fp[77], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[62] ); + FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] ); // Amplitude(s) for diagram number 500 - FFV1_0( w_fp[46], w_fp[62], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[62], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10471,10 +10471,10 @@ namespace mg5amcCpu // *** DIAGRAM 501 OF 1240 *** // Wavefunction(s) for diagram number 501 - FFV1_2( w_fp[46], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[114] ); + FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] ); // Amplitude(s) for diagram number 501 - FFV1_0( w_fp[114], w_fp[77], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[114], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10487,7 +10487,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 502 - FFV1_0( w_fp[46], w_fp[77], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[77], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10502,7 +10502,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 503 - FFV1_0( w_fp[41], w_fp[62], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[62], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10512,10 +10512,10 @@ namespace mg5amcCpu // *** DIAGRAM 504 OF 1240 *** // Wavefunction(s) for diagram number 504 - FFV1_2( w_fp[41], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[113] ); + FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] ); // Amplitude(s) for diagram number 504 - FFV1_0( w_fp[113], w_fp[77], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[113], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10528,7 +10528,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 505 - FFV1_0( w_fp[41], w_fp[77], w_fp[102], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[77], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10543,7 +10543,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 506 - FFV1_0( w_fp[3], w_fp[62], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[62], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10558,7 +10558,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 507 - FFV1_0( w_fp[99], w_fp[77], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif 
@@ -10570,10 +10570,10 @@ namespace mg5amcCpu // *** DIAGRAM 508 OF 1240 *** // Wavefunction(s) for diagram number 508 - VVV1P0_1( w_fp[92], w_fp[27], COUPs[0], 0., 0., w_fp[62] ); + VVV1P0_1( w_fp[92], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[62] ); // Amplitude(s) for diagram number 508 - FFV1_0( w_fp[3], w_fp[77], w_fp[62], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10589,10 +10589,10 @@ namespace mg5amcCpu // *** DIAGRAM 509 OF 1240 *** // Wavefunction(s) for diagram number 509 - FFV1_1( w_fp[2], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[112] ); + FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[112] ); // Amplitude(s) for diagram number 509 - FFV1_0( w_fp[56], w_fp[112], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[56], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10605,7 +10605,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 510 - FFV1_0( w_fp[21], w_fp[112], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[112], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10618,7 +10618,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 511 - VVV1_0( w_fp[102], w_fp[103], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[102], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10637,7 +10637,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 512 - FFV1_0( w_fp[21], w_fp[2], w_fp[102], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10652,7 +10652,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 513 - VVV1_0( w_fp[104], w_fp[103], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[104], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10671,7 +10671,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 514 - FFV1_0( w_fp[56], w_fp[2], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[56], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10686,7 +10686,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 515 - FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10698,7 +10698,7 @@ namespace mg5amcCpu jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - 
FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10710,7 +10710,7 @@ namespace mg5amcCpu jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10726,10 +10726,10 @@ namespace mg5amcCpu // *** DIAGRAM 516 OF 1240 *** // Wavefunction(s) for diagram number 516 - FFV1_2( w_fp[52], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[86] ); + FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] ); // Amplitude(s) for diagram number 516 - FFV1_0( w_fp[86], w_fp[33], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10739,10 +10739,10 @@ namespace mg5amcCpu // *** DIAGRAM 517 OF 1240 *** // Wavefunction(s) for diagram number 517 - FFV1_1( w_fp[33], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[98] ); + FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); // Amplitude(s) for diagram number 517 - FFV1_0( w_fp[52], w_fp[98], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[98], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10755,7 +10755,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 518 - FFV1_0( w_fp[52], w_fp[33], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10770,7 +10770,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 519 - FFV1_0( w_fp[86], w_fp[47], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10780,10 +10780,10 @@ namespace mg5amcCpu // *** DIAGRAM 520 OF 1240 *** // Wavefunction(s) for diagram number 520 - FFV1_1( w_fp[47], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[106] ); + FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] ); // Amplitude(s) for diagram number 520 - FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10796,7 +10796,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 521 - FFV1_0( w_fp[52], w_fp[47], w_fp[102], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[47], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) #endif @@ -10811,7 +10811,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 522 - FFV1_0( w_fp[86], w_fp[2], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[86], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10826,7 +10826,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 523 - FFV1_0( w_fp[52], w_fp[112], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[112], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10841,7 +10841,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 524 - FFV1_0( w_fp[52], w_fp[2], w_fp[62], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10860,7 +10860,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 525 - FFV1_0( w_fp[65], w_fp[112], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10875,7 +10875,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 526 - FFV1_0( w_fp[3], w_fp[112], w_fp[64], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[112], w_fp[64], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10894,7 +10894,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 527 - FFV1_0( w_fp[99], w_fp[93], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10909,7 +10909,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 528 - FFV1_0( w_fp[99], w_fp[2], w_fp[64], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10928,7 +10928,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 529 - FFV1_0( w_fp[3], w_fp[93], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[93], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10947,7 +10947,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 530 - FFV1_0( w_fp[65], w_fp[2], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10966,7 +10966,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 531 - VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[92], 
w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -10986,7 +10986,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[114] += amp_sv[0]; jamp_sv[115] -= amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11006,7 +11006,7 @@ namespace mg5amcCpu jamp_sv[105] -= amp_sv[0]; jamp_sv[110] -= amp_sv[0]; jamp_sv[111] += amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11030,10 +11030,10 @@ namespace mg5amcCpu // *** DIAGRAM 532 OF 1240 *** // Wavefunction(s) for diagram number 532 - VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 0., 0., w_fp[86] ); + VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[86] ); // Amplitude(s) for diagram number 532 - VVV1_0( w_fp[8], w_fp[6], w_fp[86], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[86], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11057,10 +11057,10 @@ namespace mg5amcCpu // *** DIAGRAM 533 OF 1240 *** // Wavefunction(s) for diagram number 533 - VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 0., 0., w_fp[101] ); + VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[101] ); // Amplitude(s) for diagram number 533 - VVV1_0( w_fp[61], w_fp[6], w_fp[101], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[6], w_fp[101], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11087,7 +11087,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 534 - VVV1_0( w_fp[61], w_fp[8], w_fp[104], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[8], w_fp[104], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11114,7 +11114,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 535 - FFV1_0( w_fp[3], w_fp[47], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11133,7 +11133,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 536 - FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11148,7 +11148,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 537 - FFV1_0( w_fp[99], w_fp[47], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the 
code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11163,7 +11163,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 538 - FFV1_0( w_fp[41], w_fp[2], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11182,7 +11182,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 539 - FFV1_0( w_fp[41], w_fp[112], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[112], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11197,7 +11197,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 540 - FFV1_0( w_fp[113], w_fp[2], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[113], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11212,7 +11212,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 541 - FFV1_0( w_fp[76], w_fp[112], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[76], w_fp[112], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11227,7 +11227,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 542 - FFV1_0( w_fp[3], w_fp[112], w_fp[74], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[112], w_fp[74], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11246,7 +11246,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 543 - FFV1_0( w_fp[99], w_fp[97], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11261,7 +11261,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 544 - FFV1_0( w_fp[99], w_fp[2], w_fp[74], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11280,7 +11280,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 545 - FFV1_0( w_fp[3], w_fp[97], w_fp[102], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[97], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11299,7 +11299,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 546 - FFV1_0( w_fp[76], w_fp[2], w_fp[102], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[76], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11318,7 +11318,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 547 - VVVV1_0( w_fp[92], 
w_fp[72], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11338,7 +11338,7 @@ namespace mg5amcCpu jamp_sv[76] += amp_sv[0]; jamp_sv[103] += amp_sv[0]; jamp_sv[106] -= amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11358,7 +11358,7 @@ namespace mg5amcCpu jamp_sv[76] += amp_sv[0]; jamp_sv[104] += amp_sv[0]; jamp_sv[105] -= amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11382,10 +11382,10 @@ namespace mg5amcCpu // *** DIAGRAM 548 OF 1240 *** // Wavefunction(s) for diagram number 548 - VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 0., 0., w_fp[86] ); + VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[86] ); // Amplitude(s) for diagram number 548 - VVV1_0( w_fp[8], w_fp[4], w_fp[86], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[86], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11412,7 +11412,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 549 - VVV1_0( w_fp[72], w_fp[4], w_fp[101], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[72], w_fp[4], w_fp[101], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11439,7 +11439,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 550 - VVV1_0( w_fp[72], w_fp[8], w_fp[102], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[72], w_fp[8], w_fp[102], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11466,7 +11466,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 551 - FFV1_0( w_fp[3], w_fp[33], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11485,7 +11485,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 552 - FFV1_0( w_fp[3], w_fp[98], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[98], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11500,7 +11500,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 553 - FFV1_0( w_fp[99], w_fp[33], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11515,7 +11515,7 @@ 
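Every hunk in this patch applies the same mechanical change to the generated matrix-element code: each HELAS-style helper call (FFV1_0, FFV1_1, FFV1_2, VVV1_0, VVV1P0_1, the VVVV*_0 amplitudes and their *P0_1 wavefunction variants) gains one extra argument, a numeric rescaling factor for the coupling, inserted immediately after the COUPs entry. The literal 1.0 passed everywhere here presumably reproduces the previous fixed-coupling behaviour. A minimal scalar sketch of the signature change follows; it is not the generated code (cxtype below stands in for the vectorized SIMD/GPU types, and the parameter name Cfactor and the one-line "contraction" are ours for illustration only):

    #include <complex>
    typedef std::complex<double> cxtype; // scalar stand-in for the vectorized type in the generated code

    // Old form (hypothetical scalar analogue):
    //   void FFV1_0( const cxtype F1[], const cxtype F2[], const cxtype V3[],
    //                const cxtype COUP[], cxtype* vertex );
    // New form: Cfactor rescales the coupling, so passing 1.0 leaves the amplitude unchanged.
    inline void FFV1_0( const cxtype F1[], const cxtype F2[], const cxtype V3[],
                        const cxtype COUP[], const double Cfactor, cxtype* vertex )
    {
      const cxtype amp = F1[0] * F2[0] * V3[0]; // toy contraction, not the real FFV1 Lorentz structure
      *vertex = Cfactor * COUP[0] * amp;        // the factor multiplies the coupling only
    }

    // Usage mirroring the calls in the hunks: FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &_fp[0] );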
namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 554 - FFV1_0( w_fp[46], w_fp[2], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11534,7 +11534,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 555 - FFV1_0( w_fp[46], w_fp[112], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[112], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11549,7 +11549,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 556 - FFV1_0( w_fp[114], w_fp[2], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[114], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11561,10 +11561,10 @@ namespace mg5amcCpu // *** DIAGRAM 557 OF 1240 *** // Wavefunction(s) for diagram number 557 - VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 0., 0., w_fp[86] ); + VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[86] ); // Amplitude(s) for diagram number 557 - VVV1_0( w_fp[86], w_fp[13], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[86], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11591,7 +11591,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 558 - VVV1_0( w_fp[86], w_fp[11], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[86], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11618,7 +11618,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 559 - VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11638,7 +11638,7 @@ namespace mg5amcCpu jamp_sv[103] -= amp_sv[0]; jamp_sv[106] += amp_sv[0]; jamp_sv[114] -= amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11658,7 +11658,7 @@ namespace mg5amcCpu jamp_sv[111] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[114] -= amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11685,7 +11685,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 560 - VVV1_0( w_fp[102], w_fp[108], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[102], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel 
support updates numerators_sv and denominators_sv (#473) #endif @@ -11712,7 +11712,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 561 - VVV1_0( w_fp[102], w_fp[1], w_fp[11], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[102], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11739,7 +11739,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 562 - VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11759,7 +11759,7 @@ namespace mg5amcCpu jamp_sv[109] += amp_sv[0]; jamp_sv[112] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11779,7 +11779,7 @@ namespace mg5amcCpu jamp_sv[104] += amp_sv[0]; jamp_sv[105] -= amp_sv[0]; jamp_sv[106] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11806,7 +11806,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 563 - VVV1_0( w_fp[104], w_fp[108], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[104], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11833,7 +11833,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 564 - VVV1_0( w_fp[104], w_fp[1], w_fp[13], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[104], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11860,7 +11860,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 565 - VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11880,7 +11880,7 @@ namespace mg5amcCpu jamp_sv[77] -= amp_sv[0]; jamp_sv[101] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11900,7 +11900,7 @@ namespace mg5amcCpu jamp_sv[101] -= amp_sv[0]; jamp_sv[114] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base 
generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11924,12 +11924,12 @@ namespace mg5amcCpu // *** DIAGRAM 566 OF 1240 *** // Wavefunction(s) for diagram number 566 - VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[105] ); - VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[95] ); - VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[107] ); + VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] ); + VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] ); + VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] ); // Amplitude(s) for diagram number 566 - VVV1_0( w_fp[8], w_fp[6], w_fp[105], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[105], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11949,7 +11949,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[114] += amp_sv[0]; jamp_sv[115] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[95], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[95], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11969,7 +11969,7 @@ namespace mg5amcCpu jamp_sv[109] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[115] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[107], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[107], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -11993,12 +11993,12 @@ namespace mg5amcCpu // *** DIAGRAM 567 OF 1240 *** // Wavefunction(s) for diagram number 567 - VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 0., 0., w_fp[96] ); - VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 0., 0., w_fp[90] ); - VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 0., 0., w_fp[88] ); + VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] ); + VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] ); + VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] ); // Amplitude(s) for diagram number 567 - VVV1_0( w_fp[8], w_fp[4], w_fp[96], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[96], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12018,7 +12018,7 @@ namespace mg5amcCpu jamp_sv[76] += amp_sv[0]; jamp_sv[103] += amp_sv[0]; jamp_sv[106] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12038,7 +12038,7 @@ namespace mg5amcCpu jamp_sv[76] += amp_sv[0]; jamp_sv[100] += amp_sv[0]; jamp_sv[114] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[88], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[88], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12065,7 +12065,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 568 - VVV1_0( w_fp[1], 
w_fp[8], w_fp[111], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12085,7 +12085,7 @@ namespace mg5amcCpu jamp_sv[77] -= amp_sv[0]; jamp_sv[109] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12105,7 +12105,7 @@ namespace mg5amcCpu jamp_sv[77] -= amp_sv[0]; jamp_sv[101] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12132,7 +12132,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 569 - VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12152,7 +12152,7 @@ namespace mg5amcCpu jamp_sv[110] -= amp_sv[0]; jamp_sv[111] += amp_sv[0]; jamp_sv[112] -= amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12172,7 +12172,7 @@ namespace mg5amcCpu jamp_sv[77] -= amp_sv[0]; jamp_sv[110] -= amp_sv[0]; jamp_sv[111] += amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12199,7 +12199,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 570 - VVV1_0( w_fp[8], w_fp[27], w_fp[86], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[27], w_fp[86], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12226,7 +12226,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 571 - VVV1_0( w_fp[1], w_fp[27], w_fp[101], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[27], w_fp[101], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12253,7 +12253,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 572 - VVV1_0( w_fp[1], w_fp[8], w_fp[62], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[62], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12280,7 +12280,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 573 - VVV1_0( w_fp[86], w_fp[37], w_fp[6], COUPs[0], &_fp[0] ); + 
VVV1_0( w_fp[86], w_fp[37], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12299,7 +12299,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 574 - FFV1_0( w_fp[3], w_fp[36], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[36], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12314,7 +12314,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 575 - FFV1_0( w_fp[99], w_fp[100], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12327,7 +12327,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 576 - FFV1_0( w_fp[99], w_fp[36], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12340,7 +12340,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 577 - FFV1_0( w_fp[3], w_fp[100], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[100], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12355,7 +12355,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 578 - VVV1_0( w_fp[104], w_fp[1], w_fp[37], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[104], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12374,7 +12374,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 579 - FFV1_0( w_fp[3], w_fp[33], w_fp[96], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12386,7 +12386,7 @@ namespace mg5amcCpu jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12398,7 +12398,7 @@ namespace mg5amcCpu jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[88], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[88], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12417,7 +12417,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 580 - FFV1_0( w_fp[41], w_fp[33], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[33], w_fp[86], COUPs[1], 1.0, &_fp[0] ); 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12432,7 +12432,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 581 - FFV1_0( w_fp[41], w_fp[98], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[98], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12445,7 +12445,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 582 - FFV1_0( w_fp[113], w_fp[33], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[113], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12458,7 +12458,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 583 - VVV1_0( w_fp[86], w_fp[51], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[86], w_fp[51], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12477,7 +12477,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 584 - FFV1_0( w_fp[3], w_fp[49], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[49], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12492,7 +12492,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 585 - FFV1_0( w_fp[99], w_fp[91], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[91], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12505,7 +12505,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 586 - FFV1_0( w_fp[99], w_fp[49], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[49], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12518,7 +12518,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 587 - FFV1_0( w_fp[3], w_fp[91], w_fp[102], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[91], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12533,7 +12533,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 588 - VVV1_0( w_fp[102], w_fp[1], w_fp[51], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[102], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12552,7 +12552,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 589 - FFV1_0( w_fp[3], w_fp[47], w_fp[105], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[105], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12564,7 +12564,7 @@ namespace mg5amcCpu jamp_sv[111] += cxtype( 0, 1 ) 
* amp_sv[0]; jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[95], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[95], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12576,7 +12576,7 @@ namespace mg5amcCpu jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[107], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[107], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12595,7 +12595,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 590 - FFV1_0( w_fp[46], w_fp[47], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[47], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12610,7 +12610,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 591 - FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12623,7 +12623,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 592 - FFV1_0( w_fp[114], w_fp[47], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[114], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12636,7 +12636,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 593 - VVV1_0( w_fp[86], w_fp[54], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[86], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12655,7 +12655,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 594 - FFV1_0( w_fp[53], w_fp[2], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[53], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12670,7 +12670,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 595 - FFV1_0( w_fp[78], w_fp[112], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[78], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12683,7 +12683,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 596 - FFV1_0( w_fp[53], w_fp[112], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[53], w_fp[112], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12696,7 +12696,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 597 - FFV1_0( w_fp[78], w_fp[2], 
w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[78], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12711,7 +12711,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 598 - VVV1_0( w_fp[104], w_fp[1], w_fp[54], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[104], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12730,7 +12730,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 599 - FFV1_0( w_fp[46], w_fp[2], w_fp[96], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12742,7 +12742,7 @@ namespace mg5amcCpu jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12754,7 +12754,7 @@ namespace mg5amcCpu jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[88], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[88], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12773,7 +12773,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 600 - VVV1_0( w_fp[86], w_fp[20], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[86], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12792,7 +12792,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 601 - FFV1_0( w_fp[28], w_fp[2], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[28], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12807,7 +12807,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 602 - FFV1_0( w_fp[60], w_fp[112], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[112], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12820,7 +12820,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 603 - FFV1_0( w_fp[28], w_fp[112], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[28], w_fp[112], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12833,7 +12833,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 604 - FFV1_0( w_fp[60], w_fp[2], w_fp[102], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[2], 
w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12848,7 +12848,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 605 - VVV1_0( w_fp[102], w_fp[1], w_fp[20], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[102], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12867,7 +12867,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 606 - FFV1_0( w_fp[41], w_fp[2], w_fp[105], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[105], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12879,7 +12879,7 @@ namespace mg5amcCpu jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[95], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12891,7 +12891,7 @@ namespace mg5amcCpu jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[107], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[107], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12910,7 +12910,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 607 - FFV1_0( w_fp[3], w_fp[15], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[15], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12929,7 +12929,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 608 - FFV1_0( w_fp[14], w_fp[2], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[14], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12948,7 +12948,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 609 - FFV1_0( w_fp[3], w_fp[112], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[112], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12967,7 +12967,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 610 - FFV1_0( w_fp[14], w_fp[112], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[14], w_fp[112], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -12982,7 +12982,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 611 - FFV1_0( w_fp[99], w_fp[2], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -13001,7 +13001,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 612 - FFV1_0( w_fp[99], w_fp[15], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -13016,7 +13016,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 613 - FFV1_0( w_fp[3], w_fp[112], w_fp[57], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[112], w_fp[57], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -13028,7 +13028,7 @@ namespace mg5amcCpu jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[112], w_fp[81], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[112], w_fp[81], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -13040,7 +13040,7 @@ namespace mg5amcCpu jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[112], w_fp[82], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[112], w_fp[82], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -13059,7 +13059,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 614 - FFV1_0( w_fp[99], w_fp[2], w_fp[57], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[57], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -13071,7 +13071,7 @@ namespace mg5amcCpu jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[81], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[81], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -13083,7 +13083,7 @@ namespace mg5amcCpu jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[82], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[82], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -13102,7 +13102,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 615 - VVV1_0( w_fp[92], w_fp[57], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[92], w_fp[57], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -13122,7 +13122,7 @@ namespace mg5amcCpu jamp_sv[77] += amp_sv[0]; jamp_sv[110] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; - 
-    VVV1_0( w_fp[92], w_fp[81], w_fp[8], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[92], w_fp[81], w_fp[8], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13142,7 +13142,7 @@ namespace mg5amcCpu
     jamp_sv[76] += amp_sv[0];
     jamp_sv[104] += amp_sv[0];
     jamp_sv[105] -= amp_sv[0];
-    VVV1_0( w_fp[92], w_fp[82], w_fp[8], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[92], w_fp[82], w_fp[8], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13166,11 +13166,11 @@ namespace mg5amcCpu
     // *** DIAGRAM 616 OF 1240 ***
 
     // Wavefunction(s) for diagram number 616
-    VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 0., 0., w_fp[92] );
-    FFV1_2( w_fp[3], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[99] );
+    VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[92] );
+    FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
 
     // Amplitude(s) for diagram number 616
-    FFV1_0( w_fp[99], w_fp[87], w_fp[5], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13183,7 +13183,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 617
-    FFV1_0( w_fp[99], w_fp[9], w_fp[4], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13193,10 +13193,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 618 OF 1240 ***
 
     // Wavefunction(s) for diagram number 618
-    VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 0., 0., w_fp[112] );
+    VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[112] );
 
     // Amplitude(s) for diagram number 618
-    VVV1_0( w_fp[112], w_fp[34], w_fp[5], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[112], w_fp[34], w_fp[5], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13215,7 +13215,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 619
-    FFV1_0( w_fp[3], w_fp[9], w_fp[112], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[9], w_fp[112], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13227,10 +13227,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 620 OF 1240 ***
 
     // Wavefunction(s) for diagram number 620
-    VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 0., 0., w_fp[86] );
+    VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[86] );
 
     // Amplitude(s) for diagram number 620
-    VVV1_0( w_fp[86], w_fp[34], w_fp[4], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[86], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13249,7 +13249,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 621
-    FFV1_0( w_fp[3], w_fp[87], w_fp[86], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[87], w_fp[86], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13261,12 +13261,12 @@ namespace mg5amcCpu
     // *** DIAGRAM 622 OF 1240 ***
 
     // Wavefunction(s) for diagram number 622
-    VVVV1P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[107] );
-    VVVV3P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[95] );
-    VVVV4P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[105] );
+    VVVV1P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[107] );
+    VVVV3P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[95] );
+    VVVV4P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[105] );
 
     // Amplitude(s) for diagram number 622
-    FFV1_0( w_fp[3], w_fp[77], w_fp[107], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[107], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13278,7 +13278,7 @@ namespace mg5amcCpu
     jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13290,7 +13290,7 @@ namespace mg5amcCpu
     jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[105], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[105], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13306,10 +13306,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 623 OF 1240 ***
 
     // Wavefunction(s) for diagram number 623
-    FFV1_1( w_fp[77], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[102] );
+    FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
 
     // Amplitude(s) for diagram number 623
-    FFV1_0( w_fp[46], w_fp[102], w_fp[5], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[46], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13319,10 +13319,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 624 OF 1240 ***
 
     // Wavefunction(s) for diagram number 624
-    FFV1_2( w_fp[46], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[88] );
+    FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] );
 
     // Amplitude(s) for diagram number 624
-    FFV1_0( w_fp[88], w_fp[77], w_fp[5], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[88], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13335,7 +13335,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 625
-    FFV1_0( w_fp[46], w_fp[77], w_fp[86], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[46], w_fp[77], w_fp[86], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13350,7 +13350,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 626
-    FFV1_0( w_fp[38], w_fp[102], w_fp[4], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[38], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13360,10 +13360,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 627 OF 1240 ***
 
     // Wavefunction(s) for diagram number 627
-    FFV1_2( w_fp[38], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[90] );
+    FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] );
 
     // Amplitude(s) for diagram number 627
-    FFV1_0( w_fp[90], w_fp[77], w_fp[4], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[90], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13376,7 +13376,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 628
-    FFV1_0( w_fp[38], w_fp[77], w_fp[112], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[38], w_fp[77], w_fp[112], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13391,7 +13391,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 629
-    FFV1_0( w_fp[3], w_fp[102], w_fp[24], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[102], w_fp[24], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13406,7 +13406,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 630
-    FFV1_0( w_fp[99], w_fp[77], w_fp[24], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13418,10 +13418,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 631 OF 1240 ***
 
     // Wavefunction(s) for diagram number 631
-    VVV1P0_1( w_fp[92], w_fp[24], COUPs[0], 0., 0., w_fp[102] );
+    VVV1P0_1( w_fp[92], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[102] );
 
     // Amplitude(s) for diagram number 631
-    FFV1_0( w_fp[3], w_fp[77], w_fp[102], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[77], w_fp[102], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13437,10 +13437,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 632 OF 1240 ***
 
     // Wavefunction(s) for diagram number 632
-    FFV1_1( w_fp[2], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[96] );
+    FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[96] );
 
     // Amplitude(s) for diagram number 632
-    FFV1_0( w_fp[56], w_fp[96], w_fp[5], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[56], w_fp[96], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13453,7 +13453,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 633
-    FFV1_0( w_fp[22], w_fp[96], w_fp[4], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[22], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13466,7 +13466,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 634
-    VVV1_0( w_fp[112], w_fp[103], w_fp[5], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[112], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13485,7 +13485,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 635
-    FFV1_0( w_fp[22], w_fp[2], w_fp[112], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[22], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13500,7 +13500,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 636
-    VVV1_0( w_fp[86], w_fp[103], w_fp[4], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[86], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13519,7 +13519,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 637
-    FFV1_0( w_fp[56], w_fp[2], w_fp[86], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[56], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13534,7 +13534,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 638
-    FFV1_0( w_fp[52], w_fp[2], w_fp[107], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[107], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13546,7 +13546,7 @@ namespace mg5amcCpu
     jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13558,7 +13558,7 @@ namespace mg5amcCpu
     jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[105], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[105], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13574,10 +13574,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 639 OF 1240 ***
 
     // Wavefunction(s) for diagram number 639
-    FFV1_2( w_fp[52], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[104] );
+    FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] );
 
     // Amplitude(s) for diagram number 639
-    FFV1_0( w_fp[104], w_fp[33], w_fp[5], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[104], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13587,10 +13587,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 640 OF 1240 ***
 
     // Wavefunction(s) for diagram number 640
-    FFV1_1( w_fp[33], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[114] );
+    FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
 
     // Amplitude(s) for diagram number 640
-    FFV1_0( w_fp[52], w_fp[114], w_fp[5], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[52], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13603,7 +13603,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 641
-    FFV1_0( w_fp[52], w_fp[33], w_fp[86], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[52], w_fp[33], w_fp[86], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13618,7 +13618,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 642
-    FFV1_0( w_fp[104], w_fp[39], w_fp[4], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[104], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13628,10 +13628,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 643 OF 1240 ***
 
     // Wavefunction(s) for diagram number 643
-    FFV1_1( w_fp[39], w_fp[92], COUPs[1], cIPD[0], cIPD[1], w_fp[106] );
+    FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] );
 
     // Amplitude(s) for diagram number 643
-    FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13644,7 +13644,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 644
-    FFV1_0( w_fp[52], w_fp[39], w_fp[112], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[52], w_fp[39], w_fp[112], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13659,7 +13659,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 645
-    FFV1_0( w_fp[104], w_fp[2], w_fp[24], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[104], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13674,7 +13674,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 646
-    FFV1_0( w_fp[52], w_fp[96], w_fp[24], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[52], w_fp[96], w_fp[24], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13689,7 +13689,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 647
-    FFV1_0( w_fp[52], w_fp[2], w_fp[102], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[52], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13708,7 +13708,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 648
-    FFV1_0( w_fp[65], w_fp[96], w_fp[5], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[65], w_fp[96], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13723,7 +13723,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 649
-    FFV1_0( w_fp[3], w_fp[96], w_fp[63], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[96], w_fp[63], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13742,7 +13742,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 650
-    FFV1_0( w_fp[99], w_fp[93], w_fp[5], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13757,7 +13757,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 651
-    FFV1_0( w_fp[99], w_fp[2], w_fp[63], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13776,7 +13776,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 652
-    FFV1_0( w_fp[3], w_fp[93], w_fp[86], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[93], w_fp[86], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13795,7 +13795,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 653
-    FFV1_0( w_fp[65], w_fp[2], w_fp[86], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[65], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13814,7 +13814,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 654
-    VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], &_fp[0] );
+    VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13834,7 +13834,7 @@ namespace mg5amcCpu
     jamp_sv[91] -= amp_sv[0];
     jamp_sv[96] -= amp_sv[0];
     jamp_sv[98] += amp_sv[0];
-    VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], &_fp[0] );
+    VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13854,7 +13854,7 @@ namespace mg5amcCpu
     jamp_sv[98] += amp_sv[0];
     jamp_sv[100] += amp_sv[0];
     jamp_sv[101] -= amp_sv[0];
-    VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], &_fp[0] );
+    VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13878,10 +13878,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 655 OF 1240 ***
 
     // Wavefunction(s) for diagram number 655
-    VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 0., 0., w_fp[104] );
+    VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[104] );
 
     // Amplitude(s) for diagram number 655
-    VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13905,10 +13905,10 @@ namespace mg5amcCpu
    // *** DIAGRAM 656 OF 1240 ***
 
     // Wavefunction(s) for diagram number 656
-    VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 0., 0., w_fp[113] );
+    VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[113] );
 
     // Amplitude(s) for diagram number 656
-    VVV1_0( w_fp[61], w_fp[5], w_fp[113], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[61], w_fp[5], w_fp[113], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13935,7 +13935,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 657
-    VVV1_0( w_fp[61], w_fp[8], w_fp[86], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[61], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13962,7 +13962,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 658
-    FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13981,7 +13981,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 659
-    FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -13996,7 +13996,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 660
-    FFV1_0( w_fp[99], w_fp[39], w_fp[61], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14011,7 +14011,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 661
-    FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14030,7 +14030,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 662
-    FFV1_0( w_fp[38], w_fp[96], w_fp[61], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[38], w_fp[96], w_fp[61], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14045,7 +14045,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 663
-    FFV1_0( w_fp[90], w_fp[2], w_fp[61], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[90], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14060,7 +14060,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 664
-    FFV1_0( w_fp[71], w_fp[96], w_fp[4], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[71], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14075,7 +14075,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 665
-    FFV1_0( w_fp[3], w_fp[96], w_fp[69], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[96], w_fp[69], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14094,7 +14094,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 666
-    FFV1_0( w_fp[99], w_fp[94], w_fp[4], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14109,7 +14109,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 667
-    FFV1_0( w_fp[99], w_fp[2], w_fp[69], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14128,7 +14128,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 668
-    FFV1_0( w_fp[3], w_fp[94], w_fp[112], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[94], w_fp[112], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14147,7 +14147,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 669
-    FFV1_0( w_fp[71], w_fp[2], w_fp[112], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[71], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14166,7 +14166,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 670
-    VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], &_fp[0] );
+    VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14186,7 +14186,7 @@ namespace mg5amcCpu
     jamp_sv[82] -= amp_sv[0];
     jamp_sv[97] -= amp_sv[0];
     jamp_sv[100] += amp_sv[0];
-    VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], &_fp[0] );
+    VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14206,7 +14206,7 @@ namespace mg5amcCpu
     jamp_sv[98] += amp_sv[0];
     jamp_sv[99] -= amp_sv[0];
     jamp_sv[100] += amp_sv[0];
-    VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], &_fp[0] );
+    VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14230,10 +14230,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 671 OF 1240 ***
 
     // Wavefunction(s) for diagram number 671
-    VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 0., 0., w_fp[104] );
+    VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] );
 
     // Amplitude(s) for diagram number 671
-    VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14260,7 +14260,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 672
-    VVV1_0( w_fp[66], w_fp[4], w_fp[113], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[66], w_fp[4], w_fp[113], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14287,7 +14287,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 673
-    VVV1_0( w_fp[66], w_fp[8], w_fp[112], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[66], w_fp[8], w_fp[112], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14314,7 +14314,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 674
-    FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14333,7 +14333,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 675
-    FFV1_0( w_fp[3], w_fp[114], w_fp[66], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[114], w_fp[66], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14348,7 +14348,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 676
-    FFV1_0( w_fp[99], w_fp[33], w_fp[66], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14363,7 +14363,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 677
-    FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14382,7 +14382,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 678
-    FFV1_0( w_fp[46], w_fp[96], w_fp[66], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[46], w_fp[96], w_fp[66], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14397,7 +14397,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 679
-    FFV1_0( w_fp[88], w_fp[2], w_fp[66], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[88], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14409,10 +14409,10 @@ namespace mg5amcCpu
     // *** DIAGRAM 680 OF 1240 ***
 
     // Wavefunction(s) for diagram number 680
-    VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 0., 0., w_fp[104] );
+    VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] );
 
     // Amplitude(s) for diagram number 680
-    VVV1_0( w_fp[104], w_fp[13], w_fp[5], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[104], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14439,7 +14439,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 681
-    VVV1_0( w_fp[104], w_fp[10], w_fp[4], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[104], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14466,7 +14466,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 682
-    VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], &_fp[0] );
+    VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14486,7 +14486,7 @@ namespace mg5amcCpu
     jamp_sv[82] += amp_sv[0];
     jamp_sv[90] -= amp_sv[0];
     jamp_sv[97] += amp_sv[0];
-    VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], &_fp[0] );
+    VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14506,7 +14506,7 @@ namespace mg5amcCpu
     jamp_sv[88] += amp_sv[0];
     jamp_sv[90] -= amp_sv[0];
     jamp_sv[96] += amp_sv[0];
-    VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], &_fp[0] );
+    VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14533,7 +14533,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 683
-    VVV1_0( w_fp[112], w_fp[108], w_fp[5], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[112], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14560,7 +14560,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 684
-    VVV1_0( w_fp[112], w_fp[1], w_fp[10], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[112], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14587,7 +14587,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 685
-    VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], &_fp[0] );
+    VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14607,7 +14607,7 @@ namespace mg5amcCpu
     jamp_sv[88] -= amp_sv[0];
     jamp_sv[91] += amp_sv[0];
     jamp_sv[99] -= amp_sv[0];
-    VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], &_fp[0] );
+    VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14627,7 +14627,7 @@ namespace mg5amcCpu
     jamp_sv[82] += amp_sv[0];
     jamp_sv[98] += amp_sv[0];
     jamp_sv[99] -= amp_sv[0];
-    VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], &_fp[0] );
+    VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14654,7 +14654,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 686
-    VVV1_0( w_fp[86], w_fp[108], w_fp[4], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[86], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14681,7 +14681,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 687
-    VVV1_0( w_fp[86], w_fp[1], w_fp[13], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[86], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14708,7 +14708,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 688
-    VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], &_fp[0] );
+    VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14728,7 +14728,7 @@ namespace mg5amcCpu
     jamp_sv[77] -= amp_sv[0];
     jamp_sv[91] += amp_sv[0];
     jamp_sv[101] -= amp_sv[0];
-    VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], &_fp[0] );
+    VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14748,7 +14748,7 @@ namespace mg5amcCpu
     jamp_sv[91] += amp_sv[0];
     jamp_sv[100] += amp_sv[0];
     jamp_sv[101] -= amp_sv[0];
-    VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], &_fp[0] );
+    VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14772,12 +14772,12 @@ namespace mg5amcCpu
     // *** DIAGRAM 689 OF 1240 ***
 
     // Wavefunction(s) for diagram number 689
-    VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[98] );
-    VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[62] );
-    VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[101] );
+    VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[98] );
+    VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[62] );
+    VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[101] );
 
     // Amplitude(s) for diagram number 689
-    VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14797,7 +14797,7 @@ namespace mg5amcCpu
     jamp_sv[91] -= amp_sv[0];
     jamp_sv[96] -= amp_sv[0];
     jamp_sv[98] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14817,7 +14817,7 @@ namespace mg5amcCpu
     jamp_sv[88] += amp_sv[0];
     jamp_sv[91] -= amp_sv[0];
     jamp_sv[98] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[5], w_fp[101], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[8], w_fp[5], w_fp[101], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14841,12 +14841,12 @@ namespace mg5amcCpu
     // *** DIAGRAM 690 OF 1240 ***
 
     // Wavefunction(s) for diagram number 690
-    VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[109] );
-    VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[110] );
-    VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[111] );
+    VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
+    VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
+    VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] );
 
     // Amplitude(s) for diagram number 690
-    VVV1_0( w_fp[8], w_fp[4], w_fp[109], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[8], w_fp[4], w_fp[109], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14866,7 +14866,7 @@ namespace mg5amcCpu
     jamp_sv[82] -= amp_sv[0];
     jamp_sv[97] -= amp_sv[0];
     jamp_sv[100] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[4], w_fp[110], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[8], w_fp[4], w_fp[110], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14886,7 +14886,7 @@ namespace mg5amcCpu
     jamp_sv[76] += amp_sv[0];
     jamp_sv[90] -= amp_sv[0];
     jamp_sv[100] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[4], w_fp[111], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[8], w_fp[4], w_fp[111], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14913,7 +14913,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 691
-    VVV1_0( w_fp[1], w_fp[8], w_fp[107], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[1], w_fp[8], w_fp[107], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14933,7 +14933,7 @@ namespace mg5amcCpu
     jamp_sv[88] += amp_sv[0];
     jamp_sv[99] += amp_sv[0];
     jamp_sv[101] -= amp_sv[0];
-    VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14953,7 +14953,7 @@ namespace mg5amcCpu
     jamp_sv[77] -= amp_sv[0];
     jamp_sv[91] += amp_sv[0];
     jamp_sv[101] -= amp_sv[0];
-    VVV1_0( w_fp[1], w_fp[8], w_fp[105], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[1], w_fp[8], w_fp[105], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -14980,7 +14980,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 692
-    VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], &_fp[0] );
+    VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15000,7 +15000,7 @@ namespace mg5amcCpu
     jamp_sv[88] -= amp_sv[0];
     jamp_sv[96] -= amp_sv[0];
     jamp_sv[97] += amp_sv[0];
-    VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], &_fp[0] );
+    VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15020,7 +15020,7 @@ namespace mg5amcCpu
     jamp_sv[97] += amp_sv[0];
     jamp_sv[99] += amp_sv[0];
     jamp_sv[101] -= amp_sv[0];
-    VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], &_fp[0] );
+    VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15047,7 +15047,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 693
-    VVV1_0( w_fp[8], w_fp[24], w_fp[104], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[8], w_fp[24], w_fp[104], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15074,7 +15074,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 694
-    VVV1_0( w_fp[1], w_fp[24], w_fp[113], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[1], w_fp[24], w_fp[113], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15101,7 +15101,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 695
-    VVV1_0( w_fp[1], w_fp[8], w_fp[102], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[1], w_fp[8], w_fp[102], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15128,7 +15128,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 696
-    VVV1_0( w_fp[104], w_fp[37], w_fp[5], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[104], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15147,7 +15147,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 697
-    FFV1_0( w_fp[3], w_fp[35], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[35], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15162,7 +15162,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 698
-    FFV1_0( w_fp[99], w_fp[100], w_fp[5], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15175,7 +15175,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 699
-    FFV1_0( w_fp[99], w_fp[35], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15188,7 +15188,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 700
-    FFV1_0( w_fp[3], w_fp[100], w_fp[86], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[100], w_fp[86], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15203,7 +15203,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 701
-    VVV1_0( w_fp[86], w_fp[1], w_fp[37], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[86], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15222,7 +15222,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 702
-    FFV1_0( w_fp[3], w_fp[33], w_fp[109], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[33], w_fp[109], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15234,7 +15234,7 @@ namespace mg5amcCpu
     jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[110], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[33], w_fp[110], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15246,7 +15246,7 @@ namespace mg5amcCpu
     jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[111], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[33], w_fp[111], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15265,7 +15265,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 703
-    FFV1_0( w_fp[38], w_fp[33], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[38], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15280,7 +15280,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 704
-    FFV1_0( w_fp[38], w_fp[114], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[38], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15293,7 +15293,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 705
-    FFV1_0( w_fp[90], w_fp[33], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[90], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15306,7 +15306,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 706
-    VVV1_0( w_fp[104], w_fp[45], w_fp[4], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[104], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15325,7 +15325,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 707
-    FFV1_0( w_fp[3], w_fp[43], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[43], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15340,7 +15340,7 @@ namespace mg5amcCpu
     // (none)
 
    // Amplitude(s) for diagram number 708
-    FFV1_0( w_fp[99], w_fp[89], w_fp[4], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15353,7 +15353,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 709
-    FFV1_0( w_fp[99], w_fp[43], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15366,7 +15366,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 710
-    FFV1_0( w_fp[3], w_fp[89], w_fp[112], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[89], w_fp[112], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15381,7 +15381,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 711
-    VVV1_0( w_fp[112], w_fp[1], w_fp[45], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[112], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15400,7 +15400,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 712
-    FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15412,7 +15412,7 @@ namespace mg5amcCpu
     jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15424,7 +15424,7 @@ namespace mg5amcCpu
     jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[101], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[39], w_fp[101], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15443,7 +15443,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 713
-    FFV1_0( w_fp[46], w_fp[39], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[46], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15458,7 +15458,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 714
-    FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15471,7 +15471,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 715
-    FFV1_0( w_fp[88], w_fp[39], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[88], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15484,7 +15484,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 716
-    VVV1_0( w_fp[104], w_fp[54], w_fp[5], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[104], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15503,7 +15503,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 717
-    FFV1_0( w_fp[7], w_fp[2], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[7], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15518,7 +15518,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 718
-    FFV1_0( w_fp[78], w_fp[96], w_fp[5], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[78], w_fp[96], w_fp[5], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15531,7 +15531,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 719
-    FFV1_0( w_fp[7], w_fp[96], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[7], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15544,7 +15544,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 720
-    FFV1_0( w_fp[78], w_fp[2], w_fp[86], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[78], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15559,7 +15559,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 721
-    VVV1_0( w_fp[86], w_fp[1], w_fp[54], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[86], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15578,7 +15578,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 722
-    FFV1_0( w_fp[46], w_fp[2], w_fp[109], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[46], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15590,7 +15590,7 @@ namespace mg5amcCpu
     jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[110], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[46], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15602,7 +15602,7 @@ namespace mg5amcCpu
     jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[111], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[46], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15621,7 +15621,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 723
-    VVV1_0( w_fp[104], w_fp[23], w_fp[4], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[104], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15640,7 +15640,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 724
-    FFV1_0( w_fp[25], w_fp[2], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[25], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15655,7 +15655,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 725
-    FFV1_0( w_fp[58], w_fp[96], w_fp[4], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[58], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15668,7 +15668,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 726
-    FFV1_0( w_fp[25], w_fp[96], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[25], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15681,7 +15681,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 727
-    FFV1_0( w_fp[58], w_fp[2], w_fp[112], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[58], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15696,7 +15696,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 728
-    VVV1_0( w_fp[112], w_fp[1], w_fp[23], COUPs[0], &_fp[0] );
+    VVV1_0( w_fp[112], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15715,7 +15715,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 729
-    FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15727,7 +15727,7 @@ namespace mg5amcCpu
     jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15739,7 +15739,7 @@ namespace mg5amcCpu
     jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[101], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[38], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15758,7 +15758,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 730
-    FFV1_0( w_fp[3], w_fp[17], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[17], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15777,7 +15777,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 731
-    FFV1_0( w_fp[26], w_fp[2], w_fp[104], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[26], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15796,7 +15796,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 732
-    FFV1_0( w_fp[3], w_fp[96], w_fp[59], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[96], w_fp[59], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15815,7 +15815,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 733
-    FFV1_0( w_fp[26], w_fp[96], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[26], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15830,7 +15830,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 734
-    FFV1_0( w_fp[99], w_fp[2], w_fp[59], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15849,7 +15849,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 735
-    FFV1_0( w_fp[99], w_fp[17], w_fp[1], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[99], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15864,7 +15864,7 @@ namespace mg5amcCpu
     // (none)
 
     // Amplitude(s) for diagram number 736
-    FFV1_0( w_fp[3], w_fp[96], w_fp[73], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[96], w_fp[73], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15876,7 +15876,7 @@ namespace mg5amcCpu
     jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[96], w_fp[79], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[96], w_fp[79], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
 #endif
@@ -15888,7 +15888,7 @@ namespace mg5amcCpu
     jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[96], w_fp[80], COUPs[1], &_fp[0] );
+    FFV1_0( w_fp[3], w_fp[96], w_fp[80], COUPs[1], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     // Here the code base generated with
multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -15907,7 +15907,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 737 - FFV1_0( w_fp[99], w_fp[2], w_fp[73], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[73], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -15919,7 +15919,7 @@ namespace mg5amcCpu jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[79], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -15931,7 +15931,7 @@ namespace mg5amcCpu jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[80], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -15950,7 +15950,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 738 - VVV1_0( w_fp[92], w_fp[73], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[92], w_fp[73], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -15970,7 +15970,7 @@ namespace mg5amcCpu jamp_sv[97] -= amp_sv[0]; jamp_sv[99] -= amp_sv[0]; jamp_sv[101] += amp_sv[0]; - VVV1_0( w_fp[92], w_fp[79], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[92], w_fp[79], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -15990,7 +15990,7 @@ namespace mg5amcCpu jamp_sv[98] += amp_sv[0]; jamp_sv[99] -= amp_sv[0]; jamp_sv[100] += amp_sv[0]; - VVV1_0( w_fp[92], w_fp[80], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[92], w_fp[80], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16014,10 +16014,10 @@ namespace mg5amcCpu // *** DIAGRAM 739 OF 1240 *** // Wavefunction(s) for diagram number 739 - FFV1_1( w_fp[77], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[92] ); + FFV1_1( w_fp[77], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[92] ); // Amplitude(s) for diagram number 739 - FFV1_0( w_fp[7], w_fp[92], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[7], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16029,7 +16029,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 740 - FFV1_0( w_fp[53], w_fp[92], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[53], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16038,10 +16038,10 @@ namespace mg5amcCpu // *** DIAGRAM 741 OF 1240 *** // Wavefunction(s) for 
diagram number 741 - FFV1_2( w_fp[46], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[99] ); + FFV1_2( w_fp[46], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); // Amplitude(s) for diagram number 741 - FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16053,7 +16053,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 742 - FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16065,7 +16065,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 743 - FFV1_0( w_fp[53], w_fp[9], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[53], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16077,7 +16077,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 744 - FFV1_0( w_fp[7], w_fp[85], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[7], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16089,7 +16089,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 745 - FFV1_0( w_fp[46], w_fp[92], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[92], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16102,7 +16102,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 746 - FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16112,10 +16112,10 @@ namespace mg5amcCpu // *** DIAGRAM 747 OF 1240 *** // Wavefunction(s) for diagram number 747 - VVV1P0_1( w_fp[0], w_fp[29], COUPs[0], 0., 0., w_fp[96] ); + VVV1P0_1( w_fp[0], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[96] ); // Amplitude(s) for diagram number 747 - FFV1_0( w_fp[46], w_fp[77], w_fp[96], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[77], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16130,7 +16130,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 748 - FFV1_0( w_fp[25], w_fp[92], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[25], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16142,7 +16142,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 749 - FFV1_0( w_fp[48], w_fp[92], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[48], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with 
multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16151,10 +16151,10 @@ namespace mg5amcCpu // *** DIAGRAM 750 OF 1240 *** // Wavefunction(s) for diagram number 750 - FFV1_2( w_fp[38], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[104] ); + FFV1_2( w_fp[38], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] ); // Amplitude(s) for diagram number 750 - FFV1_0( w_fp[104], w_fp[87], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16166,7 +16166,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 751 - FFV1_0( w_fp[104], w_fp[85], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16178,7 +16178,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 752 - FFV1_0( w_fp[48], w_fp[87], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[48], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16190,7 +16190,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 753 - FFV1_0( w_fp[25], w_fp[85], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[25], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16202,7 +16202,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 754 - FFV1_0( w_fp[38], w_fp[92], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[92], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16215,7 +16215,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 755 - FFV1_0( w_fp[104], w_fp[77], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16225,10 +16225,10 @@ namespace mg5amcCpu // *** DIAGRAM 756 OF 1240 *** // Wavefunction(s) for diagram number 756 - VVV1P0_1( w_fp[0], w_fp[27], COUPs[0], 0., 0., w_fp[101] ); + VVV1P0_1( w_fp[0], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[101] ); // Amplitude(s) for diagram number 756 - FFV1_0( w_fp[38], w_fp[77], w_fp[101], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[77], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16243,7 +16243,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 757 - FFV1_0( w_fp[28], w_fp[92], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[28], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16255,7 +16255,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 758 - FFV1_0( 
w_fp[40], w_fp[92], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[40], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16264,10 +16264,10 @@ namespace mg5amcCpu // *** DIAGRAM 759 OF 1240 *** // Wavefunction(s) for diagram number 759 - FFV1_2( w_fp[41], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[62] ); + FFV1_2( w_fp[41], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] ); // Amplitude(s) for diagram number 759 - FFV1_0( w_fp[62], w_fp[87], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16279,7 +16279,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 760 - FFV1_0( w_fp[62], w_fp[9], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16291,7 +16291,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 761 - FFV1_0( w_fp[40], w_fp[87], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[40], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16303,7 +16303,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 762 - FFV1_0( w_fp[28], w_fp[9], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[28], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16315,7 +16315,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 763 - FFV1_0( w_fp[41], w_fp[92], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[92], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16328,7 +16328,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 764 - FFV1_0( w_fp[62], w_fp[77], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16338,10 +16338,10 @@ namespace mg5amcCpu // *** DIAGRAM 765 OF 1240 *** // Wavefunction(s) for diagram number 765 - VVV1P0_1( w_fp[0], w_fp[24], COUPs[0], 0., 0., w_fp[98] ); + VVV1P0_1( w_fp[0], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[98] ); // Amplitude(s) for diagram number 765 - FFV1_0( w_fp[41], w_fp[77], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[77], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16356,7 +16356,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 766 - FFV1_0( w_fp[26], w_fp[92], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[26], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with 
multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16369,7 +16369,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 767 - FFV1_0( w_fp[3], w_fp[92], w_fp[42], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[92], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16384,7 +16384,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 768 - VVV1_0( w_fp[98], w_fp[34], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[98], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16403,7 +16403,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 769 - FFV1_0( w_fp[3], w_fp[85], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[85], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16418,7 +16418,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 770 - VVV1_0( w_fp[0], w_fp[34], w_fp[42], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[34], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16437,7 +16437,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 771 - FFV1_0( w_fp[26], w_fp[85], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[26], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16447,12 +16447,12 @@ namespace mg5amcCpu // *** DIAGRAM 772 OF 1240 *** // Wavefunction(s) for diagram number 772 - VVVV1P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 0., 0., w_fp[85] ); - VVVV3P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 0., 0., w_fp[112] ); - VVVV4P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 0., 0., w_fp[111] ); + VVVV1P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[85] ); + VVVV3P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] ); + VVVV4P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] ); // Amplitude(s) for diagram number 772 - FFV1_0( w_fp[3], w_fp[77], w_fp[85], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[85], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16464,7 +16464,7 @@ namespace mg5amcCpu jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[112], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[112], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16476,7 +16476,7 @@ namespace mg5amcCpu jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &_fp[0] ); 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16495,7 +16495,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 773 - FFV1_0( w_fp[14], w_fp[92], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[14], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16508,7 +16508,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 774 - FFV1_0( w_fp[3], w_fp[92], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[92], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16523,7 +16523,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 775 - VVV1_0( w_fp[101], w_fp[34], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[101], w_fp[34], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16542,7 +16542,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 776 - FFV1_0( w_fp[3], w_fp[9], w_fp[101], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[9], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16557,7 +16557,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 777 - VVV1_0( w_fp[0], w_fp[34], w_fp[16], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[34], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16576,7 +16576,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 778 - FFV1_0( w_fp[14], w_fp[9], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[14], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16586,12 +16586,12 @@ namespace mg5amcCpu // *** DIAGRAM 779 OF 1240 *** // Wavefunction(s) for diagram number 779 - VVVV1P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 0., 0., w_fp[9] ); - VVVV3P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 0., 0., w_fp[110] ); - VVVV4P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 0., 0., w_fp[109] ); + VVVV1P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[9] ); + VVVV3P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] ); + VVVV4P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] ); // Amplitude(s) for diagram number 779 - FFV1_0( w_fp[3], w_fp[77], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16603,7 +16603,7 @@ namespace mg5amcCpu jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16615,7 +16615,7 @@ namespace mg5amcCpu jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16634,7 +16634,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 780 - FFV1_0( w_fp[12], w_fp[92], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16647,7 +16647,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 781 - FFV1_0( w_fp[3], w_fp[92], w_fp[19], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[92], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16662,7 +16662,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 782 - VVV1_0( w_fp[96], w_fp[34], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[96], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16681,7 +16681,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 783 - FFV1_0( w_fp[3], w_fp[87], w_fp[96], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[87], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16696,7 +16696,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 784 - VVV1_0( w_fp[0], w_fp[34], w_fp[19], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[34], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16715,7 +16715,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 785 - FFV1_0( w_fp[12], w_fp[87], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16725,12 +16725,12 @@ namespace mg5amcCpu // *** DIAGRAM 786 OF 1240 *** // Wavefunction(s) for diagram number 786 - VVVV1P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 0., 0., w_fp[87] ); - VVVV3P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 0., 0., w_fp[34] ); - VVVV4P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 0., 0., w_fp[86] ); + VVVV1P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[87] ); + VVVV3P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[34] ); + VVVV4P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[86] ); // Amplitude(s) for diagram number 786 - FFV1_0( w_fp[3], w_fp[77], w_fp[87], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[87], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16742,7 +16742,7 @@ namespace mg5amcCpu jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[34], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[34], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16754,7 +16754,7 @@ namespace mg5amcCpu jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16773,7 +16773,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 787 - FFV1_0( w_fp[3], w_fp[92], w_fp[30], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[92], w_fp[30], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16781,7 +16781,7 @@ namespace mg5amcCpu jamp_sv[25] -= amp_sv[0]; jamp_sv[27] -= amp_sv[0]; jamp_sv[29] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[92], w_fp[31], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[92], w_fp[31], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16789,7 +16789,7 @@ namespace mg5amcCpu jamp_sv[26] += amp_sv[0]; jamp_sv[27] -= amp_sv[0]; jamp_sv[28] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[92], w_fp[32], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[92], w_fp[32], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16801,12 +16801,12 @@ namespace mg5amcCpu // *** DIAGRAM 788 OF 1240 *** // Wavefunction(s) for diagram number 788 - VVV1P0_1( w_fp[0], w_fp[30], COUPs[0], 0., 0., w_fp[92] ); - VVV1P0_1( w_fp[0], w_fp[31], COUPs[0], 0., 0., w_fp[88] ); - VVV1P0_1( w_fp[0], w_fp[32], COUPs[0], 0., 0., w_fp[106] ); + VVV1P0_1( w_fp[0], w_fp[30], COUPs[0], 1.0, 0., 0., w_fp[92] ); + VVV1P0_1( w_fp[0], w_fp[31], COUPs[0], 1.0, 0., 0., w_fp[88] ); + VVV1P0_1( w_fp[0], w_fp[32], COUPs[0], 1.0, 0., 0., w_fp[106] ); // Amplitude(s) for diagram number 788 - FFV1_0( w_fp[3], w_fp[77], w_fp[92], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[92], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16818,7 +16818,7 @@ namespace mg5amcCpu jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16830,7 +16830,7 @@ namespace mg5amcCpu jamp_sv[39] += cxtype( 0, 1 ) * 
amp_sv[0]; jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[106], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[106], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16846,10 +16846,10 @@ namespace mg5amcCpu // *** DIAGRAM 789 OF 1240 *** // Wavefunction(s) for diagram number 789 - FFV1_2( w_fp[52], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[90] ); + FFV1_2( w_fp[52], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] ); // Amplitude(s) for diagram number 789 - FFV1_0( w_fp[90], w_fp[35], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16861,7 +16861,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 790 - FFV1_0( w_fp[90], w_fp[36], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[36], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16870,10 +16870,10 @@ namespace mg5amcCpu // *** DIAGRAM 791 OF 1240 *** // Wavefunction(s) for diagram number 791 - FFV1_1( w_fp[33], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[114] ); + FFV1_1( w_fp[33], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] ); // Amplitude(s) for diagram number 791 - FFV1_0( w_fp[22], w_fp[114], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[22], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16885,7 +16885,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 792 - FFV1_0( w_fp[21], w_fp[114], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16897,7 +16897,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 793 - FFV1_0( w_fp[22], w_fp[36], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[22], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16909,7 +16909,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 794 - FFV1_0( w_fp[21], w_fp[35], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16921,7 +16921,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 795 - FFV1_0( w_fp[90], w_fp[33], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16934,7 +16934,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 796 - FFV1_0( w_fp[52], w_fp[114], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( 
w_fp[52], w_fp[114], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16947,7 +16947,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 797 - FFV1_0( w_fp[52], w_fp[33], w_fp[96], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[33], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16962,7 +16962,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 798 - FFV1_0( w_fp[90], w_fp[43], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[43], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16974,7 +16974,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 799 - FFV1_0( w_fp[90], w_fp[44], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[44], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16983,10 +16983,10 @@ namespace mg5amcCpu // *** DIAGRAM 800 OF 1240 *** // Wavefunction(s) for diagram number 800 - FFV1_1( w_fp[39], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[102] ); + FFV1_1( w_fp[39], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] ); // Amplitude(s) for diagram number 800 - FFV1_0( w_fp[56], w_fp[102], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[56], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -16998,7 +16998,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 801 - FFV1_0( w_fp[21], w_fp[102], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17010,7 +17010,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 802 - FFV1_0( w_fp[56], w_fp[44], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[56], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17022,7 +17022,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 803 - FFV1_0( w_fp[21], w_fp[43], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17034,7 +17034,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 804 - FFV1_0( w_fp[90], w_fp[39], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17047,7 +17047,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 805 - FFV1_0( w_fp[52], w_fp[102], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], 
w_fp[102], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17060,7 +17060,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 806 - FFV1_0( w_fp[52], w_fp[39], w_fp[101], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[39], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17075,7 +17075,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 807 - FFV1_0( w_fp[90], w_fp[49], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[49], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17087,7 +17087,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 808 - FFV1_0( w_fp[90], w_fp[50], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[50], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17096,10 +17096,10 @@ namespace mg5amcCpu // *** DIAGRAM 809 OF 1240 *** // Wavefunction(s) for diagram number 809 - FFV1_1( w_fp[47], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[113] ); + FFV1_1( w_fp[47], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] ); // Amplitude(s) for diagram number 809 - FFV1_0( w_fp[56], w_fp[113], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[56], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17111,7 +17111,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 810 - FFV1_0( w_fp[22], w_fp[113], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[22], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17123,7 +17123,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 811 - FFV1_0( w_fp[56], w_fp[50], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[56], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17135,7 +17135,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 812 - FFV1_0( w_fp[22], w_fp[49], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[22], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17147,7 +17147,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 813 - FFV1_0( w_fp[90], w_fp[47], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17160,7 +17160,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 814 - FFV1_0( w_fp[52], w_fp[113], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], 
w_fp[113], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17173,7 +17173,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 815 - FFV1_0( w_fp[52], w_fp[47], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[47], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17188,7 +17188,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 816 - FFV1_0( w_fp[90], w_fp[17], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17201,7 +17201,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 817 - FFV1_0( w_fp[90], w_fp[2], w_fp[42], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17216,7 +17216,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 818 - VVV1_0( w_fp[98], w_fp[103], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[98], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17235,7 +17235,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 819 - FFV1_0( w_fp[21], w_fp[2], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17250,7 +17250,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 820 - VVV1_0( w_fp[0], w_fp[103], w_fp[42], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[103], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17269,7 +17269,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 821 - FFV1_0( w_fp[21], w_fp[17], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17282,7 +17282,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 822 - FFV1_0( w_fp[52], w_fp[2], w_fp[85], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[85], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17294,7 +17294,7 @@ namespace mg5amcCpu jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[112], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support 
updates numerators_sv and denominators_sv (#473) #endif @@ -17306,7 +17306,7 @@ namespace mg5amcCpu jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17325,7 +17325,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 823 - FFV1_0( w_fp[90], w_fp[15], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[15], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17338,7 +17338,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 824 - FFV1_0( w_fp[90], w_fp[2], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17353,7 +17353,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 825 - VVV1_0( w_fp[101], w_fp[103], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[101], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17372,7 +17372,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 826 - FFV1_0( w_fp[22], w_fp[2], w_fp[101], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[22], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17387,7 +17387,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 827 - VVV1_0( w_fp[0], w_fp[103], w_fp[16], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[103], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17406,7 +17406,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 828 - FFV1_0( w_fp[22], w_fp[15], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[22], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17419,7 +17419,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 829 - FFV1_0( w_fp[52], w_fp[2], w_fp[9], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17431,7 +17431,7 @@ namespace mg5amcCpu jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ 
-17443,7 +17443,7 @@ namespace mg5amcCpu jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17462,7 +17462,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 830 - FFV1_0( w_fp[90], w_fp[18], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[18], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17475,7 +17475,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 831 - FFV1_0( w_fp[90], w_fp[2], w_fp[19], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17490,7 +17490,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 832 - VVV1_0( w_fp[96], w_fp[103], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[96], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17509,7 +17509,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 833 - FFV1_0( w_fp[56], w_fp[2], w_fp[96], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[56], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17524,7 +17524,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 834 - VVV1_0( w_fp[0], w_fp[103], w_fp[19], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[103], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17543,7 +17543,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 835 - FFV1_0( w_fp[56], w_fp[18], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[56], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17556,7 +17556,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 836 - FFV1_0( w_fp[52], w_fp[2], w_fp[87], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[87], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17568,7 +17568,7 @@ namespace mg5amcCpu jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[34], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[34], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17580,7 +17580,7 @@ namespace mg5amcCpu jamp_sv[70] -= cxtype( 
0, 1 ) * amp_sv[0]; jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[86], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17599,7 +17599,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 837 - FFV1_0( w_fp[90], w_fp[2], w_fp[30], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[2], w_fp[30], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17607,7 +17607,7 @@ namespace mg5amcCpu jamp_sv[70] -= amp_sv[0]; jamp_sv[94] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - FFV1_0( w_fp[90], w_fp[2], w_fp[31], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[2], w_fp[31], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17615,7 +17615,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[94] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; - FFV1_0( w_fp[90], w_fp[2], w_fp[32], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[90], w_fp[2], w_fp[32], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17630,7 +17630,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 838 - FFV1_0( w_fp[52], w_fp[2], w_fp[92], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[92], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17642,7 +17642,7 @@ namespace mg5amcCpu jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17654,7 +17654,7 @@ namespace mg5amcCpu jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[106], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[106], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17670,10 +17670,10 @@ namespace mg5amcCpu // *** DIAGRAM 839 OF 1240 *** // Wavefunction(s) for diagram number 839 - VVV1P0_1( w_fp[0], w_fp[61], COUPs[0], 0., 0., w_fp[90] ); + VVV1P0_1( w_fp[0], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[90] ); // Amplitude(s) for diagram number 839 - VVV1_0( w_fp[90], w_fp[10], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[90], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17700,7 +17700,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 840 - VVV1_0( w_fp[90], w_fp[11], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[90], 
w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17727,7 +17727,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 841 - VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17747,7 +17747,7 @@ namespace mg5amcCpu jamp_sv[98] -= amp_sv[0]; jamp_sv[104] -= amp_sv[0]; jamp_sv[110] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17767,7 +17767,7 @@ namespace mg5amcCpu jamp_sv[115] += amp_sv[0]; jamp_sv[117] += amp_sv[0]; jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17791,10 +17791,10 @@ namespace mg5amcCpu // *** DIAGRAM 842 OF 1240 *** // Wavefunction(s) for diagram number 842 - VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 0., 0., w_fp[56] ); + VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[56] ); // Amplitude(s) for diagram number 842 - VVV1_0( w_fp[56], w_fp[63], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[56], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17821,7 +17821,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 843 - VVV1_0( w_fp[56], w_fp[64], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[56], w_fp[64], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17848,7 +17848,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 844 - VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17868,7 +17868,7 @@ namespace mg5amcCpu jamp_sv[111] -= amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17888,7 +17888,7 @@ namespace mg5amcCpu jamp_sv[95] += amp_sv[0]; jamp_sv[105] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; - VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) #endif @@ -17915,7 +17915,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 845 - VVV1_0( w_fp[0], w_fp[63], w_fp[11], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[63], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17942,7 +17942,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 846 - VVV1_0( w_fp[0], w_fp[64], w_fp[10], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[64], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17966,12 +17966,12 @@ namespace mg5amcCpu // *** DIAGRAM 847 OF 1240 *** // Wavefunction(s) for diagram number 847 - VVVV1P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 0., 0., w_fp[103] ); - VVVV3P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 0., 0., w_fp[22] ); - VVVV4P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 0., 0., w_fp[21] ); + VVVV1P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[103] ); + VVVV3P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[22] ); + VVVV4P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] ); // Amplitude(s) for diagram number 847 - VVV1_0( w_fp[8], w_fp[6], w_fp[103], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[103], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -17991,7 +17991,7 @@ namespace mg5amcCpu jamp_sv[111] -= amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[22], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18011,7 +18011,7 @@ namespace mg5amcCpu jamp_sv[111] -= amp_sv[0]; jamp_sv[114] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18035,12 +18035,12 @@ namespace mg5amcCpu // *** DIAGRAM 848 OF 1240 *** // Wavefunction(s) for diagram number 848 - VVVV1P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 0., 0., w_fp[105] ); - VVVV3P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 0., 0., w_fp[95] ); - VVVV4P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 0., 0., w_fp[107] ); + VVVV1P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[105] ); + VVVV3P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[95] ); + VVVV4P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[107] ); // Amplitude(s) for diagram number 848 - VVV1_0( w_fp[8], w_fp[5], w_fp[105], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[105], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18060,7 +18060,7 @@ namespace mg5amcCpu jamp_sv[95] += amp_sv[0]; jamp_sv[104] += amp_sv[0]; jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[95], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[95], COUPs[0], 
1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18080,7 +18080,7 @@ namespace mg5amcCpu jamp_sv[91] += amp_sv[0]; jamp_sv[96] += amp_sv[0]; jamp_sv[98] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[107], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[107], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18104,12 +18104,12 @@ namespace mg5amcCpu // *** DIAGRAM 849 OF 1240 *** // Wavefunction(s) for diagram number 849 - VVVV1P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 0., 0., w_fp[115] ); - VVVV3P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 0., 0., w_fp[116] ); - VVVV4P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 0., 0., w_fp[117] ); + VVVV1P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[115] ); + VVVV3P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[116] ); + VVVV4P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[117] ); // Amplitude(s) for diagram number 849 - VVV1_0( w_fp[61], w_fp[6], w_fp[115], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[6], w_fp[115], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18129,7 +18129,7 @@ namespace mg5amcCpu jamp_sv[95] -= amp_sv[0]; jamp_sv[104] -= amp_sv[0]; jamp_sv[110] += amp_sv[0]; - VVV1_0( w_fp[61], w_fp[6], w_fp[116], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[6], w_fp[116], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18149,7 +18149,7 @@ namespace mg5amcCpu jamp_sv[105] += amp_sv[0]; jamp_sv[110] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; - VVV1_0( w_fp[61], w_fp[6], w_fp[117], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[6], w_fp[117], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18173,12 +18173,12 @@ namespace mg5amcCpu // *** DIAGRAM 850 OF 1240 *** // Wavefunction(s) for diagram number 850 - VVVV1P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 0., 0., w_fp[118] ); - VVVV3P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 0., 0., w_fp[119] ); - VVVV4P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 0., 0., w_fp[120] ); + VVVV1P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[118] ); + VVVV3P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[119] ); + VVVV4P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[120] ); // Amplitude(s) for diagram number 850 - VVV1_0( w_fp[61], w_fp[5], w_fp[118], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[5], w_fp[118], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18198,7 +18198,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[117] += amp_sv[0]; jamp_sv[119] -= amp_sv[0]; - VVV1_0( w_fp[61], w_fp[5], w_fp[119], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[5], w_fp[119], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18218,7 +18218,7 @@ 
namespace mg5amcCpu jamp_sv[98] -= amp_sv[0]; jamp_sv[100] -= amp_sv[0]; jamp_sv[101] += amp_sv[0]; - VVV1_0( w_fp[61], w_fp[5], w_fp[120], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[5], w_fp[120], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18245,7 +18245,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 851 - VVVV1_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18265,7 +18265,7 @@ namespace mg5amcCpu jamp_sv[115] -= amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18285,7 +18285,7 @@ namespace mg5amcCpu jamp_sv[95] -= amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18312,7 +18312,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 852 - VVV1_0( w_fp[8], w_fp[29], w_fp[90], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[29], w_fp[90], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18339,7 +18339,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 853 - VVV1_0( w_fp[61], w_fp[29], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[29], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18366,7 +18366,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 854 - VVV1_0( w_fp[61], w_fp[8], w_fp[96], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[61], w_fp[8], w_fp[96], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18393,7 +18393,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 855 - VVV1_0( w_fp[90], w_fp[45], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[90], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18412,7 +18412,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 856 - FFV1_0( w_fp[3], w_fp[44], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[44], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18427,7 +18427,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 857 - 
FFV1_0( w_fp[65], w_fp[102], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18440,7 +18440,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 858 - FFV1_0( w_fp[3], w_fp[102], w_fp[64], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[102], w_fp[64], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18455,7 +18455,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 859 - FFV1_0( w_fp[65], w_fp[44], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18468,7 +18468,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 860 - VVV1_0( w_fp[0], w_fp[64], w_fp[45], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[64], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18487,7 +18487,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 861 - FFV1_0( w_fp[3], w_fp[39], w_fp[105], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[105], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18499,7 +18499,7 @@ namespace mg5amcCpu jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[95], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[95], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18511,7 +18511,7 @@ namespace mg5amcCpu jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[107], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[107], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18530,7 +18530,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 862 - FFV1_0( w_fp[41], w_fp[39], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[39], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18545,7 +18545,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 863 - FFV1_0( w_fp[41], w_fp[102], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[102], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18558,7 +18558,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 864 - FFV1_0( w_fp[62], w_fp[39], w_fp[61], COUPs[1], &_fp[0] ); 
+ FFV1_0( w_fp[62], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18571,7 +18571,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 865 - VVV1_0( w_fp[90], w_fp[51], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[90], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18590,7 +18590,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 866 - FFV1_0( w_fp[3], w_fp[50], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[50], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18605,7 +18605,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 867 - FFV1_0( w_fp[65], w_fp[113], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18618,7 +18618,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 868 - FFV1_0( w_fp[3], w_fp[113], w_fp[63], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[113], w_fp[63], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18633,7 +18633,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 869 - FFV1_0( w_fp[65], w_fp[50], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18646,7 +18646,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 870 - VVV1_0( w_fp[0], w_fp[63], w_fp[51], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[63], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18665,7 +18665,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 871 - FFV1_0( w_fp[3], w_fp[47], w_fp[103], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[103], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18677,7 +18677,7 @@ namespace mg5amcCpu jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[22], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[22], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18689,7 +18689,7 @@ namespace mg5amcCpu jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, 
&_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18708,7 +18708,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 872 - FFV1_0( w_fp[38], w_fp[47], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[47], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18723,7 +18723,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 873 - FFV1_0( w_fp[38], w_fp[113], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[113], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18736,7 +18736,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 874 - FFV1_0( w_fp[104], w_fp[47], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18749,7 +18749,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 875 - VVV1_0( w_fp[90], w_fp[23], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[90], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18768,7 +18768,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 876 - FFV1_0( w_fp[48], w_fp[2], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[48], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18783,7 +18783,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 877 - FFV1_0( w_fp[104], w_fp[93], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18796,7 +18796,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 878 - FFV1_0( w_fp[104], w_fp[2], w_fp[64], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18811,7 +18811,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 879 - FFV1_0( w_fp[48], w_fp[93], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[48], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18824,7 +18824,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 880 - VVV1_0( w_fp[0], w_fp[64], w_fp[23], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[64], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18843,7 +18843,7 @@ namespace mg5amcCpu // (none) // 
Amplitude(s) for diagram number 881 - FFV1_0( w_fp[38], w_fp[2], w_fp[105], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[105], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18855,7 +18855,7 @@ namespace mg5amcCpu jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[95], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18867,7 +18867,7 @@ namespace mg5amcCpu jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[107], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[107], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18886,7 +18886,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 882 - VVV1_0( w_fp[90], w_fp[20], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[90], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18905,7 +18905,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 883 - FFV1_0( w_fp[40], w_fp[2], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[40], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18920,7 +18920,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 884 - FFV1_0( w_fp[62], w_fp[93], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18933,7 +18933,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 885 - FFV1_0( w_fp[62], w_fp[2], w_fp[63], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18948,7 +18948,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 886 - FFV1_0( w_fp[40], w_fp[93], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[40], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18961,7 +18961,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 887 - VVV1_0( w_fp[0], w_fp[63], w_fp[20], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[63], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18980,7 +18980,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 888 - FFV1_0( w_fp[41], w_fp[2], 
w_fp[103], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[103], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -18992,7 +18992,7 @@ namespace mg5amcCpu jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[22], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19004,7 +19004,7 @@ namespace mg5amcCpu jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19023,7 +19023,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 889 - FFV1_0( w_fp[3], w_fp[18], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[18], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19042,7 +19042,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 890 - FFV1_0( w_fp[12], w_fp[2], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19061,7 +19061,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 891 - FFV1_0( w_fp[3], w_fp[93], w_fp[96], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[93], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19080,7 +19080,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 892 - FFV1_0( w_fp[65], w_fp[2], w_fp[96], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19099,7 +19099,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 893 - FFV1_0( w_fp[12], w_fp[93], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19114,7 +19114,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 894 - FFV1_0( w_fp[65], w_fp[18], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[65], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19126,10 +19126,10 @@ namespace mg5amcCpu // *** DIAGRAM 895 OF 1240 *** // Wavefunction(s) for diagram number 895 - VVV1P0_1( w_fp[0], w_fp[66], COUPs[0], 0., 0., w_fp[65] ); + VVV1P0_1( 
w_fp[0], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[65] ); // Amplitude(s) for diagram number 895 - VVV1_0( w_fp[65], w_fp[13], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[65], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19156,7 +19156,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 896 - VVV1_0( w_fp[65], w_fp[11], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[65], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19183,7 +19183,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 897 - VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19203,7 +19203,7 @@ namespace mg5amcCpu jamp_sv[100] -= amp_sv[0]; jamp_sv[106] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19223,7 +19223,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19250,7 +19250,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 898 - VVV1_0( w_fp[56], w_fp[69], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[56], w_fp[69], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19277,7 +19277,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 899 - VVV1_0( w_fp[56], w_fp[70], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[56], w_fp[70], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19304,7 +19304,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 900 - VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19324,7 +19324,7 @@ namespace mg5amcCpu jamp_sv[111] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; - VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19344,7 +19344,7 @@ namespace mg5amcCpu 
jamp_sv[83] += amp_sv[0]; jamp_sv[107] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; - VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19371,7 +19371,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 901 - VVV1_0( w_fp[0], w_fp[69], w_fp[11], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[69], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19398,7 +19398,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 902 - VVV1_0( w_fp[0], w_fp[70], w_fp[13], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[70], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19422,12 +19422,12 @@ namespace mg5amcCpu // *** DIAGRAM 903 OF 1240 *** // Wavefunction(s) for diagram number 903 - VVVV1P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 0., 0., w_fp[93] ); - VVVV3P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 0., 0., w_fp[90] ); - VVVV4P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 0., 0., w_fp[21] ); + VVVV1P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[93] ); + VVVV3P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[90] ); + VVVV4P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] ); // Amplitude(s) for diagram number 903 - VVV1_0( w_fp[8], w_fp[6], w_fp[93], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[93], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19447,7 +19447,7 @@ namespace mg5amcCpu jamp_sv[111] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[90], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[90], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19467,7 +19467,7 @@ namespace mg5amcCpu jamp_sv[109] += amp_sv[0]; jamp_sv[116] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19491,12 +19491,12 @@ namespace mg5amcCpu // *** DIAGRAM 904 OF 1240 *** // Wavefunction(s) for diagram number 904 - VVVV1P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 0., 0., w_fp[22] ); - VVVV3P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 0., 0., w_fp[103] ); - VVVV4P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 0., 0., w_fp[63] ); + VVVV1P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] ); + VVVV3P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[103] ); + VVVV4P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[63] ); // Amplitude(s) for diagram number 904 - VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here 
the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19516,7 +19516,7 @@ namespace mg5amcCpu jamp_sv[82] += amp_sv[0]; jamp_sv[106] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[103], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[103], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19536,7 +19536,7 @@ namespace mg5amcCpu jamp_sv[82] += amp_sv[0]; jamp_sv[97] += amp_sv[0]; jamp_sv[100] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[63], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[63], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19560,12 +19560,12 @@ namespace mg5amcCpu // *** DIAGRAM 905 OF 1240 *** // Wavefunction(s) for diagram number 905 - VVVV1P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 0., 0., w_fp[107] ); - VVVV3P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 0., 0., w_fp[95] ); - VVVV4P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 0., 0., w_fp[105] ); + VVVV1P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] ); + VVVV3P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] ); + VVVV4P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] ); // Amplitude(s) for diagram number 905 - VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19585,7 +19585,7 @@ namespace mg5amcCpu jamp_sv[82] -= amp_sv[0]; jamp_sv[106] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - VVV1_0( w_fp[66], w_fp[6], w_fp[95], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[66], w_fp[6], w_fp[95], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19605,7 +19605,7 @@ namespace mg5amcCpu jamp_sv[107] += amp_sv[0]; jamp_sv[116] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; - VVV1_0( w_fp[66], w_fp[6], w_fp[105], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[66], w_fp[6], w_fp[105], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19632,7 +19632,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 906 - VVV1_0( w_fp[66], w_fp[4], w_fp[118], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[66], w_fp[4], w_fp[118], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19652,7 +19652,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[117] += amp_sv[0]; - VVV1_0( w_fp[66], w_fp[4], w_fp[119], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[66], w_fp[4], w_fp[119], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19672,7 +19672,7 @@ namespace mg5amcCpu jamp_sv[98] -= amp_sv[0]; jamp_sv[99] += amp_sv[0]; jamp_sv[100] -= amp_sv[0]; - VVV1_0( w_fp[66], w_fp[4], w_fp[120], COUPs[0], &_fp[0] ); + VVV1_0( 
w_fp[66], w_fp[4], w_fp[120], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19699,7 +19699,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 907 - VVVV1_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19719,7 +19719,7 @@ namespace mg5amcCpu jamp_sv[109] -= amp_sv[0]; jamp_sv[111] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19739,7 +19739,7 @@ namespace mg5amcCpu jamp_sv[83] -= amp_sv[0]; jamp_sv[111] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19766,7 +19766,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 908 - VVV1_0( w_fp[8], w_fp[27], w_fp[65], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[27], w_fp[65], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19793,7 +19793,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 909 - VVV1_0( w_fp[66], w_fp[27], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[66], w_fp[27], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19820,7 +19820,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 910 - VVV1_0( w_fp[66], w_fp[8], w_fp[101], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[66], w_fp[8], w_fp[101], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19847,7 +19847,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 911 - VVV1_0( w_fp[65], w_fp[37], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[65], w_fp[37], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19866,7 +19866,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 912 - FFV1_0( w_fp[3], w_fp[36], w_fp[65], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[36], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19881,7 +19881,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 913 - FFV1_0( w_fp[71], w_fp[114], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[71], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the 
code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19894,7 +19894,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 914 - FFV1_0( w_fp[3], w_fp[114], w_fp[70], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[114], w_fp[70], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19909,7 +19909,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 915 - FFV1_0( w_fp[71], w_fp[36], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[71], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19922,7 +19922,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 916 - VVV1_0( w_fp[0], w_fp[70], w_fp[37], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[70], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19941,7 +19941,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 917 - FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19953,7 +19953,7 @@ namespace mg5amcCpu jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[103], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[103], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19965,7 +19965,7 @@ namespace mg5amcCpu jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[63], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[63], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19984,7 +19984,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 918 - FFV1_0( w_fp[41], w_fp[33], w_fp[65], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[33], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -19999,7 +19999,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 919 - FFV1_0( w_fp[41], w_fp[114], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[114], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20012,7 +20012,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 920 - FFV1_0( w_fp[62], w_fp[33], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) #endif @@ -20025,7 +20025,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 921 - VVV1_0( w_fp[65], w_fp[51], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[65], w_fp[51], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20044,7 +20044,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 922 - FFV1_0( w_fp[3], w_fp[49], w_fp[65], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[49], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20059,7 +20059,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 923 - FFV1_0( w_fp[71], w_fp[113], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[71], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20072,7 +20072,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 924 - FFV1_0( w_fp[3], w_fp[113], w_fp[69], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[113], w_fp[69], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20087,7 +20087,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 925 - FFV1_0( w_fp[71], w_fp[49], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[71], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20100,7 +20100,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 926 - VVV1_0( w_fp[0], w_fp[69], w_fp[51], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[69], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20119,7 +20119,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 927 - FFV1_0( w_fp[3], w_fp[47], w_fp[93], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[93], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20131,7 +20131,7 @@ namespace mg5amcCpu jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20143,7 +20143,7 @@ namespace mg5amcCpu jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20162,7 
+20162,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 928 - FFV1_0( w_fp[46], w_fp[47], w_fp[65], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[47], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20177,7 +20177,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 929 - FFV1_0( w_fp[46], w_fp[113], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[113], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20190,7 +20190,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 930 - FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20203,7 +20203,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 931 - VVV1_0( w_fp[65], w_fp[54], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[65], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20222,7 +20222,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 932 - FFV1_0( w_fp[53], w_fp[2], w_fp[65], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[53], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20237,7 +20237,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 933 - FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20250,7 +20250,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 934 - FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20265,7 +20265,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 935 - FFV1_0( w_fp[53], w_fp[94], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[53], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20278,7 +20278,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 936 - VVV1_0( w_fp[0], w_fp[70], w_fp[54], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[70], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20297,7 +20297,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 937 - FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20309,7 +20309,7 @@ namespace mg5amcCpu jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[103], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[103], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20321,7 +20321,7 @@ namespace mg5amcCpu jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[63], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20340,7 +20340,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 938 - VVV1_0( w_fp[65], w_fp[20], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[65], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20359,7 +20359,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 939 - FFV1_0( w_fp[28], w_fp[2], w_fp[65], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[28], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20374,7 +20374,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 940 - FFV1_0( w_fp[62], w_fp[94], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20387,7 +20387,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 941 - FFV1_0( w_fp[62], w_fp[2], w_fp[69], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20402,7 +20402,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 942 - FFV1_0( w_fp[28], w_fp[94], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[28], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20415,7 +20415,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 943 - VVV1_0( w_fp[0], w_fp[69], w_fp[20], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[69], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20434,7 +20434,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 944 - FFV1_0( w_fp[41], w_fp[2], w_fp[93], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[93], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with 
multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20446,7 +20446,7 @@ namespace mg5amcCpu jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20458,7 +20458,7 @@ namespace mg5amcCpu jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20477,7 +20477,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 945 - FFV1_0( w_fp[3], w_fp[15], w_fp[65], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[15], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20496,7 +20496,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 946 - FFV1_0( w_fp[14], w_fp[2], w_fp[65], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[14], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20515,7 +20515,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 947 - FFV1_0( w_fp[3], w_fp[94], w_fp[101], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[94], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20534,7 +20534,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 948 - FFV1_0( w_fp[71], w_fp[2], w_fp[101], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[71], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20553,7 +20553,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 949 - FFV1_0( w_fp[14], w_fp[94], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[14], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20568,7 +20568,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 950 - FFV1_0( w_fp[71], w_fp[15], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[71], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20580,10 +20580,10 @@ namespace mg5amcCpu // *** DIAGRAM 951 OF 1240 *** // Wavefunction(s) for diagram number 951 - VVV1P0_1( w_fp[0], w_fp[72], COUPs[0], 0., 0., w_fp[71] ); + VVV1P0_1( w_fp[0], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[71] ); // Amplitude(s) for diagram number 951 - VVV1_0( w_fp[71], w_fp[13], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( 
w_fp[71], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20610,7 +20610,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 952 - VVV1_0( w_fp[71], w_fp[10], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[71], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20637,7 +20637,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 953 - VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20657,7 +20657,7 @@ namespace mg5amcCpu jamp_sv[82] -= amp_sv[0]; jamp_sv[92] += amp_sv[0]; jamp_sv[103] -= amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20677,7 +20677,7 @@ namespace mg5amcCpu jamp_sv[89] -= amp_sv[0]; jamp_sv[92] += amp_sv[0]; jamp_sv[102] -= amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20704,7 +20704,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 954 - VVV1_0( w_fp[56], w_fp[74], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[56], w_fp[74], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20731,7 +20731,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 955 - VVV1_0( w_fp[56], w_fp[75], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[56], w_fp[75], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20758,7 +20758,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 956 - VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20778,7 +20778,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[93] -= amp_sv[0]; jamp_sv[105] += amp_sv[0]; - VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20798,7 +20798,7 @@ namespace mg5amcCpu jamp_sv[83] += amp_sv[0]; jamp_sv[93] -= amp_sv[0]; jamp_sv[107] += amp_sv[0]; - VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], &_fp[0] ); + VVVV4_0( 
w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20825,7 +20825,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 957 - VVV1_0( w_fp[0], w_fp[74], w_fp[10], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[74], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20852,7 +20852,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 958 - VVV1_0( w_fp[0], w_fp[75], w_fp[13], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[75], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20876,12 +20876,12 @@ namespace mg5amcCpu // *** DIAGRAM 959 OF 1240 *** // Wavefunction(s) for diagram number 959 - VVVV1P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 0., 0., w_fp[94] ); - VVVV3P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 0., 0., w_fp[65] ); - VVVV4P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 0., 0., w_fp[21] ); + VVVV1P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[94] ); + VVVV3P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[65] ); + VVVV4P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] ); // Amplitude(s) for diagram number 959 - VVV1_0( w_fp[8], w_fp[5], w_fp[94], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[94], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20901,7 +20901,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[93] -= amp_sv[0]; jamp_sv[104] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[65], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[65], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20921,7 +20921,7 @@ namespace mg5amcCpu jamp_sv[93] -= amp_sv[0]; jamp_sv[102] -= amp_sv[0]; jamp_sv[104] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20945,12 +20945,12 @@ namespace mg5amcCpu // *** DIAGRAM 960 OF 1240 *** // Wavefunction(s) for diagram number 960 - VVVV1P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 0., 0., w_fp[90] ); - VVVV3P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 0., 0., w_fp[93] ); - VVVV4P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 0., 0., w_fp[69] ); + VVVV1P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[90] ); + VVVV3P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[93] ); + VVVV4P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[69] ); // Amplitude(s) for diagram number 960 - VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20970,7 +20970,7 @@ namespace mg5amcCpu jamp_sv[82] += 
amp_sv[0]; jamp_sv[92] -= amp_sv[0]; jamp_sv[106] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[93], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[93], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -20990,7 +20990,7 @@ namespace mg5amcCpu jamp_sv[76] -= amp_sv[0]; jamp_sv[103] -= amp_sv[0]; jamp_sv[106] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[69], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[69], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21017,7 +21017,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 961 - VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21037,7 +21037,7 @@ namespace mg5amcCpu jamp_sv[82] -= amp_sv[0]; jamp_sv[92] += amp_sv[0]; jamp_sv[106] -= amp_sv[0]; - VVV1_0( w_fp[72], w_fp[5], w_fp[95], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[72], w_fp[5], w_fp[95], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21057,7 +21057,7 @@ namespace mg5amcCpu jamp_sv[93] -= amp_sv[0]; jamp_sv[106] -= amp_sv[0]; jamp_sv[107] += amp_sv[0]; - VVV1_0( w_fp[72], w_fp[5], w_fp[105], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[72], w_fp[5], w_fp[105], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21084,7 +21084,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 962 - VVV1_0( w_fp[72], w_fp[4], w_fp[115], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[72], w_fp[4], w_fp[115], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21104,7 +21104,7 @@ namespace mg5amcCpu jamp_sv[89] -= amp_sv[0]; jamp_sv[93] += amp_sv[0]; jamp_sv[104] -= amp_sv[0]; - VVV1_0( w_fp[72], w_fp[4], w_fp[116], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[72], w_fp[4], w_fp[116], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21124,7 +21124,7 @@ namespace mg5amcCpu jamp_sv[76] -= amp_sv[0]; jamp_sv[104] -= amp_sv[0]; jamp_sv[105] += amp_sv[0]; - VVV1_0( w_fp[72], w_fp[4], w_fp[117], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[72], w_fp[4], w_fp[117], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21151,7 +21151,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 963 - VVVV1_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21171,7 +21171,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[102] 
+= amp_sv[0]; jamp_sv[103] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21191,7 +21191,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[105] += amp_sv[0]; jamp_sv[107] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21218,7 +21218,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 964 - VVV1_0( w_fp[8], w_fp[24], w_fp[71], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[24], w_fp[71], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21245,7 +21245,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 965 - VVV1_0( w_fp[72], w_fp[24], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[72], w_fp[24], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21272,7 +21272,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 966 - VVV1_0( w_fp[72], w_fp[8], w_fp[98], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[72], w_fp[8], w_fp[98], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21299,7 +21299,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 967 - VVV1_0( w_fp[71], w_fp[37], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[71], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21318,7 +21318,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 968 - FFV1_0( w_fp[3], w_fp[35], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[35], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21333,7 +21333,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 969 - FFV1_0( w_fp[76], w_fp[114], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[76], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21346,7 +21346,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 970 - FFV1_0( w_fp[3], w_fp[114], w_fp[75], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[114], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21361,7 +21361,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 971 - FFV1_0( w_fp[76], w_fp[35], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[76], w_fp[35], w_fp[0], COUPs[1], 1.0, 
&_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21374,7 +21374,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 972 - VVV1_0( w_fp[0], w_fp[75], w_fp[37], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[75], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21393,7 +21393,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 973 - FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21405,7 +21405,7 @@ namespace mg5amcCpu jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[93], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[93], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21417,7 +21417,7 @@ namespace mg5amcCpu jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[69], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[69], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21436,7 +21436,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 974 - FFV1_0( w_fp[38], w_fp[33], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[33], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21451,7 +21451,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 975 - FFV1_0( w_fp[38], w_fp[114], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[114], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21464,7 +21464,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 976 - FFV1_0( w_fp[104], w_fp[33], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21477,7 +21477,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 977 - VVV1_0( w_fp[71], w_fp[45], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[71], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21496,7 +21496,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 978 - FFV1_0( w_fp[3], w_fp[43], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[43], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the 
code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21511,7 +21511,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 979 - FFV1_0( w_fp[76], w_fp[102], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[76], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21524,7 +21524,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 980 - FFV1_0( w_fp[3], w_fp[102], w_fp[74], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[102], w_fp[74], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21539,7 +21539,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 981 - FFV1_0( w_fp[76], w_fp[43], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[76], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21552,7 +21552,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 982 - VVV1_0( w_fp[0], w_fp[74], w_fp[45], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[74], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21571,7 +21571,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 983 - FFV1_0( w_fp[3], w_fp[39], w_fp[94], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[94], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21583,7 +21583,7 @@ namespace mg5amcCpu jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[65], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21595,7 +21595,7 @@ namespace mg5amcCpu jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21614,7 +21614,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 984 - FFV1_0( w_fp[46], w_fp[39], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21629,7 +21629,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 985 - FFV1_0( w_fp[46], w_fp[102], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[102], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) #endif @@ -21642,7 +21642,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 986 - FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21655,7 +21655,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 987 - VVV1_0( w_fp[71], w_fp[54], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[71], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21674,7 +21674,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 988 - FFV1_0( w_fp[7], w_fp[2], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[7], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21689,7 +21689,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 989 - FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21702,7 +21702,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 990 - FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21717,7 +21717,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 991 - FFV1_0( w_fp[7], w_fp[97], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[7], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21730,7 +21730,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 992 - VVV1_0( w_fp[0], w_fp[75], w_fp[54], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[75], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21749,7 +21749,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 993 - FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21761,7 +21761,7 @@ namespace mg5amcCpu jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[93], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[93], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21773,7 +21773,7 @@ namespace mg5amcCpu jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] += 
cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[69], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21792,7 +21792,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 994 - VVV1_0( w_fp[71], w_fp[23], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[71], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21811,7 +21811,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 995 - FFV1_0( w_fp[25], w_fp[2], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[25], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21826,7 +21826,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 996 - FFV1_0( w_fp[104], w_fp[97], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21839,7 +21839,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 997 - FFV1_0( w_fp[104], w_fp[2], w_fp[74], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21854,7 +21854,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 998 - FFV1_0( w_fp[25], w_fp[97], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[25], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21867,7 +21867,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 999 - VVV1_0( w_fp[0], w_fp[74], w_fp[23], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[74], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21886,7 +21886,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1000 - FFV1_0( w_fp[38], w_fp[2], w_fp[94], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[94], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21898,7 +21898,7 @@ namespace mg5amcCpu jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[65], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21910,7 +21910,7 @@ namespace mg5amcCpu jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] -= cxtype( 0, 1 ) * 
amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21929,7 +21929,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1001 - FFV1_0( w_fp[3], w_fp[17], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[17], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21948,7 +21948,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1002 - FFV1_0( w_fp[26], w_fp[2], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[26], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21967,7 +21967,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1003 - FFV1_0( w_fp[3], w_fp[97], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[97], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -21986,7 +21986,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1004 - FFV1_0( w_fp[76], w_fp[2], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[76], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22005,7 +22005,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1005 - FFV1_0( w_fp[26], w_fp[97], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[26], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22020,7 +22020,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1006 - FFV1_0( w_fp[76], w_fp[17], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[76], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22035,7 +22035,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1007 - VVV1_0( w_fp[56], w_fp[59], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[56], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22062,7 +22062,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1008 - VVV1_0( w_fp[56], w_fp[1], w_fp[42], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[56], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22089,7 +22089,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1009 - VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated 
with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22109,7 +22109,7 @@ namespace mg5amcCpu jamp_sv[107] -= amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22129,7 +22129,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[105] += amp_sv[0]; jamp_sv[107] -= amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22156,7 +22156,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1010 - VVV1_0( w_fp[98], w_fp[108], w_fp[6], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[98], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22183,7 +22183,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1011 - VVV1_0( w_fp[98], w_fp[1], w_fp[11], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[98], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22210,7 +22210,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1012 - VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22230,7 +22230,7 @@ namespace mg5amcCpu jamp_sv[101] += amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[118] -= amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22250,7 +22250,7 @@ namespace mg5amcCpu jamp_sv[103] += amp_sv[0]; jamp_sv[105] += amp_sv[0]; jamp_sv[107] -= amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22277,7 +22277,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1013 - VVV1_0( w_fp[0], w_fp[108], w_fp[42], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[108], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22304,7 +22304,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1014 - VVV1_0( w_fp[0], w_fp[59], w_fp[11], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[59], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated 
with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22328,12 +22328,12 @@ namespace mg5amcCpu // *** DIAGRAM 1015 OF 1240 *** // Wavefunction(s) for diagram number 1015 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 0., 0., w_fp[11] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 0., 0., w_fp[42] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 0., 0., w_fp[76] ); + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[11] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[42] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[76] ); // Amplitude(s) for diagram number 1015 - VVV1_0( w_fp[24], w_fp[6], w_fp[11], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[24], w_fp[6], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22353,7 +22353,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[118] -= amp_sv[0]; - VVV1_0( w_fp[24], w_fp[6], w_fp[42], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[24], w_fp[6], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22373,7 +22373,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[113] += amp_sv[0]; jamp_sv[119] -= amp_sv[0]; - VVV1_0( w_fp[24], w_fp[6], w_fp[76], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[24], w_fp[6], w_fp[76], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22397,12 +22397,12 @@ namespace mg5amcCpu // *** DIAGRAM 1016 OF 1240 *** // Wavefunction(s) for diagram number 1016 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 0., 0., w_fp[97] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 0., 0., w_fp[71] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 0., 0., w_fp[21] ); + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[97] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[71] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[21] ); // Amplitude(s) for diagram number 1016 - VVV1_0( w_fp[8], w_fp[6], w_fp[97], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[97], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22422,7 +22422,7 @@ namespace mg5amcCpu jamp_sv[107] -= amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22442,7 +22442,7 @@ namespace mg5amcCpu jamp_sv[107] -= amp_sv[0]; jamp_sv[112] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22469,7 +22469,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1017 - VVV1_0( w_fp[1], w_fp[24], 
w_fp[118], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[24], w_fp[118], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22489,7 +22489,7 @@ namespace mg5amcCpu jamp_sv[107] += amp_sv[0]; jamp_sv[113] += amp_sv[0]; jamp_sv[119] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[24], w_fp[119], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[24], w_fp[119], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22509,7 +22509,7 @@ namespace mg5amcCpu jamp_sv[97] -= amp_sv[0]; jamp_sv[99] -= amp_sv[0]; jamp_sv[101] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[24], w_fp[120], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[24], w_fp[120], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22536,7 +22536,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1018 - VVV1_0( w_fp[1], w_fp[8], w_fp[85], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[85], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22556,7 +22556,7 @@ namespace mg5amcCpu jamp_sv[88] -= amp_sv[0]; jamp_sv[112] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[112], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[112], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22576,7 +22576,7 @@ namespace mg5amcCpu jamp_sv[88] -= amp_sv[0]; jamp_sv[99] -= amp_sv[0]; jamp_sv[101] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22603,7 +22603,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1019 - VVV1_0( w_fp[56], w_fp[68], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[56], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22630,7 +22630,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1020 - VVV1_0( w_fp[56], w_fp[1], w_fp[16], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[56], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22657,7 +22657,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1021 - VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22677,7 +22677,7 @@ namespace mg5amcCpu jamp_sv[89] -= amp_sv[0]; jamp_sv[95] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], &_fp[0] ); + VVVV3_0( 
w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22697,7 +22697,7 @@ namespace mg5amcCpu jamp_sv[83] -= amp_sv[0]; jamp_sv[111] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22724,7 +22724,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1022 - VVV1_0( w_fp[101], w_fp[108], w_fp[5], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[101], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22751,7 +22751,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1023 - VVV1_0( w_fp[101], w_fp[1], w_fp[10], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[101], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22778,7 +22778,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1024 - VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22798,7 +22798,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[94] -= amp_sv[0]; jamp_sv[109] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22818,7 +22818,7 @@ namespace mg5amcCpu jamp_sv[83] -= amp_sv[0]; jamp_sv[108] -= amp_sv[0]; jamp_sv[109] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22845,7 +22845,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1025 - VVV1_0( w_fp[0], w_fp[108], w_fp[16], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[108], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22872,7 +22872,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1026 - VVV1_0( w_fp[0], w_fp[68], w_fp[10], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[68], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22899,7 +22899,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1027 - VVV1_0( w_fp[27], w_fp[5], w_fp[11], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[27], w_fp[5], w_fp[11], COUPs[0], 
1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22919,7 +22919,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[94] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; - VVV1_0( w_fp[27], w_fp[5], w_fp[42], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[27], w_fp[5], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22939,7 +22939,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[95] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; - VVV1_0( w_fp[27], w_fp[5], w_fp[76], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[27], w_fp[5], w_fp[76], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22963,12 +22963,12 @@ namespace mg5amcCpu // *** DIAGRAM 1028 OF 1240 *** // Wavefunction(s) for diagram number 1028 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 0., 0., w_fp[10] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 0., 0., w_fp[16] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 0., 0., w_fp[111] ); + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[10] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[16] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[111] ); // Amplitude(s) for diagram number 1028 - VVV1_0( w_fp[8], w_fp[5], w_fp[10], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -22988,7 +22988,7 @@ namespace mg5amcCpu jamp_sv[89] -= amp_sv[0]; jamp_sv[95] += amp_sv[0]; jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[16], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23008,7 +23008,7 @@ namespace mg5amcCpu jamp_sv[88] -= amp_sv[0]; jamp_sv[94] += amp_sv[0]; jamp_sv[108] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23035,7 +23035,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1029 - VVV1_0( w_fp[1], w_fp[27], w_fp[115], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[27], w_fp[115], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23055,7 +23055,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[95] -= amp_sv[0]; jamp_sv[110] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[27], w_fp[116], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[27], w_fp[116], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23075,7 +23075,7 @@ namespace mg5amcCpu jamp_sv[77] += amp_sv[0]; jamp_sv[110] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; - VVV1_0( w_fp[1], 
w_fp[27], w_fp[117], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[27], w_fp[117], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23102,7 +23102,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1030 - VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23122,7 +23122,7 @@ namespace mg5amcCpu jamp_sv[88] -= amp_sv[0]; jamp_sv[94] += amp_sv[0]; jamp_sv[112] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23142,7 +23142,7 @@ namespace mg5amcCpu jamp_sv[77] += amp_sv[0]; jamp_sv[109] += amp_sv[0]; jamp_sv[112] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23169,7 +23169,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1031 - VVV1_0( w_fp[56], w_fp[67], w_fp[4], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[56], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23196,7 +23196,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1032 - VVV1_0( w_fp[56], w_fp[1], w_fp[19], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[56], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23223,7 +23223,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1033 - VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23243,7 +23243,7 @@ namespace mg5amcCpu jamp_sv[95] -= amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23263,7 +23263,7 @@ namespace mg5amcCpu jamp_sv[71] += amp_sv[0]; jamp_sv[93] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23290,7 +23290,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1034 - VVV1_0( w_fp[96], w_fp[108], w_fp[4], COUPs[0], &_fp[0] ); + 
VVV1_0( w_fp[96], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23317,7 +23317,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1035 - VVV1_0( w_fp[96], w_fp[1], w_fp[13], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[96], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23344,7 +23344,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1036 - VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23364,7 +23364,7 @@ namespace mg5amcCpu jamp_sv[70] -= amp_sv[0]; jamp_sv[91] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23384,7 +23384,7 @@ namespace mg5amcCpu jamp_sv[91] -= amp_sv[0]; jamp_sv[114] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23411,7 +23411,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1037 - VVV1_0( w_fp[0], w_fp[108], w_fp[19], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[108], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23438,7 +23438,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1038 - VVV1_0( w_fp[0], w_fp[67], w_fp[13], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[67], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23465,7 +23465,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1039 - VVV1_0( w_fp[4], w_fp[29], w_fp[11], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[4], w_fp[29], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23485,7 +23485,7 @@ namespace mg5amcCpu jamp_sv[70] += amp_sv[0]; jamp_sv[94] += amp_sv[0]; jamp_sv[118] -= amp_sv[0]; - VVV1_0( w_fp[4], w_fp[29], w_fp[42], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[4], w_fp[29], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23505,7 +23505,7 @@ namespace mg5amcCpu jamp_sv[71] += amp_sv[0]; jamp_sv[95] += amp_sv[0]; jamp_sv[119] -= amp_sv[0]; - VVV1_0( w_fp[4], w_fp[29], w_fp[76], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[4], w_fp[29], w_fp[76], COUPs[0], 
1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23529,12 +23529,12 @@ namespace mg5amcCpu // *** DIAGRAM 1040 OF 1240 *** // Wavefunction(s) for diagram number 1040 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 0., 0., w_fp[76] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 0., 0., w_fp[42] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 0., 0., w_fp[11] ); + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[76] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[42] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[11] ); // Amplitude(s) for diagram number 1040 - VVV1_0( w_fp[8], w_fp[4], w_fp[76], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[76], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23554,7 +23554,7 @@ namespace mg5amcCpu jamp_sv[71] += amp_sv[0]; jamp_sv[92] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[42], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23574,7 +23574,7 @@ namespace mg5amcCpu jamp_sv[70] += amp_sv[0]; jamp_sv[90] += amp_sv[0]; jamp_sv[114] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[11], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23601,7 +23601,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1041 - VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23621,7 +23621,7 @@ namespace mg5amcCpu jamp_sv[71] -= amp_sv[0]; jamp_sv[92] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[29], w_fp[95], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[29], w_fp[95], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23641,7 +23641,7 @@ namespace mg5amcCpu jamp_sv[93] += amp_sv[0]; jamp_sv[116] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[29], w_fp[105], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[29], w_fp[105], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23668,7 +23668,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1042 - VVV1_0( w_fp[1], w_fp[8], w_fp[87], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[87], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23688,7 +23688,7 @@ namespace mg5amcCpu jamp_sv[70] -= amp_sv[0]; jamp_sv[94] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[34], COUPs[0], 
&_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[34], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23708,7 +23708,7 @@ namespace mg5amcCpu jamp_sv[70] -= amp_sv[0]; jamp_sv[91] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[86], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23735,7 +23735,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1043 - VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23755,7 +23755,7 @@ namespace mg5amcCpu jamp_sv[95] -= amp_sv[0]; jamp_sv[118] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23775,7 +23775,7 @@ namespace mg5amcCpu jamp_sv[71] -= amp_sv[0]; jamp_sv[95] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23795,7 +23795,7 @@ namespace mg5amcCpu jamp_sv[70] -= amp_sv[0]; jamp_sv[94] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23815,7 +23815,7 @@ namespace mg5amcCpu jamp_sv[95] -= amp_sv[0]; jamp_sv[112] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23835,7 +23835,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[95] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23855,7 +23855,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[94] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23875,7 +23875,7 @@ namespace 
mg5amcCpu jamp_sv[113] += amp_sv[0]; jamp_sv[118] += amp_sv[0]; jamp_sv[119] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23895,7 +23895,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[113] += amp_sv[0]; jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23922,7 +23922,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1044 - VVV1_0( w_fp[1], w_fp[30], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[30], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23942,7 +23942,7 @@ namespace mg5amcCpu jamp_sv[71] -= amp_sv[0]; jamp_sv[95] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[31], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[31], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23962,7 +23962,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[95] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[32], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[32], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -23989,7 +23989,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1045 - VVV1_0( w_fp[1], w_fp[8], w_fp[92], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[92], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24009,7 +24009,7 @@ namespace mg5amcCpu jamp_sv[70] -= amp_sv[0]; jamp_sv[94] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24029,7 +24029,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[94] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[106], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[106], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24056,7 +24056,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1046 - FFV1_0( w_fp[58], w_fp[114], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[58], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24068,7 +24068,7 @@ namespace mg5amcCpu // 
(none) // Amplitude(s) for diagram number 1047 - FFV1_0( w_fp[48], w_fp[114], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[48], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24080,7 +24080,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1048 - FFV1_0( w_fp[104], w_fp[100], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24092,7 +24092,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1049 - FFV1_0( w_fp[104], w_fp[36], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24104,7 +24104,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1050 - FFV1_0( w_fp[48], w_fp[100], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[48], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24116,7 +24116,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1051 - FFV1_0( w_fp[58], w_fp[36], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[58], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24128,7 +24128,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1052 - FFV1_0( w_fp[60], w_fp[114], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24140,7 +24140,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1053 - FFV1_0( w_fp[40], w_fp[114], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[40], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24152,7 +24152,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1054 - FFV1_0( w_fp[62], w_fp[100], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24164,7 +24164,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1055 - FFV1_0( w_fp[62], w_fp[35], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24176,7 +24176,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1056 - FFV1_0( w_fp[40], w_fp[100], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[40], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24188,7 +24188,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1057 - FFV1_0( w_fp[60], w_fp[35], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24200,7 +24200,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1058 - FFV1_0( w_fp[3], w_fp[114], w_fp[67], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[114], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24215,7 +24215,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1059 - FFV1_0( w_fp[12], w_fp[114], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24228,7 +24228,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1060 - FFV1_0( w_fp[3], w_fp[100], w_fp[96], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[100], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24243,7 +24243,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1061 - VVV1_0( w_fp[96], w_fp[1], w_fp[37], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[96], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24262,7 +24262,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1062 - FFV1_0( w_fp[12], w_fp[100], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[12], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24275,7 +24275,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1063 - VVV1_0( w_fp[0], w_fp[67], w_fp[37], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[67], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24294,7 +24294,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1064 - FFV1_0( w_fp[3], w_fp[33], w_fp[76], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[76], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24306,7 +24306,7 @@ namespace mg5amcCpu jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[42], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) 
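// The '#ifdef MGONGPU_SUPPORTS_MULTICHANNEL' blocks repeated through these hunks
// hold only the placeholder comment above. A minimal sketch of the per-diagram
// bookkeeping such a block could perform (#473 describes the intent), assuming
// hypothetical names -- channelId, idiag, numerators, denominators -- that the
// patch itself never shows:
#include <complex>

using cxtype = std::complex<double>;

inline double cxabs2( const cxtype& c ) { return std::norm( c ); } // |amp|^2

// Accumulate one diagram's |amplitude|^2 into the multichannel weights: the
// numerator tracks only the diagram selected by this channel, the denominator
// sums every diagram (channelId == 0 meaning no single-diagram enhancement).
void updateMultichannel( unsigned int channelId, unsigned int idiag, const cxtype& amp,
                         double& numerators, double& denominators )
{
  if( channelId == idiag ) numerators += cxabs2( amp );
  if( channelId != 0 ) denominators += cxabs2( amp );
}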
#endif @@ -24318,7 +24318,7 @@ namespace mg5amcCpu jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[11], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24337,7 +24337,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1065 - FFV1_0( w_fp[78], w_fp[102], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[78], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24349,7 +24349,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1066 - FFV1_0( w_fp[53], w_fp[102], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[53], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24361,7 +24361,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1067 - FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24373,7 +24373,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1068 - FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24385,7 +24385,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1069 - FFV1_0( w_fp[53], w_fp[89], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[53], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24397,7 +24397,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1070 - FFV1_0( w_fp[78], w_fp[44], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[78], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24409,7 +24409,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1071 - FFV1_0( w_fp[60], w_fp[102], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24421,7 +24421,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1072 - FFV1_0( w_fp[28], w_fp[102], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[28], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24433,7 +24433,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1073 - FFV1_0( w_fp[62], w_fp[89], 
w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24445,7 +24445,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1074 - FFV1_0( w_fp[62], w_fp[43], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24457,7 +24457,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1075 - FFV1_0( w_fp[28], w_fp[89], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[28], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24469,7 +24469,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1076 - FFV1_0( w_fp[60], w_fp[43], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24481,7 +24481,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1077 - FFV1_0( w_fp[3], w_fp[102], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[102], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24496,7 +24496,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1078 - FFV1_0( w_fp[14], w_fp[102], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[14], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24509,7 +24509,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1079 - FFV1_0( w_fp[3], w_fp[89], w_fp[101], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[89], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24524,7 +24524,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1080 - VVV1_0( w_fp[101], w_fp[1], w_fp[45], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[101], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24543,7 +24543,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1081 - FFV1_0( w_fp[14], w_fp[89], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[14], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24556,7 +24556,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1082 - VVV1_0( w_fp[0], w_fp[68], w_fp[45], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[68], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) #endif @@ -24575,7 +24575,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1083 - FFV1_0( w_fp[3], w_fp[39], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24587,7 +24587,7 @@ namespace mg5amcCpu jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24599,7 +24599,7 @@ namespace mg5amcCpu jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24618,7 +24618,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1084 - FFV1_0( w_fp[78], w_fp[113], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[78], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24630,7 +24630,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1085 - FFV1_0( w_fp[7], w_fp[113], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[7], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24642,7 +24642,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1086 - FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24654,7 +24654,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1087 - FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24666,7 +24666,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1088 - FFV1_0( w_fp[7], w_fp[91], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[7], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24678,7 +24678,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1089 - FFV1_0( w_fp[78], w_fp[50], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[78], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24690,7 +24690,7 @@ namespace 
mg5amcCpu // (none) // Amplitude(s) for diagram number 1090 - FFV1_0( w_fp[58], w_fp[113], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[58], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24702,7 +24702,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1091 - FFV1_0( w_fp[25], w_fp[113], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[25], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24714,7 +24714,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1092 - FFV1_0( w_fp[104], w_fp[91], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[91], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24726,7 +24726,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1093 - FFV1_0( w_fp[104], w_fp[49], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[49], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24738,7 +24738,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1094 - FFV1_0( w_fp[25], w_fp[91], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[25], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24750,7 +24750,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1095 - FFV1_0( w_fp[58], w_fp[49], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[58], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24762,7 +24762,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1096 - FFV1_0( w_fp[3], w_fp[113], w_fp[59], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[113], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24777,7 +24777,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1097 - FFV1_0( w_fp[26], w_fp[113], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[26], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24790,7 +24790,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1098 - FFV1_0( w_fp[3], w_fp[91], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[91], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24805,7 +24805,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1099 - VVV1_0( w_fp[98], w_fp[1], w_fp[51], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[98], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24824,7 +24824,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1100 - FFV1_0( w_fp[26], w_fp[91], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[26], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24837,7 +24837,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1101 - VVV1_0( w_fp[0], w_fp[59], w_fp[51], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[59], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24856,7 +24856,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1102 - FFV1_0( w_fp[3], w_fp[47], w_fp[97], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[97], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24868,7 +24868,7 @@ namespace mg5amcCpu jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24880,7 +24880,7 @@ namespace mg5amcCpu jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24899,7 +24899,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1103 - FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24914,7 +24914,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1104 - FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24927,7 +24927,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1105 - FFV1_0( w_fp[78], w_fp[2], w_fp[96], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[78], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24942,7 +24942,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1106 - VVV1_0( w_fp[96], w_fp[1], w_fp[54], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[96], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base 
generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24961,7 +24961,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1107 - FFV1_0( w_fp[78], w_fp[18], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[78], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24974,7 +24974,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1108 - VVV1_0( w_fp[0], w_fp[67], w_fp[54], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[67], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -24993,7 +24993,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1109 - FFV1_0( w_fp[46], w_fp[2], w_fp[76], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[76], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25005,7 +25005,7 @@ namespace mg5amcCpu jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[42], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25017,7 +25017,7 @@ namespace mg5amcCpu jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[11], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25036,7 +25036,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1110 - FFV1_0( w_fp[104], w_fp[2], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25051,7 +25051,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1111 - FFV1_0( w_fp[104], w_fp[15], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25064,7 +25064,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1112 - FFV1_0( w_fp[58], w_fp[2], w_fp[101], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[58], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25079,7 +25079,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1113 - VVV1_0( w_fp[101], w_fp[1], w_fp[23], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[101], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) #endif @@ -25098,7 +25098,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1114 - FFV1_0( w_fp[58], w_fp[15], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[58], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25111,7 +25111,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1115 - VVV1_0( w_fp[0], w_fp[68], w_fp[23], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[68], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25130,7 +25130,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1116 - FFV1_0( w_fp[38], w_fp[2], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25142,7 +25142,7 @@ namespace mg5amcCpu jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25154,7 +25154,7 @@ namespace mg5amcCpu jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25173,7 +25173,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1117 - FFV1_0( w_fp[62], w_fp[2], w_fp[59], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25188,7 +25188,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1118 - FFV1_0( w_fp[62], w_fp[17], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25201,7 +25201,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1119 - FFV1_0( w_fp[60], w_fp[2], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25216,7 +25216,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1120 - VVV1_0( w_fp[98], w_fp[1], w_fp[20], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[98], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25235,7 
+25235,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1121 - FFV1_0( w_fp[60], w_fp[17], w_fp[0], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25248,7 +25248,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1122 - VVV1_0( w_fp[0], w_fp[59], w_fp[20], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[0], w_fp[59], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25267,7 +25267,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1123 - FFV1_0( w_fp[41], w_fp[2], w_fp[97], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[97], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25279,7 +25279,7 @@ namespace mg5amcCpu jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25291,7 +25291,7 @@ namespace mg5amcCpu jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25307,12 +25307,12 @@ namespace mg5amcCpu // *** DIAGRAM 1124 OF 1240 *** // Wavefunction(s) for diagram number 1124 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[21] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[71] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 0., 0., w_fp[97] ); + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[71] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[97] ); // Amplitude(s) for diagram number 1124 - VVVV1_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25332,7 +25332,7 @@ namespace mg5amcCpu jamp_sv[115] -= amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25352,7 +25352,7 @@ namespace mg5amcCpu jamp_sv[115] -= amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], 
COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25372,7 +25372,7 @@ namespace mg5amcCpu jamp_sv[98] += amp_sv[0]; jamp_sv[104] += amp_sv[0]; jamp_sv[110] -= amp_sv[0]; - VVVV1_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25392,7 +25392,7 @@ namespace mg5amcCpu jamp_sv[116] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - VVVV3_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25412,7 +25412,7 @@ namespace mg5amcCpu jamp_sv[116] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - VVVV4_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25432,7 +25432,7 @@ namespace mg5amcCpu jamp_sv[102] -= amp_sv[0]; jamp_sv[104] += amp_sv[0]; jamp_sv[108] -= amp_sv[0]; - VVVV1_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25452,7 +25452,7 @@ namespace mg5amcCpu jamp_sv[116] += amp_sv[0]; jamp_sv[118] += amp_sv[0]; jamp_sv[119] -= amp_sv[0]; - VVVV3_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25472,7 +25472,7 @@ namespace mg5amcCpu jamp_sv[116] += amp_sv[0]; jamp_sv[118] += amp_sv[0]; jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25496,12 +25496,12 @@ namespace mg5amcCpu // *** DIAGRAM 1125 OF 1240 *** // Wavefunction(s) for diagram number 1125 - VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 0., 0., w_fp[59] ); - VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 0., 0., w_fp[20] ); - VVV1P0_1( w_fp[97], w_fp[5], COUPs[0], 0., 0., w_fp[60] ); + VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[59] ); + VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] ); + VVV1P0_1( w_fp[97], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[60] ); // Amplitude(s) for diagram number 1125 - VVV1_0( w_fp[8], w_fp[6], w_fp[59], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[59], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25521,7 +25521,7 @@ namespace mg5amcCpu 
jamp_sv[115] -= amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[20], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25541,7 +25541,7 @@ namespace mg5amcCpu jamp_sv[116] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25565,12 +25565,12 @@ namespace mg5amcCpu // *** DIAGRAM 1126 OF 1240 *** // Wavefunction(s) for diagram number 1126 - VVV1P0_1( w_fp[21], w_fp[6], COUPs[0], 0., 0., w_fp[17] ); - VVV1P0_1( w_fp[71], w_fp[6], COUPs[0], 0., 0., w_fp[98] ); - VVV1P0_1( w_fp[97], w_fp[6], COUPs[0], 0., 0., w_fp[111] ); + VVV1P0_1( w_fp[21], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[17] ); + VVV1P0_1( w_fp[71], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[98] ); + VVV1P0_1( w_fp[97], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[111] ); // Amplitude(s) for diagram number 1126 - VVV1_0( w_fp[8], w_fp[5], w_fp[17], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25590,7 +25590,7 @@ namespace mg5amcCpu jamp_sv[98] += amp_sv[0]; jamp_sv[104] += amp_sv[0]; jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25610,7 +25610,7 @@ namespace mg5amcCpu jamp_sv[102] -= amp_sv[0]; jamp_sv[104] += amp_sv[0]; jamp_sv[108] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25637,7 +25637,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1127 - VVV1_0( w_fp[21], w_fp[8], w_fp[29], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[21], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25657,7 +25657,7 @@ namespace mg5amcCpu jamp_sv[115] -= amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[71], w_fp[8], w_fp[29], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[71], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25677,7 +25677,7 @@ namespace mg5amcCpu jamp_sv[116] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[97], w_fp[8], w_fp[29], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[97], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ 
-25701,12 +25701,12 @@ namespace mg5amcCpu // *** DIAGRAM 1128 OF 1240 *** // Wavefunction(s) for diagram number 1128 - FFV1_2( w_fp[3], w_fp[21], COUPs[1], cIPD[0], cIPD[1], w_fp[16] ); - FFV1_2( w_fp[3], w_fp[71], COUPs[1], cIPD[0], cIPD[1], w_fp[10] ); - FFV1_2( w_fp[3], w_fp[97], COUPs[1], cIPD[0], cIPD[1], w_fp[68] ); + FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); + FFV1_2( w_fp[3], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); // Amplitude(s) for diagram number 1128 - FFV1_0( w_fp[16], w_fp[39], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25714,7 +25714,7 @@ namespace mg5amcCpu jamp_sv[91] -= amp_sv[0]; jamp_sv[93] -= amp_sv[0]; jamp_sv[95] += amp_sv[0]; - FFV1_0( w_fp[10], w_fp[39], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[10], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25722,7 +25722,7 @@ namespace mg5amcCpu jamp_sv[92] += amp_sv[0]; jamp_sv[93] -= amp_sv[0]; jamp_sv[94] += amp_sv[0]; - FFV1_0( w_fp[68], w_fp[39], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[68], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25737,7 +25737,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1129 - FFV1_0( w_fp[3], w_fp[39], w_fp[17], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[17], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25749,7 +25749,7 @@ namespace mg5amcCpu jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25761,7 +25761,7 @@ namespace mg5amcCpu jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25780,7 +25780,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1130 - FFV1_0( w_fp[41], w_fp[39], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25788,7 +25788,7 @@ namespace mg5amcCpu jamp_sv[74] -= amp_sv[0]; jamp_sv[80] -= amp_sv[0]; jamp_sv[86] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[39], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); 
+ FFV1_0( w_fp[41], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] );
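// Every hunk in this range makes the same mechanical change seen in the pair of
// lines just above: the generated helicity-amplitude calls (VVV1_0, VVVV1_0,
// VVVV3_0, VVVV4_0, FFV1_0) and the off-shell builders (VVV1P0_1, VVVV1P0_1,
// VVVV3P0_1, VVVV4P0_1, FFV1_1, FFV1_2) gain one extra floating-point argument,
// always 1.0 here, inserted right after the coupling array COUPs[i]. With the
// factor fixed at 1.0 the numerics are unchanged; the patch is threading a
// signature change through generated code. The new kernel signatures are not
// shown in this diff; a plausible reading, sketched below with an assumed
// parameter name 'Ccoeff', is a multiplicative coupling coefficient that lets a
// call site rescale or sign-flip a coupling without introducing a new coupling
// constant. Toy code, not the real kernel:
#include <complex>
#include <cstdio>

using cxtype = std::complex<double>;

// Before: *vertex = COUP * f( V1, V2, V3 ).
// After:  *vertex = Ccoeff * COUP * f( V1, V2, V3 ), called with Ccoeff = 1.0.
void VVV1_0_toy( const cxtype& V1, const cxtype& V2, const cxtype& V3,
                 const cxtype& COUP, const double Ccoeff, cxtype* vertex )
{
  *vertex = Ccoeff * COUP * ( V1 * V2 * V3 ); // toy contraction, no Lorentz structure
}

int main()
{
  cxtype amp;
  VVV1_0_toy( { 1., 0. }, { 0., 1. }, { 1., 1. }, { 0., 0.5 }, 1.0, &amp );
  std::printf( "toy amp = (%g, %g)\n", amp.real(), amp.imag() );
  return 0;
}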
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25796,7 +25796,7 @@ namespace mg5amcCpu jamp_sv[78] += amp_sv[0]; jamp_sv[80] -= amp_sv[0]; jamp_sv[84] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[39], w_fp[97], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[39], w_fp[97], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25811,7 +25811,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1131 - FFV1_0( w_fp[16], w_fp[47], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25819,7 +25819,7 @@ namespace mg5amcCpu jamp_sv[115] -= amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - FFV1_0( w_fp[10], w_fp[47], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[10], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25827,7 +25827,7 @@ namespace mg5amcCpu jamp_sv[116] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - FFV1_0( w_fp[68], w_fp[47], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[68], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25842,7 +25842,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1132 - FFV1_0( w_fp[3], w_fp[47], w_fp[59], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25854,7 +25854,7 @@ namespace mg5amcCpu jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[20], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[20], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25866,7 +25866,7 @@ namespace mg5amcCpu jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25885,7 +25885,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1133 - FFV1_0( w_fp[38], w_fp[47], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25893,7 +25893,7 @@ namespace mg5amcCpu jamp_sv[98] -= amp_sv[0]; jamp_sv[104] -= amp_sv[0]; jamp_sv[110] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[47], w_fp[71], COUPs[1], &_fp[0] ); 
+ FFV1_0( w_fp[38], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25901,7 +25901,7 @@ namespace mg5amcCpu jamp_sv[102] += amp_sv[0]; jamp_sv[104] -= amp_sv[0]; jamp_sv[108] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[47], w_fp[97], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[47], w_fp[97], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25913,12 +25913,12 @@ namespace mg5amcCpu // *** DIAGRAM 1134 OF 1240 *** // Wavefunction(s) for diagram number 1134 - FFV1_1( w_fp[2], w_fp[21], COUPs[1], cIPD[0], cIPD[1], w_fp[23] ); - FFV1_1( w_fp[2], w_fp[71], COUPs[1], cIPD[0], cIPD[1], w_fp[21] ); - FFV1_1( w_fp[2], w_fp[97], COUPs[1], cIPD[0], cIPD[1], w_fp[71] ); + FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); + FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + FFV1_1( w_fp[2], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); // Amplitude(s) for diagram number 1134 - FFV1_0( w_fp[38], w_fp[23], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[23], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25926,7 +25926,7 @@ namespace mg5amcCpu jamp_sv[7] -= amp_sv[0]; jamp_sv[31] -= amp_sv[0]; jamp_sv[55] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[21], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[21], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25934,7 +25934,7 @@ namespace mg5amcCpu jamp_sv[25] += amp_sv[0]; jamp_sv[31] -= amp_sv[0]; jamp_sv[49] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[71], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[71], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25949,7 +25949,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1135 - FFV1_0( w_fp[38], w_fp[2], w_fp[17], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25961,7 +25961,7 @@ namespace mg5amcCpu jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -25973,7 +25973,7 @@ namespace mg5amcCpu jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) 
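// In the builder calls above, the trailing scalar arguments are worth a gloss:
// in VVVV1P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[76] )
// the two literal 0. values read as the mass and width of the internal (gluon)
// propagator, and in FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0],
// cIPD[1], w_fp[16] ) the cIPD[0], cIPD[1] pair reads as an internal fermion
// mass and width taken from the independent parameters. That is the usual
// MG5aMC/ALOHA argument order, inferred here rather than stated by the patch.
// A scalar Breit-Wigner toy of why every off-shell builder carries a
// (mass, width) pair:
#include <complex>

// p2 is the propagator momentum squared; illustrative only -- the generated
// FFV1_1/FFV1_2/VVV1P0_1 kernels build full spinor/vector wavefunctions.
std::complex<double> toyPropagator( double p2, double mass, double width )
{
  // With mass = width = 0., as in the gluon builders, this reduces to 1 / p2.
  return 1.0 / std::complex<double>( p2 - mass * mass, mass * width );
}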
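// The unchanged context lines around each call do the color bookkeeping: every
// diagram amplitude amp_sv[0] is folded into the color-ordered partial
// amplitudes jamp_sv[i] with a coefficient of +1, -1, +i or -i (the
// cxtype( 0, 1 ) factors). A compact sketch of that accumulation, with invented
// container names ('ColorTerm', 'jamp') standing in for the generated arrays:
#include <complex>
#include <vector>

using cxtype = std::complex<double>;

struct ColorTerm
{
  int flow;     // index into the jamp array (a color flow)
  cxtype coeff; // +/-1 or +/-i in the hunks above
};

void accumulateColorFlows( std::vector<cxtype>& jamp, const cxtype& amp,
                           const std::vector<ColorTerm>& terms )
{
  for( const auto& t : terms ) jamp[t.flow] += t.coeff * amp;
}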
#endif @@ -25992,7 +25992,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1136 - FFV1_0( w_fp[41], w_fp[23], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26000,7 +26000,7 @@ namespace mg5amcCpu jamp_sv[6] -= amp_sv[0]; jamp_sv[30] -= amp_sv[0]; jamp_sv[54] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[21], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[21], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26008,7 +26008,7 @@ namespace mg5amcCpu jamp_sv[24] += amp_sv[0]; jamp_sv[30] -= amp_sv[0]; jamp_sv[48] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[71], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[71], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26023,7 +26023,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1137 - FFV1_0( w_fp[41], w_fp[2], w_fp[59], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26035,7 +26035,7 @@ namespace mg5amcCpu jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[20], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26047,7 +26047,7 @@ namespace mg5amcCpu jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26066,7 +26066,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1138 - FFV1_0( w_fp[3], w_fp[23], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[23], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26078,7 +26078,7 @@ namespace mg5amcCpu jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[21], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[21], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26090,7 +26090,7 @@ namespace mg5amcCpu jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[71], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[71], w_fp[29], COUPs[1], 1.0, &_fp[0] 
); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26109,7 +26109,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1139 - FFV1_0( w_fp[16], w_fp[2], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26121,7 +26121,7 @@ namespace mg5amcCpu jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[10], w_fp[2], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[10], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26133,7 +26133,7 @@ namespace mg5amcCpu jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[68], w_fp[2], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[68], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26149,12 +26149,12 @@ namespace mg5amcCpu // *** DIAGRAM 1140 OF 1240 *** // Wavefunction(s) for diagram number 1140 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[68] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[29] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 0., 0., w_fp[10] ); + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[68] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[29] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[10] ); // Amplitude(s) for diagram number 1140 - VVVV1_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26174,7 +26174,7 @@ namespace mg5amcCpu jamp_sv[109] -= amp_sv[0]; jamp_sv[111] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; - VVVV3_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26194,7 +26194,7 @@ namespace mg5amcCpu jamp_sv[111] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; - VVVV4_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26214,7 +26214,7 @@ namespace mg5amcCpu jamp_sv[100] += amp_sv[0]; jamp_sv[106] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; - VVVV1_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) #endif @@ -26234,7 +26234,7 @@ namespace mg5amcCpu jamp_sv[110] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; - VVVV3_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26254,7 +26254,7 @@ namespace mg5amcCpu jamp_sv[111] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[114] -= amp_sv[0]; - VVVV4_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26274,7 +26274,7 @@ namespace mg5amcCpu jamp_sv[103] -= amp_sv[0]; jamp_sv[106] += amp_sv[0]; jamp_sv[114] -= amp_sv[0]; - VVVV1_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26294,7 +26294,7 @@ namespace mg5amcCpu jamp_sv[110] += amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; - VVVV3_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26314,7 +26314,7 @@ namespace mg5amcCpu jamp_sv[113] -= amp_sv[0]; jamp_sv[114] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - VVVV4_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26338,12 +26338,12 @@ namespace mg5amcCpu // *** DIAGRAM 1141 OF 1240 *** // Wavefunction(s) for diagram number 1141 - VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 0., 0., w_fp[16] ); - VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 0., 0., w_fp[71] ); - VVV1P0_1( w_fp[10], w_fp[4], COUPs[0], 0., 0., w_fp[21] ); + VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[16] ); + VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[71] ); + VVV1P0_1( w_fp[10], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[21] ); // Amplitude(s) for diagram number 1141 - VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26363,7 +26363,7 @@ namespace mg5amcCpu jamp_sv[111] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26383,7 +26383,7 @@ namespace mg5amcCpu jamp_sv[111] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[114] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], 
w_fp[21], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26407,12 +26407,12 @@ namespace mg5amcCpu // *** DIAGRAM 1142 OF 1240 *** // Wavefunction(s) for diagram number 1142 - VVV1P0_1( w_fp[68], w_fp[6], COUPs[0], 0., 0., w_fp[23] ); - VVV1P0_1( w_fp[29], w_fp[6], COUPs[0], 0., 0., w_fp[60] ); - VVV1P0_1( w_fp[10], w_fp[6], COUPs[0], 0., 0., w_fp[20] ); + VVV1P0_1( w_fp[68], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[23] ); + VVV1P0_1( w_fp[29], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[60] ); + VVV1P0_1( w_fp[10], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[20] ); // Amplitude(s) for diagram number 1142 - VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26432,7 +26432,7 @@ namespace mg5amcCpu jamp_sv[100] += amp_sv[0]; jamp_sv[106] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[60], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[60], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26452,7 +26452,7 @@ namespace mg5amcCpu jamp_sv[103] -= amp_sv[0]; jamp_sv[106] += amp_sv[0]; jamp_sv[114] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[20], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26479,7 +26479,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1143 - VVV1_0( w_fp[68], w_fp[8], w_fp[27], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[68], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26499,7 +26499,7 @@ namespace mg5amcCpu jamp_sv[109] -= amp_sv[0]; jamp_sv[111] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; - VVV1_0( w_fp[29], w_fp[8], w_fp[27], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[29], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26519,7 +26519,7 @@ namespace mg5amcCpu jamp_sv[110] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; - VVV1_0( w_fp[10], w_fp[8], w_fp[27], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[10], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26543,12 +26543,12 @@ namespace mg5amcCpu // *** DIAGRAM 1144 OF 1240 *** // Wavefunction(s) for diagram number 1144 - FFV1_2( w_fp[3], w_fp[68], COUPs[1], cIPD[0], cIPD[1], w_fp[59] ); - FFV1_2( w_fp[3], w_fp[29], COUPs[1], cIPD[0], cIPD[1], w_fp[111] ); - FFV1_2( w_fp[3], w_fp[10], COUPs[1], cIPD[0], cIPD[1], w_fp[98] ); + FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] ); + FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[111] ); + FFV1_2( w_fp[3], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); // Amplitude(s) for diagram 
number 1144 - FFV1_0( w_fp[59], w_fp[33], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[59], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26556,7 +26556,7 @@ namespace mg5amcCpu jamp_sv[67] -= amp_sv[0]; jamp_sv[69] -= amp_sv[0]; jamp_sv[71] += amp_sv[0]; - FFV1_0( w_fp[111], w_fp[33], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[111], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26564,7 +26564,7 @@ namespace mg5amcCpu jamp_sv[68] += amp_sv[0]; jamp_sv[69] -= amp_sv[0]; jamp_sv[70] += amp_sv[0]; - FFV1_0( w_fp[98], w_fp[33], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[98], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26579,7 +26579,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1145 - FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26591,7 +26591,7 @@ namespace mg5amcCpu jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[60], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26603,7 +26603,7 @@ namespace mg5amcCpu jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[20], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[20], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26622,7 +26622,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1146 - FFV1_0( w_fp[41], w_fp[33], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26630,7 +26630,7 @@ namespace mg5amcCpu jamp_sv[50] -= amp_sv[0]; jamp_sv[56] -= amp_sv[0]; jamp_sv[62] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[33], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26638,7 +26638,7 @@ namespace mg5amcCpu jamp_sv[54] += amp_sv[0]; jamp_sv[56] -= amp_sv[0]; jamp_sv[60] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[33], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[33], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26653,7 +26653,7 @@ 
namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1147 - FFV1_0( w_fp[59], w_fp[47], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[59], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26661,7 +26661,7 @@ namespace mg5amcCpu jamp_sv[109] -= amp_sv[0]; jamp_sv[111] -= amp_sv[0]; jamp_sv[113] += amp_sv[0]; - FFV1_0( w_fp[111], w_fp[47], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[111], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26669,7 +26669,7 @@ namespace mg5amcCpu jamp_sv[110] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; - FFV1_0( w_fp[98], w_fp[47], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[98], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26684,7 +26684,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1148 - FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26696,7 +26696,7 @@ namespace mg5amcCpu jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26708,7 +26708,7 @@ namespace mg5amcCpu jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26727,7 +26727,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1149 - FFV1_0( w_fp[46], w_fp[47], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[47], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26735,7 +26735,7 @@ namespace mg5amcCpu jamp_sv[100] -= amp_sv[0]; jamp_sv[106] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[47], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[47], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26743,7 +26743,7 @@ namespace mg5amcCpu jamp_sv[103] += amp_sv[0]; jamp_sv[106] -= amp_sv[0]; jamp_sv[114] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[47], w_fp[10], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[47], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) #endif @@ -26755,12 +26755,12 @@ namespace mg5amcCpu // *** DIAGRAM 1150 OF 1240 *** // Wavefunction(s) for diagram number 1150 - FFV1_1( w_fp[2], w_fp[68], COUPs[1], cIPD[0], cIPD[1], w_fp[17] ); - FFV1_1( w_fp[2], w_fp[29], COUPs[1], cIPD[0], cIPD[1], w_fp[68] ); - FFV1_1( w_fp[2], w_fp[10], COUPs[1], cIPD[0], cIPD[1], w_fp[29] ); + FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); + FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); + FFV1_1( w_fp[2], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] ); // Amplitude(s) for diagram number 1150 - FFV1_0( w_fp[46], w_fp[17], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26768,7 +26768,7 @@ namespace mg5amcCpu jamp_sv[13] -= amp_sv[0]; jamp_sv[37] -= amp_sv[0]; jamp_sv[79] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[68], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[68], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26776,7 +26776,7 @@ namespace mg5amcCpu jamp_sv[27] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; jamp_sv[73] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[29], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[29], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26791,7 +26791,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1151 - FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26803,7 +26803,7 @@ namespace mg5amcCpu jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26815,7 +26815,7 @@ namespace mg5amcCpu jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[20], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26834,7 +26834,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1152 - FFV1_0( w_fp[41], w_fp[17], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[17], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26842,7 +26842,7 @@ namespace mg5amcCpu jamp_sv[12] -= amp_sv[0]; jamp_sv[36] -= amp_sv[0]; jamp_sv[78] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[68], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( 
w_fp[41], w_fp[68], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26850,7 +26850,7 @@ namespace mg5amcCpu jamp_sv[26] += amp_sv[0]; jamp_sv[36] -= amp_sv[0]; jamp_sv[72] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[29], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[29], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26865,7 +26865,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1153 - FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26877,7 +26877,7 @@ namespace mg5amcCpu jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26889,7 +26889,7 @@ namespace mg5amcCpu jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26908,7 +26908,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1154 - FFV1_0( w_fp[3], w_fp[17], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[17], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26920,7 +26920,7 @@ namespace mg5amcCpu jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[68], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[68], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26932,7 +26932,7 @@ namespace mg5amcCpu jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[29], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[29], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26951,7 +26951,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1155 - FFV1_0( w_fp[59], w_fp[2], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[59], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26963,7 +26963,7 @@ namespace mg5amcCpu 
jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[111], w_fp[2], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[111], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26975,7 +26975,7 @@ namespace mg5amcCpu jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[98], w_fp[2], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[98], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -26991,12 +26991,12 @@ namespace mg5amcCpu // *** DIAGRAM 1156 OF 1240 *** // Wavefunction(s) for diagram number 1156 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 0., 0., w_fp[98] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 0., 0., w_fp[27] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 0., 0., w_fp[111] ); + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[98] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[27] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] ); // Amplitude(s) for diagram number 1156 - VVVV1_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27016,7 +27016,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[102] += amp_sv[0]; jamp_sv[103] -= amp_sv[0]; - VVVV3_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27036,7 +27036,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[92] -= amp_sv[0]; jamp_sv[102] += amp_sv[0]; - VVVV4_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27056,7 +27056,7 @@ namespace mg5amcCpu jamp_sv[82] += amp_sv[0]; jamp_sv[92] -= amp_sv[0]; jamp_sv[103] += amp_sv[0]; - VVVV1_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27076,7 +27076,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[96] += amp_sv[0]; jamp_sv[97] -= amp_sv[0]; - VVVV3_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27096,7 +27096,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[96] += amp_sv[0]; - VVVV4_0( 
w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27116,7 +27116,7 @@ namespace mg5amcCpu jamp_sv[82] += amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[97] += amp_sv[0]; - VVVV1_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27136,7 +27136,7 @@ namespace mg5amcCpu jamp_sv[97] -= amp_sv[0]; jamp_sv[102] -= amp_sv[0]; jamp_sv[103] += amp_sv[0]; - VVVV3_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27156,7 +27156,7 @@ namespace mg5amcCpu jamp_sv[92] += amp_sv[0]; jamp_sv[96] += amp_sv[0]; jamp_sv[102] -= amp_sv[0]; - VVVV4_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27180,12 +27180,12 @@ namespace mg5amcCpu // *** DIAGRAM 1157 OF 1240 *** // Wavefunction(s) for diagram number 1157 - VVV1P0_1( w_fp[98], w_fp[4], COUPs[0], 0., 0., w_fp[59] ); - VVV1P0_1( w_fp[27], w_fp[4], COUPs[0], 0., 0., w_fp[29] ); - VVV1P0_1( w_fp[111], w_fp[4], COUPs[0], 0., 0., w_fp[68] ); + VVV1P0_1( w_fp[98], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[59] ); + VVV1P0_1( w_fp[27], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[29] ); + VVV1P0_1( w_fp[111], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[68] ); // Amplitude(s) for diagram number 1157 - VVV1_0( w_fp[8], w_fp[5], w_fp[59], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[59], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27205,7 +27205,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[92] -= amp_sv[0]; jamp_sv[102] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[29], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[29], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27225,7 +27225,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[96] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[68], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[68], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27249,12 +27249,12 @@ namespace mg5amcCpu // *** DIAGRAM 1158 OF 1240 *** // Wavefunction(s) for diagram number 1158 - VVV1P0_1( w_fp[98], w_fp[5], COUPs[0], 0., 0., w_fp[17] ); - VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 0., 0., w_fp[21] ); - VVV1P0_1( w_fp[111], w_fp[5], COUPs[0], 0., 0., w_fp[71] ); + VVV1P0_1( w_fp[98], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[17] ); + VVV1P0_1( w_fp[27], w_fp[5], 
COUPs[0], 1.0, 0., 0., w_fp[21] ); + VVV1P0_1( w_fp[111], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[71] ); // Amplitude(s) for diagram number 1158 - VVV1_0( w_fp[8], w_fp[4], w_fp[17], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27274,7 +27274,7 @@ namespace mg5amcCpu jamp_sv[82] += amp_sv[0]; jamp_sv[92] -= amp_sv[0]; jamp_sv[103] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27294,7 +27294,7 @@ namespace mg5amcCpu jamp_sv[82] += amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[97] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[71], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[71], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27321,7 +27321,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1159 - VVV1_0( w_fp[98], w_fp[8], w_fp[24], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[98], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27341,7 +27341,7 @@ namespace mg5amcCpu jamp_sv[89] += amp_sv[0]; jamp_sv[102] += amp_sv[0]; jamp_sv[103] -= amp_sv[0]; - VVV1_0( w_fp[27], w_fp[8], w_fp[24], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[27], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27361,7 +27361,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[96] += amp_sv[0]; jamp_sv[97] -= amp_sv[0]; - VVV1_0( w_fp[111], w_fp[8], w_fp[24], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[111], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27385,12 +27385,12 @@ namespace mg5amcCpu // *** DIAGRAM 1160 OF 1240 *** // Wavefunction(s) for diagram number 1160 - FFV1_2( w_fp[3], w_fp[98], COUPs[1], cIPD[0], cIPD[1], w_fp[16] ); - FFV1_2( w_fp[3], w_fp[27], COUPs[1], cIPD[0], cIPD[1], w_fp[20] ); - FFV1_2( w_fp[3], w_fp[111], COUPs[1], cIPD[0], cIPD[1], w_fp[60] ); + FFV1_2( w_fp[3], w_fp[98], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] ); + FFV1_2( w_fp[3], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); // Amplitude(s) for diagram number 1160 - FFV1_0( w_fp[16], w_fp[33], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27398,7 +27398,7 @@ namespace mg5amcCpu jamp_sv[61] -= amp_sv[0]; jamp_sv[63] -= amp_sv[0]; jamp_sv[65] += amp_sv[0]; - FFV1_0( w_fp[20], w_fp[33], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[20], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base 
generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27406,7 +27406,7 @@ namespace mg5amcCpu jamp_sv[62] += amp_sv[0]; jamp_sv[63] -= amp_sv[0]; jamp_sv[64] += amp_sv[0]; - FFV1_0( w_fp[60], w_fp[33], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27421,7 +27421,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1161 - FFV1_0( w_fp[3], w_fp[33], w_fp[17], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[17], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27433,7 +27433,7 @@ namespace mg5amcCpu jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27445,7 +27445,7 @@ namespace mg5amcCpu jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27464,7 +27464,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1162 - FFV1_0( w_fp[38], w_fp[33], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[33], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27472,7 +27472,7 @@ namespace mg5amcCpu jamp_sv[52] -= amp_sv[0]; jamp_sv[58] -= amp_sv[0]; jamp_sv[68] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[33], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[33], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27480,7 +27480,7 @@ namespace mg5amcCpu jamp_sv[55] += amp_sv[0]; jamp_sv[58] -= amp_sv[0]; jamp_sv[66] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[33], w_fp[111], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[33], w_fp[111], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27495,7 +27495,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1163 - FFV1_0( w_fp[16], w_fp[39], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27503,7 +27503,7 @@ namespace mg5amcCpu jamp_sv[85] -= amp_sv[0]; jamp_sv[87] -= amp_sv[0]; jamp_sv[89] += amp_sv[0]; - FFV1_0( w_fp[20], w_fp[39], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[20], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); 
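Every rewritten call in these hunks follows one mechanical pattern: the generated HELAS helpers (FFV1_0, FFV1_1, FFV1_2, VVV1_0, VVV1P0_1, VVVV1_0/VVVV3_0/VVVV4_0 and the VVVV*P0_1 variants) gain one extra scalar argument, always the literal 1.0 here, inserted immediately after the coupling array and ahead of any mass/width arguments (0., 0. or cIPD[0], cIPD[1]) and the output pointer. A minimal standalone sketch of a signature change of this shape, assuming the new argument is a multiplicative coefficient folded into the coupling; the parameter name Ccoeff and the toy kernel below are illustrative stand-ins, not the actual generated code:

// Sketch only: shows the before/after call shape seen throughout this diff,
// under the assumption that the added scalar simply rescales the coupling.
#include <complex>
#include <iostream>

using cxtype = std::complex<double>;

// Old shape: amplitude = f( wavefunctions..., COUP, out )
void FFV1_0_old( const cxtype& F1, const cxtype& F2, const cxtype& V3,
                 const cxtype& COUP, cxtype* vertex )
{
  *vertex = COUP * F1 * F2 * V3; // toy contraction, not the real HELAS kernel
}

// New shape: one extra double inserted after COUP; the rewritten call sites
// in this patch all pass 1.0.
void FFV1_0_new( const cxtype& F1, const cxtype& F2, const cxtype& V3,
                 const cxtype& COUP, const double Ccoeff, cxtype* vertex )
{
  *vertex = Ccoeff * COUP * F1 * F2 * V3;
}

int main()
{
  const cxtype F1( 1, 2 ), F2( 0.5, -1 ), V3( -2, 0.25 ), COUP( 0, 1 );
  cxtype a, b;
  FFV1_0_old( F1, F2, V3, COUP, &a );
  FFV1_0_new( F1, F2, V3, COUP, 1.0, &b ); // mirrors the rewritten call sites
  std::cout << ( a == b ? "identical" : "different" ) << std::endl;
  return 0;
}

With 1.0 passed everywhere, every amplitude and wavefunction is numerically unchanged, which is consistent with this being a pure interface extension applied uniformly across the generated code.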
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27511,7 +27511,7 @@ namespace mg5amcCpu jamp_sv[86] += amp_sv[0]; jamp_sv[87] -= amp_sv[0]; jamp_sv[88] += amp_sv[0]; - FFV1_0( w_fp[60], w_fp[39], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27526,7 +27526,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1164 - FFV1_0( w_fp[3], w_fp[39], w_fp[59], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27538,7 +27538,7 @@ namespace mg5amcCpu jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27550,7 +27550,7 @@ namespace mg5amcCpu jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27569,7 +27569,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1165 - FFV1_0( w_fp[46], w_fp[39], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[39], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27577,7 +27577,7 @@ namespace mg5amcCpu jamp_sv[76] -= amp_sv[0]; jamp_sv[82] -= amp_sv[0]; jamp_sv[92] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[39], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27585,7 +27585,7 @@ namespace mg5amcCpu jamp_sv[79] += amp_sv[0]; jamp_sv[82] -= amp_sv[0]; jamp_sv[90] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[39], w_fp[111], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[39], w_fp[111], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27597,12 +27597,12 @@ namespace mg5amcCpu // *** DIAGRAM 1166 OF 1240 *** // Wavefunction(s) for diagram number 1166 - FFV1_1( w_fp[2], w_fp[98], COUPs[1], cIPD[0], cIPD[1], w_fp[23] ); - FFV1_1( w_fp[2], w_fp[27], COUPs[1], cIPD[0], cIPD[1], w_fp[98] ); - FFV1_1( w_fp[2], w_fp[111], COUPs[1], cIPD[0], cIPD[1], w_fp[27] ); + FFV1_1( w_fp[2], w_fp[98], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); + FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); + FFV1_1( w_fp[2], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] ); // 
Amplitude(s) for diagram number 1166 - FFV1_0( w_fp[46], w_fp[23], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27610,7 +27610,7 @@ namespace mg5amcCpu jamp_sv[19] -= amp_sv[0]; jamp_sv[43] -= amp_sv[0]; jamp_sv[103] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[98], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[98], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27618,7 +27618,7 @@ namespace mg5amcCpu jamp_sv[29] += amp_sv[0]; jamp_sv[43] -= amp_sv[0]; jamp_sv[97] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[27], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[27], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27633,7 +27633,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1167 - FFV1_0( w_fp[46], w_fp[2], w_fp[17], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27645,7 +27645,7 @@ namespace mg5amcCpu jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27657,7 +27657,7 @@ namespace mg5amcCpu jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27676,7 +27676,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1168 - FFV1_0( w_fp[38], w_fp[23], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27684,7 +27684,7 @@ namespace mg5amcCpu jamp_sv[18] -= amp_sv[0]; jamp_sv[42] -= amp_sv[0]; jamp_sv[102] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[98], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[98], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27692,7 +27692,7 @@ namespace mg5amcCpu jamp_sv[28] += amp_sv[0]; jamp_sv[42] -= amp_sv[0]; jamp_sv[96] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[27], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[27], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ 
-27707,7 +27707,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1169 - FFV1_0( w_fp[38], w_fp[2], w_fp[59], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27719,7 +27719,7 @@ namespace mg5amcCpu jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27731,7 +27731,7 @@ namespace mg5amcCpu jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27750,7 +27750,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1170 - FFV1_0( w_fp[3], w_fp[23], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[23], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27762,7 +27762,7 @@ namespace mg5amcCpu jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[98], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[98], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27774,7 +27774,7 @@ namespace mg5amcCpu jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[27], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[27], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27793,7 +27793,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1171 - FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27805,7 +27805,7 @@ namespace mg5amcCpu jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[20], w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[20], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27817,7 +27817,7 @@ namespace mg5amcCpu jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[60], 
w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27833,15 +27833,15 @@ namespace mg5amcCpu // *** DIAGRAM 1172 OF 1240 *** // Wavefunction(s) for diagram number 1172 - VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[60] ); - VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[24] ); - VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 0., 0., w_fp[20] ); - FFV1_2( w_fp[3], w_fp[60], COUPs[1], cIPD[0], cIPD[1], w_fp[16] ); - FFV1_2( w_fp[3], w_fp[24], COUPs[1], cIPD[0], cIPD[1], w_fp[27] ); - FFV1_2( w_fp[3], w_fp[20], COUPs[1], cIPD[0], cIPD[1], w_fp[98] ); + VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[60] ); + VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] ); + VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[20] ); + FFV1_2( w_fp[3], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] ); + FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); // Amplitude(s) for diagram number 1172 - FFV1_0( w_fp[16], w_fp[77], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27849,7 +27849,7 @@ namespace mg5amcCpu jamp_sv[43] -= amp_sv[0]; jamp_sv[45] -= amp_sv[0]; jamp_sv[47] += amp_sv[0]; - FFV1_0( w_fp[27], w_fp[77], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[27], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27857,7 +27857,7 @@ namespace mg5amcCpu jamp_sv[44] += amp_sv[0]; jamp_sv[45] -= amp_sv[0]; jamp_sv[46] += amp_sv[0]; - FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27869,12 +27869,12 @@ namespace mg5amcCpu // *** DIAGRAM 1173 OF 1240 *** // Wavefunction(s) for diagram number 1173 - VVV1P0_1( w_fp[60], w_fp[6], COUPs[0], 0., 0., w_fp[23] ); - VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 0., 0., w_fp[68] ); - VVV1P0_1( w_fp[20], w_fp[6], COUPs[0], 0., 0., w_fp[29] ); + VVV1P0_1( w_fp[60], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[23] ); + VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[68] ); + VVV1P0_1( w_fp[20], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] ); // Amplitude(s) for diagram number 1173 - FFV1_0( w_fp[3], w_fp[77], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27886,7 +27886,7 @@ namespace mg5amcCpu jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with 
multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27898,7 +27898,7 @@ namespace mg5amcCpu jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27917,7 +27917,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1174 - FFV1_0( w_fp[41], w_fp[77], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[77], w_fp[60], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27925,7 +27925,7 @@ namespace mg5amcCpu jamp_sv[26] -= amp_sv[0]; jamp_sv[32] -= amp_sv[0]; jamp_sv[38] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27933,7 +27933,7 @@ namespace mg5amcCpu jamp_sv[30] += amp_sv[0]; jamp_sv[32] -= amp_sv[0]; jamp_sv[36] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[77], w_fp[20], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[77], w_fp[20], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27945,12 +27945,12 @@ namespace mg5amcCpu // *** DIAGRAM 1175 OF 1240 *** // Wavefunction(s) for diagram number 1175 - FFV1_1( w_fp[2], w_fp[60], COUPs[1], cIPD[0], cIPD[1], w_fp[59] ); - FFV1_1( w_fp[2], w_fp[24], COUPs[1], cIPD[0], cIPD[1], w_fp[71] ); - FFV1_1( w_fp[2], w_fp[20], COUPs[1], cIPD[0], cIPD[1], w_fp[21] ); + FFV1_1( w_fp[2], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] ); + FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); + FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); // Amplitude(s) for diagram number 1175 - FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27958,7 +27958,7 @@ namespace mg5amcCpu jamp_sv[15] -= amp_sv[0]; jamp_sv[61] -= amp_sv[0]; jamp_sv[85] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[71], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[71], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27966,7 +27966,7 @@ namespace mg5amcCpu jamp_sv[51] += amp_sv[0]; jamp_sv[61] -= amp_sv[0]; jamp_sv[75] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[21], w_fp[6], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[21], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27981,7 +27981,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1176 - FFV1_0( w_fp[52], w_fp[2], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[23], 
COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -27993,7 +27993,7 @@ namespace mg5amcCpu jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28005,7 +28005,7 @@ namespace mg5amcCpu jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28024,7 +28024,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1177 - FFV1_0( w_fp[52], w_fp[47], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28032,7 +28032,7 @@ namespace mg5amcCpu jamp_sv[101] -= amp_sv[0]; jamp_sv[112] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[47], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28040,7 +28040,7 @@ namespace mg5amcCpu jamp_sv[109] += amp_sv[0]; jamp_sv[112] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[47], w_fp[20], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[47], w_fp[20], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28055,7 +28055,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1178 - FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28067,7 +28067,7 @@ namespace mg5amcCpu jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[71], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[71], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28079,7 +28079,7 @@ namespace mg5amcCpu jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[21], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[21], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28098,7 +28098,7 @@ 
namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1179 - FFV1_0( w_fp[16], w_fp[2], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28110,7 +28110,7 @@ namespace mg5amcCpu jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[27], w_fp[2], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[27], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28122,7 +28122,7 @@ namespace mg5amcCpu jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28141,7 +28141,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1180 - VVV1_0( w_fp[60], w_fp[72], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[60], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28161,7 +28161,7 @@ namespace mg5amcCpu jamp_sv[103] += amp_sv[0]; jamp_sv[105] += amp_sv[0]; jamp_sv[107] -= amp_sv[0]; - VVV1_0( w_fp[24], w_fp[72], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[24], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28181,7 +28181,7 @@ namespace mg5amcCpu jamp_sv[104] -= amp_sv[0]; jamp_sv[105] += amp_sv[0]; jamp_sv[106] -= amp_sv[0]; - VVV1_0( w_fp[20], w_fp[72], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[20], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28208,7 +28208,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1181 - VVVV1_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28228,7 +28228,7 @@ namespace mg5amcCpu jamp_sv[107] += amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[118] -= amp_sv[0]; - VVVV3_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28248,7 +28248,7 @@ namespace mg5amcCpu jamp_sv[103] -= amp_sv[0]; jamp_sv[105] -= amp_sv[0]; jamp_sv[107] += amp_sv[0]; - VVVV4_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code 
base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28268,7 +28268,7 @@ namespace mg5amcCpu jamp_sv[101] -= amp_sv[0]; jamp_sv[112] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - VVVV1_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28288,7 +28288,7 @@ namespace mg5amcCpu jamp_sv[109] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[115] -= amp_sv[0]; - VVVV3_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28308,7 +28308,7 @@ namespace mg5amcCpu jamp_sv[104] += amp_sv[0]; jamp_sv[105] -= amp_sv[0]; jamp_sv[106] += amp_sv[0]; - VVVV4_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28328,7 +28328,7 @@ namespace mg5amcCpu jamp_sv[109] += amp_sv[0]; jamp_sv[112] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVVV1_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28348,7 +28348,7 @@ namespace mg5amcCpu jamp_sv[109] -= amp_sv[0]; jamp_sv[115] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - VVVV3_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28368,7 +28368,7 @@ namespace mg5amcCpu jamp_sv[104] += amp_sv[0]; jamp_sv[106] += amp_sv[0]; jamp_sv[107] -= amp_sv[0]; - VVVV4_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28392,12 +28392,12 @@ namespace mg5amcCpu // *** DIAGRAM 1182 OF 1240 *** // Wavefunction(s) for diagram number 1182 - VVV1P0_1( w_fp[60], w_fp[1], COUPs[0], 0., 0., w_fp[72] ); - VVV1P0_1( w_fp[24], w_fp[1], COUPs[0], 0., 0., w_fp[60] ); - VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 0., 0., w_fp[24] ); + VVV1P0_1( w_fp[60], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[72] ); + VVV1P0_1( w_fp[24], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[60] ); + VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[24] ); // Amplitude(s) for diagram number 1182 - VVV1_0( w_fp[8], w_fp[6], w_fp[72], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[72], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28417,7 +28417,7 @@ namespace mg5amcCpu jamp_sv[107] += amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[118] -= amp_sv[0]; - 
VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28437,7 +28437,7 @@ namespace mg5amcCpu jamp_sv[109] -= amp_sv[0]; jamp_sv[112] += amp_sv[0]; jamp_sv[115] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[24], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[24], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28464,7 +28464,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1183 - VVV1_0( w_fp[1], w_fp[8], w_fp[23], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28484,7 +28484,7 @@ namespace mg5amcCpu jamp_sv[101] -= amp_sv[0]; jamp_sv[112] -= amp_sv[0]; jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[68], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[68], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28504,7 +28504,7 @@ namespace mg5amcCpu jamp_sv[109] += amp_sv[0]; jamp_sv[112] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[29], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28531,7 +28531,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1184 - FFV1_0( w_fp[3], w_fp[47], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28543,7 +28543,7 @@ namespace mg5amcCpu jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28555,7 +28555,7 @@ namespace mg5amcCpu jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28574,7 +28574,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1185 - FFV1_0( w_fp[16], w_fp[47], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[16], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28582,7 +28582,7 @@ namespace mg5amcCpu jamp_sv[103] -= amp_sv[0]; 
jamp_sv[105] -= amp_sv[0]; jamp_sv[107] += amp_sv[0]; - FFV1_0( w_fp[27], w_fp[47], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[27], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28590,7 +28590,7 @@ namespace mg5amcCpu jamp_sv[104] += amp_sv[0]; jamp_sv[105] -= amp_sv[0]; jamp_sv[106] += amp_sv[0]; - FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28605,7 +28605,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1186 - FFV1_0( w_fp[41], w_fp[2], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28617,7 +28617,7 @@ namespace mg5amcCpu jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28629,7 +28629,7 @@ namespace mg5amcCpu jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28648,7 +28648,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1187 - FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28656,7 +28656,7 @@ namespace mg5amcCpu jamp_sv[14] -= amp_sv[0]; jamp_sv[60] -= amp_sv[0]; jamp_sv[84] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[71], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[71], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28664,7 +28664,7 @@ namespace mg5amcCpu jamp_sv[50] += amp_sv[0]; jamp_sv[60] -= amp_sv[0]; jamp_sv[74] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[21], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[21], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28676,15 +28676,15 @@ namespace mg5amcCpu // *** DIAGRAM 1188 OF 1240 *** // Wavefunction(s) for diagram number 1188 - VVVV1P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[21] ); - VVVV3P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[71] ); - VVVV4P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 0., 0., w_fp[59] ); - FFV1_2( w_fp[3], w_fp[21], COUPs[1], cIPD[0], cIPD[1], 
w_fp[24] ); - FFV1_2( w_fp[3], w_fp[71], COUPs[1], cIPD[0], cIPD[1], w_fp[60] ); - FFV1_2( w_fp[3], w_fp[59], COUPs[1], cIPD[0], cIPD[1], w_fp[72] ); + VVVV1P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[71] ); + VVVV4P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[59] ); + FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] ); + FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); + FFV1_2( w_fp[3], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] ); // Amplitude(s) for diagram number 1188 - FFV1_0( w_fp[24], w_fp[77], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[24], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28692,7 +28692,7 @@ namespace mg5amcCpu jamp_sv[37] -= amp_sv[0]; jamp_sv[39] -= amp_sv[0]; jamp_sv[41] += amp_sv[0]; - FFV1_0( w_fp[60], w_fp[77], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28700,7 +28700,7 @@ namespace mg5amcCpu jamp_sv[38] += amp_sv[0]; jamp_sv[39] -= amp_sv[0]; jamp_sv[40] += amp_sv[0]; - FFV1_0( w_fp[72], w_fp[77], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[72], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28712,12 +28712,12 @@ namespace mg5amcCpu // *** DIAGRAM 1189 OF 1240 *** // Wavefunction(s) for diagram number 1189 - VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 0., 0., w_fp[98] ); - VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 0., 0., w_fp[27] ); - VVV1P0_1( w_fp[59], w_fp[5], COUPs[0], 0., 0., w_fp[16] ); + VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[98] ); + VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[27] ); + VVV1P0_1( w_fp[59], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] ); // Amplitude(s) for diagram number 1189 - FFV1_0( w_fp[3], w_fp[77], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28729,7 +28729,7 @@ namespace mg5amcCpu jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28741,7 +28741,7 @@ namespace mg5amcCpu jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28760,7 +28760,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1190 - FFV1_0( w_fp[38], w_fp[77], 
w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[77], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28768,7 +28768,7 @@ namespace mg5amcCpu jamp_sv[28] -= amp_sv[0]; jamp_sv[34] -= amp_sv[0]; jamp_sv[44] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[77], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[77], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28776,7 +28776,7 @@ namespace mg5amcCpu jamp_sv[31] += amp_sv[0]; jamp_sv[34] -= amp_sv[0]; jamp_sv[42] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[77], w_fp[59], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[77], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28788,12 +28788,12 @@ namespace mg5amcCpu // *** DIAGRAM 1191 OF 1240 *** // Wavefunction(s) for diagram number 1191 - FFV1_1( w_fp[2], w_fp[21], COUPs[1], cIPD[0], cIPD[1], w_fp[29] ); - FFV1_1( w_fp[2], w_fp[71], COUPs[1], cIPD[0], cIPD[1], w_fp[68] ); - FFV1_1( w_fp[2], w_fp[59], COUPs[1], cIPD[0], cIPD[1], w_fp[23] ); + FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] ); + FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); + FFV1_1( w_fp[2], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); // Amplitude(s) for diagram number 1191 - FFV1_0( w_fp[52], w_fp[29], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[29], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28801,7 +28801,7 @@ namespace mg5amcCpu jamp_sv[21] -= amp_sv[0]; jamp_sv[67] -= amp_sv[0]; jamp_sv[109] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[68], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[68], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28809,7 +28809,7 @@ namespace mg5amcCpu jamp_sv[53] += amp_sv[0]; jamp_sv[67] -= amp_sv[0]; jamp_sv[99] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[23], w_fp[5], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28824,7 +28824,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1192 - FFV1_0( w_fp[52], w_fp[2], w_fp[98], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28836,7 +28836,7 @@ namespace mg5amcCpu jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28848,7 +28848,7 @@ 
namespace mg5amcCpu jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28867,7 +28867,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1193 - FFV1_0( w_fp[52], w_fp[39], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28875,7 +28875,7 @@ namespace mg5amcCpu jamp_sv[77] -= amp_sv[0]; jamp_sv[88] -= amp_sv[0]; jamp_sv[94] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[39], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28883,7 +28883,7 @@ namespace mg5amcCpu jamp_sv[85] += amp_sv[0]; jamp_sv[88] -= amp_sv[0]; jamp_sv[91] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[39], w_fp[59], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[39], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28898,7 +28898,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1194 - FFV1_0( w_fp[3], w_fp[29], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[29], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28910,7 +28910,7 @@ namespace mg5amcCpu jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[68], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[68], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28922,7 +28922,7 @@ namespace mg5amcCpu jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[23], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[23], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28941,7 +28941,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1195 - FFV1_0( w_fp[24], w_fp[2], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[24], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28953,7 +28953,7 @@ namespace mg5amcCpu jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[60], w_fp[2], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28965,7 +28965,7 @@ namespace mg5amcCpu jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[72], w_fp[2], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[72], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -28984,7 +28984,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1196 - VVV1_0( w_fp[21], w_fp[66], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[21], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29004,7 +29004,7 @@ namespace mg5amcCpu jamp_sv[83] -= amp_sv[0]; jamp_sv[108] -= amp_sv[0]; jamp_sv[109] += amp_sv[0]; - VVV1_0( w_fp[71], w_fp[66], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[71], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29024,7 +29024,7 @@ namespace mg5amcCpu jamp_sv[82] -= amp_sv[0]; jamp_sv[98] -= amp_sv[0]; jamp_sv[99] += amp_sv[0]; - VVV1_0( w_fp[59], w_fp[66], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[59], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29051,7 +29051,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1197 - VVVV1_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29071,7 +29071,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[94] -= amp_sv[0]; jamp_sv[108] += amp_sv[0]; - VVVV3_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29091,7 +29091,7 @@ namespace mg5amcCpu jamp_sv[83] += amp_sv[0]; jamp_sv[108] += amp_sv[0]; jamp_sv[109] -= amp_sv[0]; - VVVV4_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29111,7 +29111,7 @@ namespace mg5amcCpu jamp_sv[88] -= amp_sv[0]; jamp_sv[94] += amp_sv[0]; jamp_sv[109] -= amp_sv[0]; - VVVV1_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29131,7 +29131,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[91] -= amp_sv[0]; jamp_sv[98] += amp_sv[0]; - VVVV3_0( w_fp[71], w_fp[1], 
w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29151,7 +29151,7 @@ namespace mg5amcCpu jamp_sv[82] += amp_sv[0]; jamp_sv[98] += amp_sv[0]; jamp_sv[99] -= amp_sv[0]; - VVVV4_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29171,7 +29171,7 @@ namespace mg5amcCpu jamp_sv[88] -= amp_sv[0]; jamp_sv[91] += amp_sv[0]; jamp_sv[99] -= amp_sv[0]; - VVVV1_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29191,7 +29191,7 @@ namespace mg5amcCpu jamp_sv[94] += amp_sv[0]; jamp_sv[98] += amp_sv[0]; jamp_sv[108] -= amp_sv[0]; - VVVV3_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29211,7 +29211,7 @@ namespace mg5amcCpu jamp_sv[99] -= amp_sv[0]; jamp_sv[108] -= amp_sv[0]; jamp_sv[109] += amp_sv[0]; - VVVV4_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29235,12 +29235,12 @@ namespace mg5amcCpu // *** DIAGRAM 1198 OF 1240 *** // Wavefunction(s) for diagram number 1198 - VVV1P0_1( w_fp[21], w_fp[1], COUPs[0], 0., 0., w_fp[66] ); - VVV1P0_1( w_fp[71], w_fp[1], COUPs[0], 0., 0., w_fp[21] ); - VVV1P0_1( w_fp[59], w_fp[1], COUPs[0], 0., 0., w_fp[71] ); + VVV1P0_1( w_fp[21], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[66] ); + VVV1P0_1( w_fp[71], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] ); + VVV1P0_1( w_fp[59], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[71] ); // Amplitude(s) for diagram number 1198 - VVV1_0( w_fp[8], w_fp[5], w_fp[66], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[66], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29260,7 +29260,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[94] -= amp_sv[0]; jamp_sv[108] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29280,7 +29280,7 @@ namespace mg5amcCpu jamp_sv[88] += amp_sv[0]; jamp_sv[91] -= amp_sv[0]; jamp_sv[98] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[71], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[71], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29307,7 
+29307,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1199 - VVV1_0( w_fp[1], w_fp[8], w_fp[98], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[98], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29327,7 +29327,7 @@ namespace mg5amcCpu jamp_sv[88] -= amp_sv[0]; jamp_sv[94] += amp_sv[0]; jamp_sv[109] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[27], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29347,7 +29347,7 @@ namespace mg5amcCpu jamp_sv[88] -= amp_sv[0]; jamp_sv[91] += amp_sv[0]; jamp_sv[99] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[16], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29374,7 +29374,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1200 - FFV1_0( w_fp[3], w_fp[39], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29386,7 +29386,7 @@ namespace mg5amcCpu jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29398,7 +29398,7 @@ namespace mg5amcCpu jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29417,7 +29417,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1201 - FFV1_0( w_fp[24], w_fp[39], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[24], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29425,7 +29425,7 @@ namespace mg5amcCpu jamp_sv[79] -= amp_sv[0]; jamp_sv[81] -= amp_sv[0]; jamp_sv[83] += amp_sv[0]; - FFV1_0( w_fp[60], w_fp[39], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[60], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29433,7 +29433,7 @@ namespace mg5amcCpu jamp_sv[80] += amp_sv[0]; jamp_sv[81] -= amp_sv[0]; jamp_sv[82] += amp_sv[0]; - FFV1_0( w_fp[72], w_fp[39], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[72], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) #endif @@ -29448,7 +29448,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1202 - FFV1_0( w_fp[38], w_fp[2], w_fp[66], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29460,7 +29460,7 @@ namespace mg5amcCpu jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29472,7 +29472,7 @@ namespace mg5amcCpu jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[71], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29491,7 +29491,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1203 - FFV1_0( w_fp[38], w_fp[29], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[29], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29499,7 +29499,7 @@ namespace mg5amcCpu jamp_sv[20] -= amp_sv[0]; jamp_sv[66] -= amp_sv[0]; jamp_sv[108] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[68], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[68], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29507,7 +29507,7 @@ namespace mg5amcCpu jamp_sv[52] += amp_sv[0]; jamp_sv[66] -= amp_sv[0]; jamp_sv[98] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[23], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[23], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29519,15 +29519,15 @@ namespace mg5amcCpu // *** DIAGRAM 1204 OF 1240 *** // Wavefunction(s) for diagram number 1204 - VVVV1P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[23] ); - VVVV3P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[68] ); - VVVV4P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 0., 0., w_fp[29] ); - FFV1_2( w_fp[3], w_fp[23], COUPs[1], cIPD[0], cIPD[1], w_fp[71] ); - FFV1_2( w_fp[3], w_fp[68], COUPs[1], cIPD[0], cIPD[1], w_fp[21] ); - FFV1_2( w_fp[3], w_fp[29], COUPs[1], cIPD[0], cIPD[1], w_fp[66] ); + VVVV1P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] ); + VVVV3P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[68] ); + VVVV4P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[29] ); + FFV1_2( w_fp[3], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); + FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] ); // Amplitude(s) for diagram number 1204 - FFV1_0( w_fp[71], w_fp[77], w_fp[4], COUPs[1], &_fp[0] ); + 
FFV1_0( w_fp[71], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29535,7 +29535,7 @@ namespace mg5amcCpu jamp_sv[31] -= amp_sv[0]; jamp_sv[33] -= amp_sv[0]; jamp_sv[35] += amp_sv[0]; - FFV1_0( w_fp[21], w_fp[77], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29543,7 +29543,7 @@ namespace mg5amcCpu jamp_sv[32] += amp_sv[0]; jamp_sv[33] -= amp_sv[0]; jamp_sv[34] += amp_sv[0]; - FFV1_0( w_fp[66], w_fp[77], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[66], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29555,12 +29555,12 @@ namespace mg5amcCpu // *** DIAGRAM 1205 OF 1240 *** // Wavefunction(s) for diagram number 1205 - VVV1P0_1( w_fp[23], w_fp[4], COUPs[0], 0., 0., w_fp[72] ); - VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 0., 0., w_fp[60] ); - VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 0., 0., w_fp[24] ); + VVV1P0_1( w_fp[23], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[72] ); + VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[60] ); + VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[24] ); // Amplitude(s) for diagram number 1205 - FFV1_0( w_fp[3], w_fp[77], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29572,7 +29572,7 @@ namespace mg5amcCpu jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[60], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29584,7 +29584,7 @@ namespace mg5amcCpu jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29603,7 +29603,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1206 - FFV1_0( w_fp[46], w_fp[77], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[77], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29611,7 +29611,7 @@ namespace mg5amcCpu jamp_sv[29] -= amp_sv[0]; jamp_sv[40] -= amp_sv[0]; jamp_sv[46] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[77], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[77], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29619,7 +29619,7 @@ namespace mg5amcCpu jamp_sv[37] 
+= amp_sv[0]; jamp_sv[40] -= amp_sv[0]; jamp_sv[43] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29631,12 +29631,12 @@ namespace mg5amcCpu // *** DIAGRAM 1207 OF 1240 *** // Wavefunction(s) for diagram number 1207 - FFV1_1( w_fp[2], w_fp[23], COUPs[1], cIPD[0], cIPD[1], w_fp[77] ); - FFV1_1( w_fp[2], w_fp[68], COUPs[1], cIPD[0], cIPD[1], w_fp[16] ); - FFV1_1( w_fp[2], w_fp[29], COUPs[1], cIPD[0], cIPD[1], w_fp[27] ); + FFV1_1( w_fp[2], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] ); + FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] ); // Amplitude(s) for diagram number 1207 - FFV1_0( w_fp[52], w_fp[77], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29644,7 +29644,7 @@ namespace mg5amcCpu jamp_sv[23] -= amp_sv[0]; jamp_sv[91] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[16], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29652,7 +29652,7 @@ namespace mg5amcCpu jamp_sv[77] += amp_sv[0]; jamp_sv[91] -= amp_sv[0]; jamp_sv[101] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[27], w_fp[4], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[27], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29667,7 +29667,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1208 - FFV1_0( w_fp[52], w_fp[2], w_fp[72], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29679,7 +29679,7 @@ namespace mg5amcCpu jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[60], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29691,7 +29691,7 @@ namespace mg5amcCpu jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[24], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29710,7 +29710,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1209 - FFV1_0( w_fp[52], w_fp[33], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code 
base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29718,7 +29718,7 @@ namespace mg5amcCpu jamp_sv[53] -= amp_sv[0]; jamp_sv[64] -= amp_sv[0]; jamp_sv[70] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[33], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29726,7 +29726,7 @@ namespace mg5amcCpu jamp_sv[61] += amp_sv[0]; jamp_sv[64] -= amp_sv[0]; jamp_sv[67] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[33], w_fp[29], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[52], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29741,7 +29741,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1210 - FFV1_0( w_fp[3], w_fp[77], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[77], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29753,7 +29753,7 @@ namespace mg5amcCpu jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[16], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[16], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29765,7 +29765,7 @@ namespace mg5amcCpu jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[27], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[27], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29784,7 +29784,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1211 - FFV1_0( w_fp[71], w_fp[2], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[71], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29796,7 +29796,7 @@ namespace mg5amcCpu jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[21], w_fp[2], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29808,7 +29808,7 @@ namespace mg5amcCpu jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[66], w_fp[2], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[66], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29827,7 +29827,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1212 - VVV1_0( w_fp[23], 
w_fp[61], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[23], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29847,7 +29847,7 @@ namespace mg5amcCpu jamp_sv[91] -= amp_sv[0]; jamp_sv[114] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[68], w_fp[61], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[68], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29867,7 +29867,7 @@ namespace mg5amcCpu jamp_sv[91] -= amp_sv[0]; jamp_sv[100] -= amp_sv[0]; jamp_sv[101] += amp_sv[0]; - VVV1_0( w_fp[29], w_fp[61], w_fp[8], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[29], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29894,7 +29894,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1213 - VVVV1_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29914,7 +29914,7 @@ namespace mg5amcCpu jamp_sv[70] -= amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[114] += amp_sv[0]; - VVVV3_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29934,7 +29934,7 @@ namespace mg5amcCpu jamp_sv[91] += amp_sv[0]; jamp_sv[114] += amp_sv[0]; jamp_sv[115] -= amp_sv[0]; - VVVV4_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29954,7 +29954,7 @@ namespace mg5amcCpu jamp_sv[70] += amp_sv[0]; jamp_sv[91] += amp_sv[0]; jamp_sv[115] -= amp_sv[0]; - VVVV1_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29974,7 +29974,7 @@ namespace mg5amcCpu jamp_sv[76] += amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[100] += amp_sv[0]; - VVVV3_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -29994,7 +29994,7 @@ namespace mg5amcCpu jamp_sv[91] += amp_sv[0]; jamp_sv[100] += amp_sv[0]; jamp_sv[101] -= amp_sv[0]; - VVVV4_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30014,7 +30014,7 @@ 
namespace mg5amcCpu jamp_sv[77] -= amp_sv[0]; jamp_sv[91] += amp_sv[0]; jamp_sv[101] -= amp_sv[0]; - VVVV1_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30034,7 +30034,7 @@ namespace mg5amcCpu jamp_sv[76] += amp_sv[0]; jamp_sv[100] += amp_sv[0]; jamp_sv[114] -= amp_sv[0]; - VVVV3_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30054,7 +30054,7 @@ namespace mg5amcCpu jamp_sv[101] -= amp_sv[0]; jamp_sv[114] -= amp_sv[0]; jamp_sv[115] += amp_sv[0]; - VVVV4_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30078,12 +30078,12 @@ namespace mg5amcCpu // *** DIAGRAM 1214 OF 1240 *** // Wavefunction(s) for diagram number 1214 - VVV1P0_1( w_fp[23], w_fp[1], COUPs[0], 0., 0., w_fp[61] ); - VVV1P0_1( w_fp[68], w_fp[1], COUPs[0], 0., 0., w_fp[23] ); - VVV1P0_1( w_fp[29], w_fp[1], COUPs[0], 0., 0., w_fp[68] ); + VVV1P0_1( w_fp[23], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[61] ); + VVV1P0_1( w_fp[68], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[23] ); + VVV1P0_1( w_fp[29], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[68] ); // Amplitude(s) for diagram number 1214 - VVV1_0( w_fp[8], w_fp[4], w_fp[61], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[61], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30103,7 +30103,7 @@ namespace mg5amcCpu jamp_sv[70] -= amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[114] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30123,7 +30123,7 @@ namespace mg5amcCpu jamp_sv[76] += amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[100] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[68], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[68], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30150,7 +30150,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1215 - VVV1_0( w_fp[1], w_fp[8], w_fp[72], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[72], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30170,7 +30170,7 @@ namespace mg5amcCpu jamp_sv[70] += amp_sv[0]; jamp_sv[91] += amp_sv[0]; jamp_sv[115] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[60], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[60], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support 
updates numerators_sv and denominators_sv (#473) #endif @@ -30190,7 +30190,7 @@ namespace mg5amcCpu jamp_sv[77] -= amp_sv[0]; jamp_sv[91] += amp_sv[0]; jamp_sv[101] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[24], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[1], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30217,7 +30217,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1216 - FFV1_0( w_fp[3], w_fp[33], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30229,7 +30229,7 @@ namespace mg5amcCpu jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30241,7 +30241,7 @@ namespace mg5amcCpu jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30260,7 +30260,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1217 - FFV1_0( w_fp[71], w_fp[33], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[71], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30268,7 +30268,7 @@ namespace mg5amcCpu jamp_sv[55] -= amp_sv[0]; jamp_sv[57] -= amp_sv[0]; jamp_sv[59] += amp_sv[0]; - FFV1_0( w_fp[21], w_fp[33], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[21], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30276,7 +30276,7 @@ namespace mg5amcCpu jamp_sv[56] += amp_sv[0]; jamp_sv[57] -= amp_sv[0]; jamp_sv[58] += amp_sv[0]; - FFV1_0( w_fp[66], w_fp[33], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[66], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30291,7 +30291,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1218 - FFV1_0( w_fp[46], w_fp[2], w_fp[61], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30303,7 +30303,7 @@ namespace mg5amcCpu jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] 
); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30315,7 +30315,7 @@ namespace mg5amcCpu jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[68], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30334,7 +30334,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1219 - FFV1_0( w_fp[46], w_fp[77], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[77], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30342,7 +30342,7 @@ namespace mg5amcCpu jamp_sv[22] -= amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[114] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[16], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[16], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30350,7 +30350,7 @@ namespace mg5amcCpu jamp_sv[76] += amp_sv[0]; jamp_sv[90] -= amp_sv[0]; jamp_sv[100] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[27], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[27], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30365,7 +30365,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1220 - VVVV1_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30385,7 +30385,7 @@ namespace mg5amcCpu jamp_sv[107] -= amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30405,7 +30405,7 @@ namespace mg5amcCpu jamp_sv[107] -= amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30425,7 +30425,7 @@ namespace mg5amcCpu jamp_sv[97] -= amp_sv[0]; jamp_sv[99] -= amp_sv[0]; jamp_sv[101] += amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30445,7 +30445,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[117] += amp_sv[0]; - VVVV3_0( 
w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30465,7 +30465,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[117] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30485,7 +30485,7 @@ namespace mg5amcCpu jamp_sv[98] += amp_sv[0]; jamp_sv[99] -= amp_sv[0]; jamp_sv[100] += amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30505,7 +30505,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[117] += amp_sv[0]; jamp_sv[119] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30525,7 +30525,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[117] += amp_sv[0]; jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30549,12 +30549,12 @@ namespace mg5amcCpu // *** DIAGRAM 1221 OF 1240 *** // Wavefunction(s) for diagram number 1221 - VVV1P0_1( w_fp[0], w_fp[73], COUPs[0], 0., 0., w_fp[27] ); - VVV1P0_1( w_fp[0], w_fp[79], COUPs[0], 0., 0., w_fp[1] ); - VVV1P0_1( w_fp[0], w_fp[80], COUPs[0], 0., 0., w_fp[16] ); + VVV1P0_1( w_fp[0], w_fp[73], COUPs[0], 1.0, 0., 0., w_fp[27] ); + VVV1P0_1( w_fp[0], w_fp[79], COUPs[0], 1.0, 0., 0., w_fp[1] ); + VVV1P0_1( w_fp[0], w_fp[80], COUPs[0], 1.0, 0., 0., w_fp[16] ); // Amplitude(s) for diagram number 1221 - VVV1_0( w_fp[8], w_fp[6], w_fp[27], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[27], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30574,7 +30574,7 @@ namespace mg5amcCpu jamp_sv[107] -= amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[1], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[1], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30594,7 +30594,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[117] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv 
(#473) #endif @@ -30621,7 +30621,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1222 - VVV1_0( w_fp[73], w_fp[6], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[73], w_fp[6], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30641,7 +30641,7 @@ namespace mg5amcCpu jamp_sv[107] -= amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[79], w_fp[6], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[79], w_fp[6], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30661,7 +30661,7 @@ namespace mg5amcCpu jamp_sv[111] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; jamp_sv[117] += amp_sv[0]; - VVV1_0( w_fp[80], w_fp[6], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[80], w_fp[6], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30688,7 +30688,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1223 - FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30700,7 +30700,7 @@ namespace mg5amcCpu jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30712,7 +30712,7 @@ namespace mg5amcCpu jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30731,7 +30731,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1224 - FFV1_0( w_fp[3], w_fp[113], w_fp[73], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[113], w_fp[73], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30739,7 +30739,7 @@ namespace mg5amcCpu jamp_sv[97] -= amp_sv[0]; jamp_sv[99] -= amp_sv[0]; jamp_sv[101] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[113], w_fp[79], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[113], w_fp[79], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30747,7 +30747,7 @@ namespace mg5amcCpu jamp_sv[98] += amp_sv[0]; jamp_sv[99] -= amp_sv[0]; jamp_sv[100] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[113], w_fp[80], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[113], w_fp[80], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with 
multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30762,7 +30762,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1225 - FFV1_0( w_fp[41], w_fp[2], w_fp[27], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30774,7 +30774,7 @@ namespace mg5amcCpu jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[1], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30786,7 +30786,7 @@ namespace mg5amcCpu jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30805,7 +30805,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1226 - FFV1_0( w_fp[62], w_fp[2], w_fp[73], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[2], w_fp[73], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30813,7 +30813,7 @@ namespace mg5amcCpu jamp_sv[38] -= amp_sv[0]; jamp_sv[62] -= amp_sv[0]; jamp_sv[86] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[2], w_fp[79], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30821,7 +30821,7 @@ namespace mg5amcCpu jamp_sv[56] += amp_sv[0]; jamp_sv[62] -= amp_sv[0]; jamp_sv[80] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[2], w_fp[80], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[62], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30836,7 +30836,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1227 - VVVV1_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30856,7 +30856,7 @@ namespace mg5amcCpu jamp_sv[89] -= amp_sv[0]; jamp_sv[95] += amp_sv[0]; jamp_sv[110] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30876,7 +30876,7 @@ namespace mg5amcCpu jamp_sv[89] -= amp_sv[0]; jamp_sv[95] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], 
COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30896,7 +30896,7 @@ namespace mg5amcCpu jamp_sv[77] += amp_sv[0]; jamp_sv[110] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30916,7 +30916,7 @@ namespace mg5amcCpu jamp_sv[89] -= amp_sv[0]; jamp_sv[93] += amp_sv[0]; jamp_sv[104] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30936,7 +30936,7 @@ namespace mg5amcCpu jamp_sv[89] -= amp_sv[0]; jamp_sv[93] += amp_sv[0]; jamp_sv[105] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30956,7 +30956,7 @@ namespace mg5amcCpu jamp_sv[76] += amp_sv[0]; jamp_sv[104] += amp_sv[0]; jamp_sv[105] -= amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30976,7 +30976,7 @@ namespace mg5amcCpu jamp_sv[95] -= amp_sv[0]; jamp_sv[104] -= amp_sv[0]; jamp_sv[110] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -30996,7 +30996,7 @@ namespace mg5amcCpu jamp_sv[95] -= amp_sv[0]; jamp_sv[105] -= amp_sv[0]; jamp_sv[111] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31020,12 +31020,12 @@ namespace mg5amcCpu // *** DIAGRAM 1228 OF 1240 *** // Wavefunction(s) for diagram number 1228 - VVV1P0_1( w_fp[0], w_fp[57], COUPs[0], 0., 0., w_fp[62] ); - VVV1P0_1( w_fp[0], w_fp[81], COUPs[0], 0., 0., w_fp[80] ); - VVV1P0_1( w_fp[0], w_fp[82], COUPs[0], 0., 0., w_fp[79] ); + VVV1P0_1( w_fp[0], w_fp[57], COUPs[0], 1.0, 0., 0., w_fp[62] ); + VVV1P0_1( w_fp[0], w_fp[81], COUPs[0], 1.0, 0., 0., w_fp[80] ); + VVV1P0_1( w_fp[0], w_fp[82], COUPs[0], 1.0, 0., 0., w_fp[79] ); // Amplitude(s) for diagram number 1228 - VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31045,7 +31045,7 @@ namespace mg5amcCpu jamp_sv[89] 
-= amp_sv[0]; jamp_sv[95] += amp_sv[0]; jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[80], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[80], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31065,7 +31065,7 @@ namespace mg5amcCpu jamp_sv[89] -= amp_sv[0]; jamp_sv[93] += amp_sv[0]; jamp_sv[104] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[79], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[5], w_fp[79], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31092,7 +31092,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1229 - VVV1_0( w_fp[57], w_fp[5], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[57], w_fp[5], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31112,7 +31112,7 @@ namespace mg5amcCpu jamp_sv[89] -= amp_sv[0]; jamp_sv[95] += amp_sv[0]; jamp_sv[111] -= amp_sv[0]; - VVV1_0( w_fp[81], w_fp[5], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[81], w_fp[5], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31132,7 +31132,7 @@ namespace mg5amcCpu jamp_sv[89] -= amp_sv[0]; jamp_sv[93] += amp_sv[0]; jamp_sv[105] -= amp_sv[0]; - VVV1_0( w_fp[82], w_fp[5], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[82], w_fp[5], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31159,7 +31159,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1230 - FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31171,7 +31171,7 @@ namespace mg5amcCpu jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[80], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[80], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31183,7 +31183,7 @@ namespace mg5amcCpu jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[79], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[39], w_fp[79], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31202,7 +31202,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1231 - FFV1_0( w_fp[3], w_fp[102], w_fp[57], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[102], w_fp[57], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ 
-31210,7 +31210,7 @@ namespace mg5amcCpu jamp_sv[73] -= amp_sv[0]; jamp_sv[75] -= amp_sv[0]; jamp_sv[77] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[102], w_fp[81], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[102], w_fp[81], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31218,7 +31218,7 @@ namespace mg5amcCpu jamp_sv[74] += amp_sv[0]; jamp_sv[75] -= amp_sv[0]; jamp_sv[76] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[102], w_fp[82], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[102], w_fp[82], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31233,7 +31233,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1232 - FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31245,7 +31245,7 @@ namespace mg5amcCpu jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[80], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31257,7 +31257,7 @@ namespace mg5amcCpu jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[79], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[38], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31276,7 +31276,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1233 - FFV1_0( w_fp[104], w_fp[2], w_fp[57], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[2], w_fp[57], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31284,7 +31284,7 @@ namespace mg5amcCpu jamp_sv[44] -= amp_sv[0]; jamp_sv[68] -= amp_sv[0]; jamp_sv[110] += amp_sv[0]; - FFV1_0( w_fp[104], w_fp[2], w_fp[81], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[2], w_fp[81], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31292,7 +31292,7 @@ namespace mg5amcCpu jamp_sv[58] += amp_sv[0]; jamp_sv[68] -= amp_sv[0]; jamp_sv[104] += amp_sv[0]; - FFV1_0( w_fp[104], w_fp[2], w_fp[82], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[104], w_fp[2], w_fp[82], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31307,7 +31307,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1234 - VVVV1_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with 
multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31327,7 +31327,7 @@ namespace mg5amcCpu jamp_sv[71] += amp_sv[0]; jamp_sv[92] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31347,7 +31347,7 @@ namespace mg5amcCpu jamp_sv[71] += amp_sv[0]; jamp_sv[93] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31367,7 +31367,7 @@ namespace mg5amcCpu jamp_sv[93] += amp_sv[0]; jamp_sv[116] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31387,7 +31387,7 @@ namespace mg5amcCpu jamp_sv[82] -= amp_sv[0]; jamp_sv[92] += amp_sv[0]; jamp_sv[106] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31407,7 +31407,7 @@ namespace mg5amcCpu jamp_sv[83] -= amp_sv[0]; jamp_sv[93] += amp_sv[0]; jamp_sv[107] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31427,7 +31427,7 @@ namespace mg5amcCpu jamp_sv[93] += amp_sv[0]; jamp_sv[106] += amp_sv[0]; jamp_sv[107] -= amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV1_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31447,7 +31447,7 @@ namespace mg5amcCpu jamp_sv[82] -= amp_sv[0]; jamp_sv[106] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV3_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31467,7 +31467,7 @@ namespace mg5amcCpu jamp_sv[83] -= amp_sv[0]; jamp_sv[107] -= amp_sv[0]; jamp_sv[117] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], &_fp[0] ); + VVVV4_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31491,12 +31491,12 @@ namespace mg5amcCpu // *** DIAGRAM 1235 OF 1240 *** // Wavefunction(s) for diagram number 1235 - VVV1P0_1( 
w_fp[0], w_fp[55], COUPs[0], 0., 0., w_fp[104] ); - VVV1P0_1( w_fp[0], w_fp[83], COUPs[0], 0., 0., w_fp[82] ); - VVV1P0_1( w_fp[0], w_fp[84], COUPs[0], 0., 0., w_fp[81] ); + VVV1P0_1( w_fp[0], w_fp[55], COUPs[0], 1.0, 0., 0., w_fp[104] ); + VVV1P0_1( w_fp[0], w_fp[83], COUPs[0], 1.0, 0., 0., w_fp[82] ); + VVV1P0_1( w_fp[0], w_fp[84], COUPs[0], 1.0, 0., 0., w_fp[81] ); // Amplitude(s) for diagram number 1235 - VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31516,7 +31516,7 @@ namespace mg5amcCpu jamp_sv[71] += amp_sv[0]; jamp_sv[92] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[82], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[82], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31536,7 +31536,7 @@ namespace mg5amcCpu jamp_sv[82] -= amp_sv[0]; jamp_sv[92] += amp_sv[0]; jamp_sv[106] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[81], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[8], w_fp[4], w_fp[81], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31563,7 +31563,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1236 - VVV1_0( w_fp[55], w_fp[4], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[55], w_fp[4], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31583,7 +31583,7 @@ namespace mg5amcCpu jamp_sv[71] += amp_sv[0]; jamp_sv[93] += amp_sv[0]; jamp_sv[117] -= amp_sv[0]; - VVV1_0( w_fp[83], w_fp[4], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[83], w_fp[4], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31603,7 +31603,7 @@ namespace mg5amcCpu jamp_sv[83] -= amp_sv[0]; jamp_sv[93] += amp_sv[0]; jamp_sv[107] -= amp_sv[0]; - VVV1_0( w_fp[84], w_fp[4], w_fp[56], COUPs[0], &_fp[0] ); + VVV1_0( w_fp[84], w_fp[4], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31630,7 +31630,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1237 - FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31642,7 +31642,7 @@ namespace mg5amcCpu jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[82], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[82], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31654,7 +31654,7 @@ namespace mg5amcCpu jamp_sv[63] += cxtype( 0, 1 ) * 
amp_sv[0]; jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[81], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[33], w_fp[81], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31673,7 +31673,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1238 - FFV1_0( w_fp[3], w_fp[114], w_fp[55], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[114], w_fp[55], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31681,7 +31681,7 @@ namespace mg5amcCpu jamp_sv[49] -= amp_sv[0]; jamp_sv[51] -= amp_sv[0]; jamp_sv[53] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[114], w_fp[83], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[114], w_fp[83], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31689,7 +31689,7 @@ namespace mg5amcCpu jamp_sv[50] += amp_sv[0]; jamp_sv[51] -= amp_sv[0]; jamp_sv[52] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[114], w_fp[84], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[114], w_fp[84], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31704,7 +31704,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1239 - FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31716,7 +31716,7 @@ namespace mg5amcCpu jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[82], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[82], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31728,7 +31728,7 @@ namespace mg5amcCpu jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[81], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[46], w_fp[2], w_fp[81], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31747,7 +31747,7 @@ namespace mg5amcCpu // (none) // Amplitude(s) for diagram number 1240 - FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -31755,7 +31755,7 @@ namespace mg5amcCpu jamp_sv[46] -= amp_sv[0]; jamp_sv[92] -= amp_sv[0]; jamp_sv[116] += amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv 
and denominators_sv (#473) #endif @@ -31763,7 +31763,7 @@ namespace mg5amcCpu jamp_sv[82] += amp_sv[0]; jamp_sv[92] -= amp_sv[0]; jamp_sv[106] += amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], &_fp[0] ); + FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -32370,12 +32370,12 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - fptype allMEsLast = 0; + { /* clang-format on */ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid - allMEs[ievt] = 0; for( int ihel = 0; ihel < ncomb; ihel++ ) { + // NEW IMPLEMENTATION OF GETGOODHEL (#630): RESET THE RUNNING SUM OVER HELICITIES TO 0 BEFORE ADDING A NEW HELICITY + allMEs[ievt] = 0; // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -32384,12 +32384,11 @@ namespace mg5amcCpu #else calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); #endif - if( allMEs[ievt] != allMEsLast ) + if( allMEs[ievt] != 0 ) // NEW IMPLEMENTATION OF GETGOODHEL (#630): COMPARE EACH HELICITY CONTRIBUTION TO 0 { //if ( !isGoodHel[ihel] ) std::cout << "sigmaKin_getGoodHel ihel=" << ihel << " TRUE" << std::endl; isGoodHel[ihel] = true; } - allMEsLast = allMEs[ievt]; // running sum up to helicity ihel for event ievt } } #else @@ -32408,19 +32407,11 @@ namespace mg5amcCpu //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] // Allocate arrays at build time to contain at least 16 events (or at least neppV events if neppV>16, e.g. 
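// Illustrative standalone sketch (hypothetical names, not part of the generated code) of the
// new getGoodHel logic of #630 shown above: instead of tracking the previous running sum in
// allMEsLast and flagging a helicity when the sum changes, the running sum is reset to zero
// before each helicity, so each contribution can be compared to zero directly.
#include <vector>
inline void getGoodHelSketch( const std::vector<double>& meContrib, // assumed input: |M|^2 contribution of each helicity
                              std::vector<bool>& isGoodHel )        // output: one 'good helicity' flag per helicity
{
  const int ncomb = (int)meContrib.size();
  for( int ihel = 0; ihel < ncomb; ihel++ )
  {
    double runningSum = 0;         // reset the running sum over helicities before adding a new one (#630)
    runningSum += meContrib[ihel]; // the real code ADDS |M|^2 for ihel inside calculate_wavefunctions
    if( runningSum != 0 ) isGoodHel[ihel] = true; // a non-zero contribution marks a good helicity
  }
}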
in future VPUs) constexpr int maxtry0 = std::max( 16, neppV ); // 16, but at least neppV (otherwise the npagV loop does not even start) - fptype allMEsLast[maxtry0] = { 0 }; // allocated at build time: maxtry0 must be a constexpr // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt1 was last observed for "mirror processes" in uux_ttx in the 270 branch (see issue #343 and PRs #360 and #396) + // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; - static_assert( nprocesses == 1, "Assume nprocesses == 1" ); - // process_id corresponds to the index of DSIG1 Fortran functions (must be 1 because cudacpp is unable to handle DSIG2) + static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } @@ -32614,23 +32610,26 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 - fptype targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + const unsigned int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + fptype targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } } #endif @@ -32725,57 +32724,60 @@ namespace mg5amcCpu #endif } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 // Event-by-event random choice of color #402 - fptype_sv targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } + const unsigned int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + fptype_sv targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = fptype_sv{ 0 }; + else + 
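// Illustrative sketch (hypothetical helper, not generated code) of the event-by-event random
// color choice (#402) implemented in both branches above, including why the channelId != 0
// guard of #783 matters: when no channel is selected, targetamp[ncolor-1] can stay 0 and the
// division below would raise the FPE that the guard avoids. Here 'allowed' stands in for
// mgOnGpu::icolamp[channelIdC].
#include <vector>
inline int selectColorSketch( double rndcol,                     // random number in [0,1)
                              const std::vector<double>& jamp2,  // color weights |jamp|^2 per color
                              const std::vector<bool>& allowed ) // colors allowed for this channel
{
  const int ncolor = (int)jamp2.size();
  std::vector<double> targetamp( ncolor, 0. );
  for( int icolC = 0; icolC < ncolor; icolC++ )
  {
    targetamp[icolC] = ( icolC == 0 ? 0. : targetamp[icolC - 1] ); // running (cumulative) sum
    if( allowed[icolC] ) targetamp[icolC] += jamp2[icolC];         // add only the allowed colors
  }
  for( int icolC = 0; icolC < ncolor; icolC++ )
    if( rndcol < targetamp[icolC] / targetamp[ncolor - 1] ) // compare to normalized cumulative weight
      return icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
  return ncolor; // unreachable for rndcol in [0,1) with at least one allowed color
}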
targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + fptype_sv targetamp2[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp2[icolC] = fptype_sv{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + } +#endif + for( int ieppV = 0; ieppV < neppV; ++ieppV ) + { + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { #if defined MGONGPU_CPPSIMD - const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); #endif - if( okcol ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( okcol ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + const int ievt2 = ievt00 + ieppV + neppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + { + allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #endif + } } #endif // multichannel enabled (random color choice) } diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h index be0f0bc396..fff95b66e2 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc index 1bad694d1c..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc @@ -29,7 +29,9 @@ #include #include +#include // for feenableexcept #include +#include // for signal and SIGFPE #include #include #include @@ -74,6 +76,23 @@ usage( char* argv0, int ret = 1 ) return ret; } +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + inline void FPEhandler( int sig ) + { +#ifdef __CUDACC__ + std::cerr << "Floating Point Exception (GPU)" << std::endl; +#else + std::cerr << "Floating Point Exception (CPU)" << std::endl; +#endif + exit( 0 ); + } +} + int main( int argc, char** argv ) { @@ -84,6 +103,18 @@ main( int argc, char** argv ) using namespace mg5amcCpu; #endif + // Enable FPEs (test #701 and #733 - except on MacOS where feenableexcept is not defined #730) +#ifndef __APPLE__ + const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" ); + const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" ); + if( enableFPE ) + { + std::cout << "WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions" << std::endl; + feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 + signal( SIGFPE, FPEhandler ); + } +#endif + // DEFAULTS FOR COMMAND LINE ARGUMENTS bool verbose = false; bool debug = false; @@ -103,12 +134,14 @@ main( int argc, char** argv ) CurandHost = 1, CurandDevice = 2 }; -#ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU -#elif not defined MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#ifdef MGONGPU_HAS_NO_CURAND + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random +#elif defined __CUDACC__ + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU + RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) 
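// Standalone example (illustrative, outside the full check_sa.cc harness) of the FPE trapping
// enabled above for tests #701 and #733: feenableexcept is a glibc extension, hence the MacOS
// exclusion (#730). The environment variable name is the one introduced in this patch.
#include <fenv.h>   // for feenableexcept (glibc extension)
#include <csignal>  // for signal and SIGFPE
#include <cstdlib>  // for getenv and exit
#include <iostream>
int main()
{
#ifndef __APPLE__
  const char* enableFPEc = std::getenv( "CUDACPP_RUNTIME_ENABLEFPE" );
  if( enableFPEc && *enableFPEc )
  {
    feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // unmask these FPEs
    signal( SIGFPE, []( int ) { std::cerr << "Floating Point Exception" << std::endl; std::exit( 0 ); } );
  }
#endif
  volatile double zero = 0.;
  volatile double bad = 1. / zero; // with the variable set, this division triggers the handler
  (void)bad;
  return 0;
}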
enum class RamboSamplingMode @@ -146,18 +179,20 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef __CUDACC__ - rndgen = RandomNumberMode::CurandDevice; +#ifndef __CUDACC__ + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); +#elif defined MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else - throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); + rndgen = RandomNumberMode::CurandDevice; #endif } else if( arg == "--curhst" ) { -#ifndef MGONGPU_HAS_NO_CURAND - rndgen = RandomNumberMode::CurandHost; -#else +#ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); +#else + rndgen = RandomNumberMode::CurandHost; #endif } else if( arg == "--common" ) @@ -278,10 +313,10 @@ main( int argc, char** argv ) const std::string procKey = "0a ProcInit"; timermap.start( procKey ); - // Create a process object + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? (for instance, in bridge mode this will be called twice here?) CPPProcess process( verbose ); - - // Read param_card and set parameters process.initProc( "../../Cards/param_card.dat" ); const fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) //const fptype energy = 91.2; // Ecms = 91.2 GeV (Z peak) @@ -389,30 +424,26 @@ main( int argc, char** argv ) { prnk.reset( new CommonRandomNumberKernel( hstRndmom ) ); } -#ifndef MGONGPU_HAS_NO_CURAND else if( rndgen == RandomNumberMode::CurandHost ) { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! CurandHost is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#else const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif } -#ifdef __CUDACC__ else { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __CUDACC__ const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); - } #else - else - { - throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) - } + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif -#else - else - { - throw std::logic_error( "This application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) } -#endif // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -747,7 +778,7 @@ main( int argc, char** argv ) wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? 
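// Compact restatement (illustrative only, with a hypothetical sketch name) of the new
// random-number defaults and guards above: builds without curand support only the common
// random generator (PR #784 and #785); HIP builds must already have MGONGPU_HAS_NO_CURAND
// set (enforced by the #error above); CUDA builds default to CurandDevice; curand-enabled
// CPU builds default to CurandHost.
enum class RandomNumberModeSketch { CommonRandom = 0, CurandHost = 1, CurandDevice = 2 };
constexpr RandomNumberModeSketch defaultRndgen()
{
#ifdef MGONGPU_HAS_NO_CURAND
  return RandomNumberModeSketch::CommonRandom; // the only supported mode without curand
#elif defined __CUDACC__
  return RandomNumberModeSketch::CurandDevice; // default on NVidia GPU if the build has curand
#else
  return RandomNumberModeSketch::CurandHost;   // default on CPU if the build has curand
#endif
}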
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -757,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index 59a2c906eb..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -4,10 +4,13 @@ # Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) -#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories +#=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts +#=== NB: use 'override' to ensure that the value can not be modified from the outside +override CUDACPP_MAKEFILE := $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) +###$(info CUDACPP_MAKEFILE='$(CUDACPP_MAKEFILE)') -CUDACPP_MAKEFILE = $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) -CUDACPP_SRC_MAKEFILE = cudacpp_src.mk +#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories +override CUDACPP_SRC_MAKEFILE = cudacpp_src.mk #------------------------------------------------------------------------------- @@ -29,7 +32,17 @@ UNAME_P := $(shell uname -p) #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Include the common MG5aMC Makefile options + +# OM: this is crucial for MG5aMC flag consistency/documentation +# AV: temporarely comment this out because it breaks cudacpp builds +ifneq ($(wildcard ../../Source/make_opts),) +include ../../Source/make_opts +endif + +#------------------------------------------------------------------------------- + +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here @@ -101,68 +114,85 @@ endif # Note: AR, CXX and FC are implicitly defined if not set externally # See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html -#------------------------------------------------------------------------------- - -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +CXXFLAGS += -mmacosx-version-min=11.3 +endif - # If CUDA_HOME is not set, try to set it from the location of nvcc - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif +#------------------------------------------------------------------------------- - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
-    CUOPTFLAGS = -lineinfo
-    GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
-    ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
-    ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
-    GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h
-    # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
-    ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
-    ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
-    ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
-    ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
-    CUBUILDRULEFLAGS = -Xcompiler -fPIC -c
-    CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu
-    CUDATESTFLAGS = -lcuda
-  else ifneq ($(origin REQUIRE_CUDA),undefined)
-    # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443)
-    $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH))
+#=== Configure the GPU compiler (CUDA or HIP)
+
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME.
+# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc.
+# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths.
+# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505)
+# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below
+ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
+  $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)")
+  override CUDA_HOME=disabled
+  override HIP_HOME=disabled
+endif
+
+# If CUDA_HOME is not set, try to set it from the path to nvcc
+ifndef CUDA_HOME
+  CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
+  $(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
+endif
+
+# If HIP_HOME is not set, try to set it from the path to hipcc
+# (NB: mirror the nvcc logic above; HIP_COMPILER_PATH is removed by this patch, so it can no longer be referenced here)
+ifndef HIP_HOME
+  HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null))
+  $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+endif
+
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP),
+# builds are performed for HIP only if CUDA is not found in the path.
+# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME.
+# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+#--- Option 1: CUDA exists -> use CUDA
+
+# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists
+ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
+
+  GPUCC = $(CUDA_HOME)/bin/nvcc
+  USE_NVTX ?=-DUSE_NVTX
+  # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
+  # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+  # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster).
+ # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + ifeq ($(RNDGEN),hasNoCurand) + CURANDLIBFLAGS= else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! endif + CUOPTFLAGS = -lineinfo + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -173,71 +203,55 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) GPUFLAGS += -allow-unsupported-compiler endif -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler +else ifneq ($(origin REQUIRE_CUDA),undefined) - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning HIP builds are not supported for multi-word CXX "$(CXX)") - override HIP_HOME=disabled - endif + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif +#--- Option 2: CUDA does not exist, HIP exists -> use HIP - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + +else + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) export GPUCC export GPUFLAGS #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -254,7 +268,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -270,7 +284,7 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) GPUFLAGS+= -Xcompiler -mno-float128 endif @@ -285,12 +299,14 @@ override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) -else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) -override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) # AV for Mac (Apple clang compiler) +else ifeq ($(UNAME_S),Darwin) # OM for Mac (any compiler) +override OMPFLAGS = # AV disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = -fopenmp # OM reenable OpenMP MT on Apple clang? 
(AV Oct 2023: this still fails in the CI) else -override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT (default before #575) +override OMPFLAGS = -fopenmp # enable OpenMP MT by default on all other platforms +###override OMPFLAGS = # disable OpenMP MT on all other platforms (default before #575) endif # Set the default AVX (vectorization) choice @@ -356,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -573,8 +589,9 @@ $(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) # Apply special build flags only to check_sa and CurandRandomNumberKernel (curand headers, #679) $(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gcheck_sa.o: CUFLAGS += $(CXXFLAGSCURAND) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) +$(BUILDDIR)/gCurandRandomNumberKernel.o: CUFLAGS += $(CXXFLAGSCURAND) ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif @@ -772,12 +789,18 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif +# Use target gtestlibs to build only googletest +ifneq ($(GTESTLIBS),) +gtestlibs: $(GTESTLIBS) +endif + # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 $(GTESTLIBS): ifneq ($(shell which flock 2>/dev/null),) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi flock $(BUILDDIR)/.make_test.lock $(MAKE) -C $(TESTDIR) else - $(MAKE) -C $(TESTDIR) + if [ -d $(TESTDIR) ]; then $(MAKE) -C $(TESTDIR); fi endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc index 2b956730d4..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc @@ -49,11 +49,7 @@ extern "C" #ifdef MGONGPUCPP_GPUIMPL GpuRuntime::setUp(); #endif - // Create a process object, read parm card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
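A note on the fbridge.cc hunk being rewritten here: once CPPProcess::initProc moves into the Bridge constructor, the Fortran-facing entry point reduces to allocating the bridge behind an opaque handle. A minimal, self-contained sketch of that lifecycle follows (illustrative names and values, not the plugin's exact API; the real Bridge is additionally templated on the Fortran floating-point type):

#include <stdexcept>

class Bridge
{
public:
  Bridge( unsigned int nevt, unsigned int npar, unsigned int np4 )
    : m_nevt( nevt )
  {
    if( npar != s_npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" );
    if( np4 != s_np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" );
    // reading the param card (initProc) now happens here, once per bridge
  }
private:
  static constexpr unsigned int s_npar = 4; // illustrative value only
  static constexpr unsigned int s_np4 = 4;  // four momentum components: E, px, py, pz
  unsigned int m_nevt;
};

extern "C"
{
  // Fortran passes arguments by reference and keeps the bridge as an opaque handle
  void fbridgecreate_( Bridge** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
  {
    *ppbridge = new Bridge( *pnevtF, *pnparF, *pnp4F );
  }

  void fbridgedelete_( Bridge** ppbridge )
  {
    delete *ppbridge;
  }
}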
- CPPProcess process( /*verbose=*/false ); - process.initProc( "../../Cards/param_card.dat" ); + // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran *ppbridge = new Bridge( *pnevtF, *pnparF, *pnp4F ); } diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc index 0ed26180ca..de327f2321 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc @@ -71,6 +71,8 @@ struct CPUTest : public CUDA_CPU_TestBase , hstSelCol( nevt ) , hstIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? process.initProc( "../../Cards/param_card.dat" ); } @@ -183,6 +185,8 @@ struct CUDATest : public CUDA_CPU_TestBase , devSelCol( nevt ) , devIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? process.initProc( "../../Cards/param_card.dat" ); } diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc index 016bc0f472..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc @@ -59,7 +59,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) using namespace mg5amcCpu; #endif #ifndef __APPLE__ // test #701 (except on MacOS where feenableexcept is not defined #730) - const bool enableFPE = !getenv( "CUDACPP_RUNTIME_DISABLEFPE" ); + const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" ); + const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" ); if( enableFPE ) { feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 diff --git a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h index 459f21394d..8b4ad719be 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -863,6 +863,7 @@ namespace mg5amcCpu const fptype allV2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) ALWAYS_INLINE; //-------------------------------------------------------------------------- @@ -873,6 +874,7 @@ namespace mg5amcCpu VVV1P0_1( const fptype allV2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) ALWAYS_INLINE; @@ -886,6 +888,7 @@ namespace mg5amcCpu const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) ALWAYS_INLINE; //-------------------------------------------------------------------------- @@ -896,6 +899,7 @@ namespace mg5amcCpu FFV1_1( const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allF1[] ) ALWAYS_INLINE; @@ -908,6 +912,7 @@ namespace mg5amcCpu FFV1_2( const fptype allF1[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M2, const fptype W2, fptype allF2[] ) ALWAYS_INLINE; @@ -920,6 +925,7 @@ namespace mg5amcCpu FFV1P0_3( const fptype allF1[], const fptype allF2[], const fptype allCOUP[], + const double Ccoeff, const fptype M3, const fptype W3, fptype allV3[] ) ALWAYS_INLINE; @@ -934,6 +940,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) ALWAYS_INLINE; //-------------------------------------------------------------------------- @@ -945,6 +952,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) ALWAYS_INLINE; @@ -959,6 +967,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) ALWAYS_INLINE; //-------------------------------------------------------------------------- @@ -970,6 +979,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) ALWAYS_INLINE; @@ -984,6 +994,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) ALWAYS_INLINE; //-------------------------------------------------------------------------- @@ -995,6 +1006,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) ALWAYS_INLINE; @@ -1008,6 +1020,7 @@ namespace mg5amcCpu const fptype allV2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) { mgDebug( 0, __FUNCTION__ ); @@ -1042,6 +1055,7 @@ namespace mg5amcCpu VVV1P0_1( const fptype allV2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) @@ -1080,6 +1094,7 @@ namespace mg5amcCpu const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) { mgDebug( 0, __FUNCTION__ ); @@ -1103,6 +1118,7 @@ namespace mg5amcCpu FFV1_1( const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double 
Ccoeff, const fptype M1, const fptype W1, fptype allF1[] ) @@ -1134,6 +1150,7 @@ namespace mg5amcCpu FFV1_2( const fptype allF1[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M2, const fptype W2, fptype allF2[] ) @@ -1165,6 +1182,7 @@ namespace mg5amcCpu FFV1P0_3( const fptype allF1[], const fptype allF2[], const fptype allCOUP[], + const double Ccoeff, const fptype M3, const fptype W3, fptype allV3[] ) @@ -1197,6 +1215,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) { mgDebug( 0, __FUNCTION__ ); @@ -1225,6 +1244,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) @@ -1260,6 +1280,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) { mgDebug( 0, __FUNCTION__ ); @@ -1288,6 +1309,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) @@ -1323,6 +1345,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) { mgDebug( 0, __FUNCTION__ ); @@ -1351,6 +1374,7 @@ namespace mg5amcCpu const fptype allV3[], const fptype allV4[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc index 05eba20217..067445b198 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h index 41830f87ca..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk index f2804ffb85..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk @@ -36,6 +36,13 @@ endif # See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html ###RANLIB = ranlib +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +LDFLAGS = +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +CXXFLAGS += -mmacosx-version-min=11.3 +LDFLAGS += -mmacosx-version-min=11.3 +endif + #------------------------------------------------------------------------------- #=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) @@ -266,11 +273,11 @@ endif ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(CXX) -shared -o $@ $(cxx_objects) + $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h index 205accb85b..da4ba36ad8 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h @@ -15,7 +15,6 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip -#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif @@ -24,16 +23,19 @@ // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For CUDA, by default, it is supported -// For HIP, by default, it is not supported -// For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -#ifdef __CUDACC__ -#undef MGONGPU_HAS_NO_CURAND -#elif defined __HIPCC__ +// For HIP, by default, do not use curand (common random numbers will be used instead) +// For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND +// (there exist CUDA installations, e.g. 
using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ #define MGONGPU_HAS_NO_CURAND 1 #else +//#ifdef __CUDACC__ +//#undef MGONGPU_HAS_NO_CURAND // default +////#define MGONGPU_HAS_NO_CURAND 1 +//#else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 +//#endif #endif // Choose floating point precision (for everything but color algebra #537) diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h index 46d9f02733..5532e22fa1 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h @@ -159,6 +159,12 @@ namespace mg5amcCpu return cxsmpl( a, 0 ) * b; } + inline __host__ __device__ constexpr cxsmpl + operator*( const cxsmpl& a, const double& b ) + { + return a * cxsmpl( b, 0 ); + } + template inline __host__ __device__ constexpr cxsmpl operator/( const cxsmpl& a, const cxsmpl& b ) diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 2c0e77fafd..d46fea2318 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005677223205566406  +DEBUG: model prefixing takes 0.005537271499633789  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.080 s +8 processes with 40 diagrams generated in 0.078 s Total: 8 processes with 40 diagrams output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -198,7 +198,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
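An aside before the generated-code logs resume: the mgOnGpuCxtypes.h hunk above adds a mixed-precision overload because the new Ccoeff arguments are plain double, while amplitudes may be cxsmpl<float> in single-precision builds. A self-contained sketch of the idea (illustrative type, not the plugin's full header):

template<typename FP>
struct cxsmpl
{
  FP r, i; // real and imaginary parts
  constexpr cxsmpl( FP r_, FP i_ = 0 ) : r( r_ ), i( i_ ) {}
};

// complex * complex (same precision)
template<typename FP>
constexpr cxsmpl<FP> operator*( const cxsmpl<FP>& a, const cxsmpl<FP>& b )
{
  return cxsmpl<FP>( a.r * b.r - a.i * b.i, a.r * b.i + a.i * b.r );
}

// complex * double: convert the scalar once, then reuse the overload above
template<typename FP>
constexpr cxsmpl<FP> operator*( const cxsmpl<FP>& a, const double& b )
{
  return a * cxsmpl<FP>( static_cast<FP>( b ), 0 );
}

// e.g. cxsmpl<float> amp{ 1.f, 2.f }; auto scaled = amp * 0.5; // stays cxsmpl<float>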
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -215,7 +215,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -230,17 +230,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -Generated helas calls for 2 subprocesses (10 diagrams) in 0.032 s -Wrote files for 32 helas calls in 0.231 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s +Wrote files for 32 helas calls in 0.219 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.364 s +ALOHA: aloha creates 2 routines in 0.145 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.137 s +ALOHA: aloha creates 4 routines in 0.136 s FFV1 FFV1 FFV1 @@ -294,10 +294,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.934s -user 0m1.748s -sys 0m0.220s -Code generation completed in 3 seconds +real 0m1.960s +user 0m1.693s +sys 0m0.242s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -323,7 +323,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -353,7 +353,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) 
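The Bridge constructor hunk above ends on its grid-sizing constraint: nevt must factor exactly into gpublocks * gputhreads. Sketched standalone below (mirroring the logic around this hunk; the minimum block size of 16 is an illustrative stand-in for s_gputhreadsmin):

#include <stdexcept>
#include <string>
#include <utility>

// Choose (gpublocks, gputhreads) such that nevt == gpublocks * gputhreads exactly
inline std::pair<unsigned int, unsigned int> chooseGpuGrid( unsigned int nevt, unsigned int gputhreadsmin = 16 )
{
  if( ( nevt < gputhreadsmin ) || ( nevt % gputhreadsmin != 0 ) )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( gputhreadsmin ) );
  unsigned int gputhreads = 256; // default number of gpu threads per block
  unsigned int gpublocks = nevt / gputhreads;
  while( nevt != gpublocks * gputhreads )
  {
    gputhreads /= 2; // halve the block size until nevt divides exactly
    gpublocks = nevt / gputhreads;
  }
  return std::make_pair( gpublocks, gputhreads );
}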
@@ -249,7 +249,7 @@ namespace mg5amcCpu
 #else
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
     m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
     // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads?
@@ -262,7 +262,7 @@ namespace mg5amcCpu
     process.initProc( paramCard );
   }

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads )
   {
@@ -276,7 +276,7 @@ namespace mg5amcCpu
   }
 #endif

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta,
                                             const FORTRANFPTYPE* gs,
@@ -291,14 +291,14 @@ namespace mg5amcCpu
     constexpr int neppM = MemoryAccessMomenta::neppM;
     if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
     {
-      checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
     }
     else
     {
-      checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
       const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
       //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
-      dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+      gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
     }
     if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
     {
@@ -341,7 +341,7 @@ namespace mg5amcCpu
   }
 #endif

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta,
                                             const FORTRANFPTYPE* gs,
@@ -396,7 +396,7 @@ namespace mg5amcCpu
   // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
   //

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<typename Tin, typename Tout>
   __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
   {
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc
index d58066c9c1..eaf4037a24 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc
@@ -1,17 +1,18 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
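The Bridge.h hunk above is representative of the whole migration: raw cudaMemcpy calls and <<<...>>> launches are funneled through gpu* aliases, so one source tree serves both vendors. A condensed sketch of what such an abstraction header provides (simplified; the actual GpuAbstraction.h hunk appears later in this patch, and the exact macro spellings there may differ):

#ifdef __CUDACC__
#define gpuError_t cudaError_t
#define gpuMemcpy cudaMemcpy
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuPeekAtLastError cudaPeekAtLastError
#define gpuDeviceSynchronize cudaDeviceSynchronize
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#include "hip/hip_runtime.h" // needed for the hip types and the <<<...>>> launch syntax
#define gpuError_t hipError_t
#define gpuMemcpy hipMemcpy
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuPeekAtLastError hipPeekAtLastError
#define gpuDeviceSynchronize hipDeviceSynchronize
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
#endif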
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
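Most hunks in this patch are the same two-line substitution, re-keying the compile-time switch from the CUDA-only __CUDACC__ to MGONGPUCPP_GPUIMPL, which the mgOnGpuConfig.h hunk defines under both nvcc and hipcc. The pattern in miniature:

// One translation unit, compiled twice: once by the GPU compiler (macro set)
// and once by the host C++ compiler (macro unset). The namespace split keeps
// the two sets of symbols distinct when both objects are linked together.
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  struct EventStatistics { /* identical source for both variants */ };
}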
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h index 9c467b1e04..6a7d9c05c0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h @@ -39,6 +39,8 @@ #elif defined __HIPCC__ +#include "hip/hip_runtime.h" + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
 #ifndef MADGRAPHTEST_H_
 #define MADGRAPHTEST_H_ 1
@@ -22,7 +22,7 @@
 #include
 #include
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using mg5amcGpu::CPPProcess;
 #else
 using mg5amcCpu::CPPProcess;
@@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam
 // Since we link both the CPU-only and GPU tests into the same executable, we prevent
 // a multiply defined symbol by only compiling this in the non-CUDA phase:
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL

 /// Compare momenta and matrix elements.
 /// This uses an implementation of TestDriverBase to run a madgraph workflow,
@@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME )
   }
 }

-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL

 #endif /* MADGRAPHTEST_H_ */
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc
index 74b5239ebf..81699dfea9 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc
@@ -1,12 +1,12 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #include "MatrixElementKernels.h"

 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation
 #include "MemoryAccessMomenta.h"
 #include "MemoryBuffers.h"

@@ -14,7 +14,7 @@

 //============================================================================

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu
 {
@@ -150,7 +150,7 @@ namespace mg5amcCpu

 //============================================================================

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
@@ -209,13 +209,13 @@ namespace mg5amcGpu
     PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
     DeviceBufferHelicityMask devIsGoodHel( ncomb );
     // ... 0d1. Compute good helicity mask on the device
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
+    checkGpu( gpuPeekAtLastError() );
     // ... 0d2. Copy back good helicity mask to the host
     copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
     // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -226,19 +226,19 @@ namespace mg5amcGpu

   void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
   {
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
     constexpr unsigned int sharedMemSize = 0;
 #else
     constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
-    checkCuda( cudaDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() );
+    checkGpu( gpuDeviceSynchronize() );
   }

   //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h
index 23e84757a2..72bd8f195b 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
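The computeMatrixElements hunk above settles on a uniform idiom: launch through the abstraction macro, peek immediately for launch errors, then synchronize to surface asynchronous execution errors. A self-contained illustration in CUDA spelling (under hipcc the gpu* aliases map to the hip* equivalents; checkGpuSketch is an illustrative stand-in for the plugin's checkGpu):

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

__global__ void dummyKernel() {}

inline void checkGpuSketch( cudaError_t code )
{
  if( code != cudaSuccess )
  {
    std::printf( "GPU error: %s\n", cudaGetErrorString( code ) );
    std::exit( EXIT_FAILURE );
  }
}

int main()
{
  dummyKernel<<<1, 32>>>();
  checkGpuSketch( cudaPeekAtLastError() );   // catches launch-configuration errors
  checkGpuSketch( cudaDeviceSynchronize() ); // catches errors raised during execution
  return 0;
}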
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
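The KernelAccessHelper hunk above shows the recurring pattern behind the MGONGPUCPP_GPUIMPL guard: in GPU builds each thread derives its event index from the grid coordinates, while in C++ builds the caller iterates over events explicitly. A self-contained sketch of the convention (illustrative only; it assumes, as the generated code does elsewhere, that __global__ is defined away in CPU builds):

// Sketch of the one-source-two-builds convention used throughout these files.
#ifndef MGONGPUCPP_GPUIMPL
#define __global__ // assumption: CPU builds strip the CUDA/HIP qualifier
#endif
typedef double fptype; // stand-in for the configurable floating-point type
__global__ void scaleEnergies( fptype* allE
#ifndef MGONGPUCPP_GPUIMPL
                               , const int nevt // #events (GPU: nevt == gpublocks*gputhreads)
#endif
)
{
#ifdef MGONGPUCPP_GPUIMPL
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // event index == thread index in grid
  allE[ievt] *= 2;
#else
  for( int ievt = 0; ievt < nevt; ievt++ ) allE[ievt] *= 2; // explicit event loop on the host
#endif
}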
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
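The neppM comment above is the heart of the momenta layout: an AOSOA with neppM events per "page" keeps the same four-momentum component of consecutive events contiguous, so a warp reading component ip4 of particle ipar touches one coalesced cacheline. A small sketch of the resulting index arithmetic (illustrative; the real code hides this inside ieventAccessRecord and decodeRecord):

// Sketch: AOSOA addressing with neppM events per page, layout [npagM][npar][np4][neppM].
#include <cstddef>
constexpr int np4 = 4;   // E, px, py, pz
constexpr int npar = 5;  // external legs in this process (g q > t t~ q)
constexpr int neppM = 4; // e.g. 32-byte cacheline / sizeof(double)
inline std::size_t momentumIndex( std::size_t ievt, int ipar, int ip4 )
{
  const std::size_t ipagM = ievt / neppM; // page holding this event
  const std::size_t ieppM = ievt % neppM; // event slot within the page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}

With neppM = 1 the same formula degenerates to a plain AOS, which is why the printouts later in this patch report "AOSOA[neppM] == AOS" for that case.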
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class 
HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
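All of the buffer classes above now call gpuMallocHost, gpuFreeHost, gpuMalloc and gpuFree instead of the checkCuda( cuda...( ) ) forms, which implies the wrappers bake the error check in. Their definitions lie outside this excerpt; one plausible mapping onto the vendor APIs, for illustration only:

// Sketch only: a possible GpuAbstraction-style mapping, not the real definitions.
#ifdef __CUDACC__
#define gpuMalloc( ptr, bytes )     checkGpu( cudaMalloc( ptr, bytes ) )
#define gpuFree( ptr )              checkGpu( cudaFree( ptr ) )
#define gpuMallocHost( ptr, bytes ) checkGpu( cudaMallocHost( ptr, bytes ) ) // pinned
#define gpuFreeHost( ptr )          checkGpu( cudaFreeHost( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#elif defined __HIPCC__
#define gpuMalloc( ptr, bytes )     checkGpu( hipMalloc( ptr, bytes ) )
#define gpuFree( ptr )              checkGpu( hipFree( ptr ) )
#define gpuMallocHost( ptr, bytes ) checkGpu( hipHostMalloc( ptr, bytes ) ) // pinned
#define gpuFreeHost( ptr )          checkGpu( hipHostFree( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( hipMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#endif

Keeping the error check inside the macro is what lets the RAII buffer constructors and destructors above stay one-liners.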
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 6242b019fa..a376b0c455 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ d WEIGHTED<=3 @1 // Process: g s > t t~ s WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START 
LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, -1, 1, 1, 1 }, { 1, -1, 1, -1, -1 }, { 1, -1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +819,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +845,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +865,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +879,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index bf037c6c28..ce22572055 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
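The CPPProcess.cc hunks above move the helicity tables and physics parameters into device constant memory through gpuMemcpyToSymbol rather than checkCuda( cudaMemcpyToSymbol( ... ) ). A minimal sketch of how that wrapper could be spelled for the two backends (assumed definitions; note that HIP requires the HIP_SYMBOL adapter around the symbol name):

// Sketch only: illustrative definition of the constant-memory copy wrapper.
#ifdef __CUDACC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) \
  checkGpu( cudaMemcpyToSymbol( symbol, src, bytes ) )
#elif defined __HIPCC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) \
  checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes ) )
#endif

The call sites, such as gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ), then read identically for CUDA and HIP, while C++ builds keep the plain memcpy into file-scope static memory.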
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? 
// [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 90788b2c75..41f17b9fb0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ d~ WEIGHTED<=3 @1 // Process: g s~ > t t~ s~ WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ 
-203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, -1 }, { 1, 1, 1, -1, 1 }, { 1, 1, 1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +819,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +845,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +865,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +879,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index 0f49f5247b..46c4347506 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
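As in the first subprocess, check_sa.cc below swaps the CudaRuntime guard for a GpuRuntime object whose constructor selects the device and whose destructor books the device reset. The class itself is not shown in this excerpt; a rough sketch of such a guard, assuming gpuSetDevice and gpuDeviceReset wrappers in the same spirit as the other gpu* names (both names are assumptions here):

// Sketch only: an RAII guard with the semantics described in the check_sa.cc hunks.
struct GpuRuntime
{
  GpuRuntime( const bool debug = false )
    : m_debug( debug )
  {
    checkGpu( gpuSetDevice( 0 ) ); // pick the first visible device up front
  }
  ~GpuRuntime()
  {
    gpuDeviceReset(); // reset on exit; deliberately unchecked, destructors must not throw
  }
  const bool m_debug;
};

Instantiating it first thing in main(), under the "00 GpuInit" timer key, also keeps the one-off device initialisation cost separate from the kernel timings.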
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? 
// [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@ main( int argc, char** argv )

 // === STEP 0 - INITIALISE

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL

- // --- 00. Initialise cuda
- // Instantiate a CudaRuntime at the beginnining of the application's main to
- // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
- const std::string cdinKey = "00 CudaInit";
+ // --- 00. Initialise GPU
+ // Instantiate a GpuRuntime at the beginning of the application's main.
+ // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+ const std::string cdinKey = "00 GpuInit";
 timermap.start( cdinKey );
- CudaRuntime cudaRuntime( debug );
+ GpuRuntime GpuRuntime( debug );
 #endif

 // --- 0a. Initialise physics process
@@ -325,7 +328,7 @@ main( int argc, char** argv )
 timermap.start( alloKey );

 // Memory buffers for random numbers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferRndNumMomenta hstRndmom( nevt );
 #else
 PinnedHostBufferRndNumMomenta hstRndmom( nevt );
@@ -333,7 +336,7 @@
 #endif

 // Memory buffers for sampling weights
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferWeights hstWeights( nevt );
 #else
 PinnedHostBufferWeights hstWeights( nevt );
@@ -341,7 +344,7 @@
 #endif

 // Memory buffers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferMomenta hstMomenta( nevt );
 #else
 PinnedHostBufferMomenta hstMomenta( nevt );
@@ -349,7 +352,7 @@
 #endif

 // Memory buffers for Gs
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferGs hstGs( nevt );
 #else
 PinnedHostBufferGs hstGs( nevt );
@@ -366,7 +369,7 @@
 }

 // Memory buffers for matrix elements
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferMatrixElements hstMatrixElements( nevt );
 #else
 PinnedHostBufferMatrixElements hstMatrixElements( nevt );
@@ -375,7 +378,7 @@

 // Memory buffers for random numbers for helicity selection
 // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferRndNumHelicity hstRndHel( nevt );
 #else
 PinnedHostBufferRndNumHelicity hstRndHel( nevt );
@@ -384,7 +387,7 @@

 // Memory buffers for random numbers for color selection
 // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferRndNumColor hstRndCol( nevt );
 #else
 PinnedHostBufferRndNumColor hstRndCol( nevt );
@@ -392,7 +395,7 @@
 #endif

 // Memory buffers for helicity selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferSelectedHelicity hstSelHel( nevt );
 #else
 PinnedHostBufferSelectedHelicity hstSelHel( nevt );
@@ -400,7 +403,7 @@
 #endif

 // Memory buffers for color selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferSelectedColor hstSelCol( nevt );
 #else
 PinnedHostBufferSelectedColor hstSelCol( nevt );
@@ -438,7 +441,7 @@
 const bool onDevice = true;
 prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) );
 #else
- throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
+ throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
 #endif
 }
@@ -450,7 +453,7 @@
 }
 else
 {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) );
 #else
 throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
@@ -461,7 +464,7 @@
 std::unique_ptr<MatrixElementKernelBase> pmek;
 if( !bridge )
 {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) );
 #else
 pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -469,7 +472,7 @@
 }
 else
 {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) );
 #else
 pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -511,7 +514,7 @@
 prnk->generateRnarray();
 //std::cout << "Got random numbers" << std::endl;
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
 {
 // --- 1c. Copy rndmom from host to device
@@ -543,7 +546,7 @@
 prsk->getMomentaFinal();
 //std::cout << "Got final momenta" << std::endl;
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 if( rmbsmp == RamboSamplingMode::RamboDevice )
 {
 // --- 2c. CopyDToH Weights
@@ -588,7 +591,7 @@
 dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F();
 }
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 // --- 2d. CopyHToD Momenta
 const std::string gKey = "0.. CpHTDg";
 rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER!
@@ -617,7 +620,7 @@
 wv3atime += timermap.stop(); // calc only
 wavetime += wv3atime; // calc plus copy
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 if( !bridge )
 {
 // --- 3b. CopyDToH MEs
@@ -760,18 +763,22 @@
 rndgentxt = "CURAND DEVICE";
 #ifdef __CUDACC__
 rndgentxt += " (CUDA code)";
+#elif defined __HIPCC__
+ rndgentxt += " (HIP code)";
 #else
 rndgentxt += " (C++ code)";
 #endif

 // Workflow description summary
 std::string wrkflwtxt;
- // -- CUDA or C++?
+ // -- CUDA or HIP or C++?
 #ifdef __CUDACC__
 wrkflwtxt += "CUD:";
+#elif defined __HIPCC__
+ wrkflwtxt += "HIP:";
 #else
 wrkflwtxt += "CPP:";
-#endif
+#endif /* clang-format off */
 // -- DOUBLE or FLOAT?
 #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
 wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537)
@@ -781,7 +788,7 @@
 wrkflwtxt += "FLT+";
 #else
 wrkflwtxt += "???+"; // no path to this statement
-#endif
+#endif /* clang-format on */
 // -- CUCOMPLEX or THRUST or STD complex numbers?
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl
 #endif
 << "\"Curand generation\": "
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc
index da68aa9255..79abbcc4f8 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

 #include "RamboSamplingKernels.h"

-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessRandomNumbers.h"
 #include "MemoryAccessWeights.h"
@@ -14,7 +14,7 @@

 #include

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
 #else
namespace mg5amcCpu
@@ -92,7 +92,7 @@ namespace mg5amcCpu

 //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy
 const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1]
 BufferMomenta& momenta, // output: momenta
@@ -135,7 +135,7 @@

 //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 __global__ void
 getMomentaInitialDevice( const fptype energy,
 fptype* momenta )
@@ -147,17 +147,17 @@

 //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 void
 RamboSamplingKernelDevice::getMomentaInitial()
 {
- getMomentaInitialDevice<<<m_gpublocks, m_gputhreads>>>( m_energy, m_momenta.data() );
+ gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() );
 }
 #endif

 //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 __global__ void
 getMomentaFinalDevice( const fptype energy,
 const fptype* rndmom,
@@ -171,11 +171,11 @@

 //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 void
 RamboSamplingKernelDevice::getMomentaFinal()
 {
- getMomentaFinalDevice<<<m_gpublocks, m_gputhreads>>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() );
+ gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() );
 }
 #endif
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h
index 184089efd7..7c214cd74b 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
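// NB: in RamboSamplingKernels.cc above, the explicit CUDA triple-chevron launches
// "kernel<<<blocks, threads>>>( args )" are replaced by the portable gpuLaunchKernel
// wrapper. The wrapper itself is defined in the new GPU abstraction headers, which are
// not shown in this excerpt; one possible shape (a sketch only, the real implementation
// could equally be a macro):
//
//   template<typename Kernel, typename... Args>
//   void gpuLaunchKernel( Kernel kernel, int blocks, int threads, Args... args )
//   {
//   #ifdef __CUDACC__
//     kernel<<<blocks, threads>>>( args... ); // CUDA launch syntax
//   #elif defined __HIPCC__
//     hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, args... ); // HIP launch API
//   #endif
//   }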
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

 # Dependency on src directory
 MG5AMC_COMMONLIB = mg5amc_common
@@ -121,24 +121,46 @@ endif

 #-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler
+#=== Configure the GPU compiler (CUDA or HIP)

-# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505)
-# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME.
+# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc.
+# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths.
+# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505)
+# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below
 ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
-  $(warning CUDA builds are not supported for multi-word CXX "$(CXX)")
+  $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)")
   override CUDA_HOME=disabled
+  override HIP_HOME=disabled
 endif

-# If CUDA_HOME is not set, try to set it from the location of nvcc
+# If CUDA_HOME is not set, try to set it from the path to nvcc
 ifndef CUDA_HOME
   CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
   $(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
 endif

-# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists
+# If HIP_HOME is not set, try to set it from the path to hipcc
+ifndef HIP_HOME
+  HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH))
+  $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+endif
+
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP),
+# builds are performed for HIP only if CUDA is not found in the path.
+# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME.
+# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+#--- Option 1: CUDA exists -> use CUDA
+
+# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists
 ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
-  NVCC = $(CUDA_HOME)/bin/nvcc
+
+  GPUCC = $(CUDA_HOME)/bin/nvcc
   USE_NVTX ?=-DUSE_NVTX
   # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
   # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
@@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
   CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+  HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+  HIPINC = -I$(HIP_HOME)/include/
+  # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP
+  # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+  GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+  ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+  GPUFLAGS += -std=c++17
+  ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?)
+  CUBUILDRULEFLAGS = -fPIC -c
+  CCBUILDRULEFLAGS = -fPIC -c
+
+else ifneq ($(origin REQUIRE_HIP),undefined)
+
+  # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443)
+  $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
+
+#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP
+
 else
-  # No cuda. Switch cuda compilation off and go to common random numbers in C++
+
+  # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++
   $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
+  $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
-  override NVCC=
+  override GPUCC=
   override USE_NVTX=
   override CUINC=
   override CURANDLIBFLAGS=
-endif
-export NVCC
-export CUFLAGS
-
-# Set the host C++ compiler for nvcc via "-ccbin "
-# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
-# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-CUFLAGS += -allow-unsupported-compiler
 endif

+# Export GPUCC (so that it can also be used in cudacpp_src.mk?)
+export GPUCC
+export GPUFLAGS
+
 #-------------------------------------------------------------------------------

-#=== Configure ccache for C++ and CUDA builds
+#=== Configure ccache for C++ and CUDA/HIP builds

 # Enable ccache if USECCACHE=1
 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -201,15 +260,15 @@ endif
 #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
 #  override AR:=ccache $(AR)
 #endif
-ifneq ($(NVCC),)
-  ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
-    override NVCC:=ccache $(NVCC)
+ifneq ($(GPUCC),)
+  ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
+    override GPUCC:=ccache $(GPUCC)
   endif
 endif

 #-------------------------------------------------------------------------------

-#=== Configure PowerPC-specific compiler flags for C++ and CUDA
+#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP

 # PowerPC-specific CXX compiler flags (being reviewed)
 ifeq ($(UNAME_P),ppc64le)
@@ -225,9 +284,9 @@ else
   ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto)
 endif

-# PowerPC-specific CUDA compiler flags (to be reviewed!)
+# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
 $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
- $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+ $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
endif

# Use target gtestlibs to build only googletest
@@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1)
 ccache --version | head -1
endif
 @echo ""
- @echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
- $(NVCC) --version
+ @echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+ $(GPUCC) --version
endif
 @echo ""
 @echo CXX=$(CXX)
@@ -850,7 +916,7 @@ endif

# Target: check (run the C++ test executable)
# [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
check: runTest cmpFcheck cmpFGcheck
else
check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

 extern "C"
 {
@@ -22,7 +22,7 @@ extern "C"
   * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
   * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
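   *
   * A minimal sketch of the lifecycle seen from the C++ side (an illustration only;
   * the Fortran caller passes all arguments by reference, and the buffer arguments
   * are abbreviated here):
   *
   *   CppObjectInFortran* pbridge = nullptr;
   *   fbridgecreate_( &pbridge, &nevtF, &nparF, &np4F );   // GpuRuntime::setUp() on GPU builds
   *   fbridgesequence_( &pbridge, momenta, gs, rndhel, rndcol, &channelId, mes, selhel, selcol );
   *   fbridgedelete_( &pbridge );                          // GpuRuntime::tearDown() on GPU builds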
*/
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using namespace mg5amcGpu;
#else
 using namespace mg5amcCpu;
@@ -46,8 +46,8 @@ extern "C"
 */
 void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
 {
-#ifdef __CUDACC__
- CudaRuntime::setUp();
+#ifdef MGONGPUCPP_GPUIMPL
+ GpuRuntime::setUp();
 #endif
 // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor)
 // FIXME: disable OMP in Bridge when called from Fortran
@@ -65,8 +65,8 @@ extern "C"
 Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
 if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" );
 delete pbridge;
-#ifdef __CUDACC__
- CudaRuntime::tearDown();
+#ifdef MGONGPUCPP_GPUIMPL
+ GpuRuntime::tearDown();
 #endif
 }

@@ -96,7 +96,7 @@ extern "C"
 {
 Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
 if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 // Use the device/GPU implementation in the CUDA library
 // (there is also a host implementation in this library)
 pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol );
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc
index 2fb445372d..3743934f41 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #include "mgOnGpuConfig.h"

@@ -13,7 +13,7 @@

 //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -40,7 +40,7 @@ namespace mg5amcCpu
 private:
 const int m_nevt; // The number of events in each iteration
 int m_iiter; // The iteration counter (for random number seeding)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
 HostBufferMomenta m_hstMomenta; // Memory buffers for momenta
 HostBufferWeights m_hstWeights; // Memory buffers for sampling weights
@@ -105,7 +105,7 @@ namespace mg5amcCpu

 extern "C"
 {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using namespace mg5amcGpu;
 #else
 using namespace mg5amcCpu;
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc
index d4a760a71b..de327f2321 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h index cd4e6de668..45000c7246 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc index c06dcbb252..8b92ea0bd6 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. 
Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h index a6eb185434..a3615ec77a 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

 #-------------------------------------------------------------------------------

@@ -45,13 +45,13 @@ endif

 #-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)

-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))

 #-------------------------------------------------------------------------------

-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)

 # Enable ccache if USECCACHE=1
 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
 ###$(info OMPFLAGS=$(OMPFLAGS))
 CXXFLAGS += $(OMPFLAGS)

+# Add the correct build rule flags when compiling with nvcc (CUDA) or hipcc (HIP)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
 # Set the build flags appropriate to each AVX choice (example: "make AVX=none")
 # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
 # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)

 # Generic target and build rules: objects from CUDA compilation
 $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@

 #-------------------------------------------------------------------------------

 cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
 endif

 # Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
 	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
 else
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
 	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h
index 80032e528b..55d03f1252 100644
--- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
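// NB: the mgOnGpuConfig.h changes below make the complex-number choice platform-aware:
// THRUST, CUCOMPLEX or CXSMPL on CUDA, CXSMPL as the only option on HIP, and STDCOMPLEX
// or CXSMPL in C++. A sketch of how these macros typically select the concrete type
// downstream (the actual typedef lives in a separate cxtype header, not shown here;
// the precision-dependent CUCOMPLEX case is omitted for brevity):
//
//   #if defined MGONGPU_CUCXTYPE_THRUST
//   typedef thrust::complex<fptype> cxtype; // CUDA default
//   #elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
//   typedef mgOnGpu::cxsmpl<fptype> cxtype; // only option on HIP, new default in C++
//   #elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
//   typedef std::complex<fptype> cxtype; // older C++ alternative
//   #endif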
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } 
-#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gq_ttq.mad/src/rambo.h b/epochX/cudacpp/gq_ttq.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/rambo.h +++ b/epochX/cudacpp/gq_ttq.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index f659f6bb8d..93395d9159 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0054836273193359375  +DEBUG: model prefixing takes 0.005349397659301758  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. 
INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.080 s +8 processes with 40 diagrams generated in 0.077 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT @@ -211,7 +211,7 @@ Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.146 s +ALOHA: aloha creates 2 routines in 0.142 s FFV1 FFV1 FFV1 @@ -227,7 +227,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m0.709s -user 0m0.586s -sys 0m0.064s -Code generation completed in 0 seconds +real 0m0.645s +user 0m0.577s +sys 0m0.059s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
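Note on the gpu* wrappers used throughout the Bridge.h hunks above and below: every direct CUDA runtime call (checkCuda( cudaMemcpy( ... ) ), kernel<<<blocks, threads>>> launches) is replaced by a compiler-neutral gpu* name supplied by the new GpuAbstraction.h / GpuRuntime.h headers, which are not reproduced in this excerpt. What follows is only a minimal sketch of the kind of macro layer such a header could provide; it is an assumption-based illustration, not the shipped header, and every name not visible in the hunks is hypothetical.

// Hypothetical sketch only, NOT the actual GpuAbstraction.h (not shown in this excerpt).
// A macro layer of roughly this shape makes the gpu* calls in these hunks resolve to the
// CUDA runtime under nvcc and to the HIP runtime under hipcc.
#include <cassert>
#ifdef __CUDACC__
#include <cuda_runtime.h>
#define gpuSuccess cudaSuccess
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuPeekAtLastError cudaPeekAtLastError
#define gpuDeviceSynchronize cudaDeviceSynchronize
#define checkGpu( code ) assert( ( code ) == gpuSuccess ) // simplified error check
#define gpuMemcpy( dst, src, bytes, dir ) checkGpu( cudaMemcpy( dst, src, bytes, dir ) )
#define gpuMalloc( pptr, bytes ) checkGpu( cudaMalloc( pptr, bytes ) )
#define gpuMallocHost( pptr, bytes ) checkGpu( cudaMallocHost( pptr, bytes ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, shm, ... ) kernel<<<( blocks ), ( threads ), ( shm )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuSuccess hipSuccess
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuPeekAtLastError hipPeekAtLastError
#define gpuDeviceSynchronize hipDeviceSynchronize
#define checkGpu( code ) assert( ( code ) == gpuSuccess ) // simplified error check
#define gpuMemcpy( dst, src, bytes, dir ) checkGpu( hipMemcpy( dst, src, bytes, dir ) )
#define gpuMalloc( pptr, bytes ) checkGpu( hipMalloc( pptr, bytes ) )
#define gpuMallocHost( pptr, bytes ) checkGpu( hipHostMalloc( pptr, bytes ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
#define gpuLaunchKernel( kernel, blocks, threads, ... ) hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, shm, ... ) hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), ( shm ), 0, __VA_ARGS__ )
#endif

With a layer of this shape the same source tree compiles under both nvcc and hipcc: CUDA keeps the native <<<...>>> launch syntax, while HIP goes through hipLaunchKernelGGL.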
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3.
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer 
: public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
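Note on the new gpu* calls above: gpuMallocHost, gpuFreeHost, gpuMalloc, gpuFree and friends are resolved by the new GpuRuntime.h/GpuAbstraction.h headers rather than by the deleted CudaRuntime.h. A minimal sketch of the mapping these hunks rely on (illustrative only: the exact header contents are not shown in this patch, and checkGpu is a hypothetical error-checking wrapper in the spirit of the old checkCuda):

// Sketch of a GpuAbstraction.h-style mapping (assumed, not verbatim from this patch)
#if defined __CUDACC__ or defined __HIPCC__
#define MGONGPUCPP_GPUIMPL // defined for any GPU build (CUDA or HIP), replacing the CUDA-only __CUDACC__ tests
#endif
#ifdef __CUDACC__ // NVidia build: map the portable gpu* names onto the CUDA runtime API
#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( cudaMemcpyToSymbol( sym, src, bytes ) )
#elif defined __HIPCC__ // AMD build: map the same gpu* names onto the HIP runtime API
#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) )
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( hipMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( sym ), src, bytes ) )
#endif

With a mapping of this kind the buffer classes above stay runtime-agnostic: PinnedHostBufferBase and DeviceBufferBase compile unchanged for CUDA and HIP, and only the preprocessor guards (MGONGPUCPP_GPUIMPL instead of __CUDACC__) select between host-only and GPU builds.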
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc index 90e90b3aa9..c1543791ca 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ d WEIGHTED<=3 @1 // Process: g s > t t~ s WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < 
nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -336,7 +337,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -393,7 +394,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -452,7 +453,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -515,8 +516,8 @@ namespace mg5amcCpu { 1, -1, 1, 1, 1 }, { 1, -1, 1, -1, -1 }, { 1, -1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -557,9 +558,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -596,7 +597,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -661,12 +662,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -687,7 +688,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -813,9 +814,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -839,7 +840,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -859,7 +860,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -873,9 +874,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -903,7 +907,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1113,7 +1117,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h index bf037c6c28..ce22572055 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? 
// [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR!
CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index 76c9403933..a9294d1fea 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ d~ WEIGHTED<=3 @1 // Process: g s~ > t t~ s~ WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both 
dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -336,7 +337,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -393,7 +394,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -452,7 +453,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -515,8 +516,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, -1 }, { 1, 1, 1, -1, 1 }, { 1, 1, 1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -557,9 +558,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -596,7 +597,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -661,12 +662,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -687,7 +688,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -813,9 +814,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -839,7 +840,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -859,7 +860,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -873,9 +874,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -903,7 +907,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1113,7 +1117,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h index 0f49f5247b..46c4347506 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? 
// [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR!
CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
              //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl
 #endif
              << "\"Curand generation\": "
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc
index da68aa9255..79abbcc4f8 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

 #include "RamboSamplingKernels.h"

-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessRandomNumbers.h"
 #include "MemoryAccessWeights.h"
@@ -14,7 +14,7 @@

 #include

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -92,7 +92,7 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy,               // input: energy
                                                         const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1]
                                                         BufferMomenta& momenta,            // output: momenta
@@ -135,7 +135,7 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   __global__ void
   getMomentaInitialDevice( const fptype energy,
                            fptype* momenta )
@@ -147,17 +147,17 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   void RamboSamplingKernelDevice::getMomentaInitial()
   {
-    getMomentaInitialDevice<<<m_gpublocks, m_gputhreads>>>( m_energy, m_momenta.data() );
+    gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() );
   }
 #endif

   //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   __global__ void
   getMomentaFinalDevice( const fptype energy,
                          const fptype* rndmom,
@@ -171,11 +171,11 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   void RamboSamplingKernelDevice::getMomentaFinal()
   {
-    getMomentaFinalDevice<<<m_gpublocks, m_gputhreads>>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() );
+    gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() );
   }
 #endif

diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h
index 184089efd7..7c214cd74b 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
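
// The gpuLaunchKernel calls introduced above replace the CUDA-only <<<blocks,threads>>>
// launch syntax with one spelling that works for both backends. A sketch of what such a
// wrapper can look like, assuming a template implementation (the real definition belongs
// to the GPU abstraction headers of this patch series and may instead be a macro):
#ifdef __CUDACC__
template<typename... Args>
void gpuLaunchKernel( void ( *kernel )( Args... ), const int blocks, const int threads, Args... args )
{
  kernel<<<blocks, threads>>>( args... ); // native CUDA triple-chevron launch
}
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
template<typename... Args>
void gpuLaunchKernel( void ( *kernel )( Args... ), const int blocks, const int threads, Args... args )
{
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, args... ); // 0 shared mem, default stream
}
#endif
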
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
 $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
 	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
 else # link both runTest.o and runTest_cu.o
 $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
 endif

 # Use target gtestlibs to build only googletest
@@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1)
 	ccache --version | head -1
 endif
 	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
 endif
 	@echo ""
 	@echo CXX=$(CXX)
@@ -850,7 +916,7 @@ endif

 # Target: check (run the C++ test executable)
 # [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 check: runTest cmpFcheck cmpFGcheck
 else
 check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

 extern "C"
 {
@@ -22,7 +22,7 @@ extern "C"
    * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
    * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
 */
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
  using namespace mg5amcGpu;
 #else
  using namespace mg5amcCpu;
 #endif

@@ -46,8 +46,8 @@ extern "C"
    */
   void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
   {
-#ifdef __CUDACC__
-    CudaRuntime::setUp();
+#ifdef MGONGPUCPP_GPUIMPL
+    GpuRuntime::setUp();
 #endif
     // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor)
     // FIXME: disable OMP in Bridge when called from Fortran
@@ -65,8 +65,8 @@ extern "C"
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" );
     delete pbridge;
-#ifdef __CUDACC__
-    CudaRuntime::tearDown();
+#ifdef MGONGPUCPP_GPUIMPL
+    GpuRuntime::tearDown();
 #endif
   }

@@ -96,7 +96,7 @@ extern "C"
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
     pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol );
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc
index 2fb445372d..3743934f41 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #include "mgOnGpuConfig.h"

@@ -13,7 +13,7 @@

 //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -40,7 +40,7 @@ namespace mg5amcCpu
   private:
     const int m_nevt; // The number of events in each iteration
     int m_iiter;      // The iteration counter (for random number seeding)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
     HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
     HostBufferMomenta m_hstMomenta;      // Memory buffers for momenta
     HostBufferWeights m_hstWeights;      // Memory buffers for sampling weights
@@ -105,7 +105,7 @@ namespace mg5amcCpu

 extern "C"
 {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   using namespace mg5amcGpu;
 #else
   using namespace mg5amcCpu;
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc
index d4a760a71b..de327f2321 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
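
// A note on the fbridge entry points shown above: they follow the standard Fortran-to-C
// binding conventions, a trailing underscore on the C symbol and every argument passed as
// a pointer (Fortran passes by reference). A self-contained sketch of the convention,
// using a hypothetical function that is not part of this patch:
#include <stdexcept>
// Callable from Fortran as "CALL MYSUM(RES, A, B)"
extern "C" void mysum_( double* res, const double* a, const double* b )
{
  if( !res || !a || !b ) throw std::runtime_error( "mysum_: null argument" ); // defensive check
  *res = *a + *b; // write the result through the pointer, much as fbridgesequence_ fills the ME array
}
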
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h index cd4e6de668..45000c7246 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc index c06dcbb252..8b92ea0bd6 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h index a6eb185434..a3615ec77a 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

 #-------------------------------------------------------------------------------
@@ -45,13 +45,13 @@ endif

 #-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)

-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))

 #-------------------------------------------------------------------------------

-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)

 # Enable ccache if USECCACHE=1
 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
 ###$(info OMPFLAGS=$(OMPFLAGS))
 CXXFLAGS += $(OMPFLAGS)

+# Add correct -DHIP_PLATFORM when compiling for HIP
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
 # Set the build flags appropriate to each AVX choice (example: "make AVX=none")
 # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
 # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)

 # Generic target and build rules: objects from CUDA compilation
 $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@

 #-------------------------------------------------------------------------------

 cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
 endif

 # Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
 	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
 else
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
 	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
index b247654dcf..da4ba36ad8 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
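
// Before the mgOnGpuConfig.h changes below, it may help to recall how the FPTYPE=d/f/m
// build flags handled in the makefiles above reach the code: they become the
// MGONGPU_FPTYPE(2)_* macros, which this header turns into the fptype/fptype2 typedefs.
// A condensed sketch of that mapping (the real header carries more options and checks):
namespace mgOnGpu
{
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef double fptype; // ME computations in double ('d' and mixed 'm' builds)
#elif defined MGONGPU_FPTYPE_FLOAT
  typedef float fptype; // ME computations in single ('f' builds)
#endif
#if defined MGONGPU_FPTYPE2_DOUBLE
  typedef double fptype2; // color algebra in double ('d' builds)
#elif defined MGONGPU_FPTYPE2_FLOAT
  typedef float fptype2; // single precision color algebra ('f' and 'm' builds, see #537)
#endif
}
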
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit)
 #endif

@@ -145,7 +168,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;

 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -175,9 +198,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
 #if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -189,8 +212,8 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */

-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
 #define __global__
 #define __host__
 #define __device__
diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h
index ca9a9f00c0..5532e22fa1 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
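
// The empty declaration specifiers defined at the end of the mgOnGpuConfig.h hunk above
// are what allow headers like this one to annotate functions unconditionally: in a plain
// C++ build the specifiers expand to nothing. A minimal sketch (square4 is a hypothetical
// helper, not part of the patch):
#ifndef MGONGPUCPP_GPUIMPL // plain C++ build: the GPU specifiers vanish
#define __host__
#define __device__
#endif
// one definition serves both the device build (CUDA/HIP) and the host build (C++)
__host__ __device__ inline double square4( const double x ) { return x * x * x * x; }
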
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif 
// #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gq_ttq.sa/src/rambo.h b/epochX/cudacpp/gq_ttq.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/rambo.h +++ b/epochX/cudacpp/gq_ttq.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 800492306f..c56a4ed162 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -153,7 +153,7 @@ Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines -ALOHA: aloha creates 1 routines in 0.062 s +ALOHA: aloha creates 1 routines in 0.065 s VVS3 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. @@ -165,7 +165,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. quit -real 0m0.471s -user 0m0.367s -sys 0m0.052s +real 0m0.431s +user 0m0.373s +sys 0m0.054s Code generation completed in 0 seconds diff --git a/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT b/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT +++ b/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
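[Note on the recurring idiom in the Bridge.h hunks below: the MGONGPUCPP_GPUIMPL guard selects the namespace at compile time, so one translation unit yields mg5amcGpu symbols in CUDA/HIP builds and mg5amcCpu symbols in C++ builds. A minimal self-contained sketch, with a hypothetical backendName() helper standing in for the real classes:]

// Sketch of the namespace-selection idiom used throughout these headers.
// MGONGPUCPP_GPUIMPL is assumed defined by the build for CUDA *and* HIP builds,
// where the older code tested the CUDA-only __CUDACC__ macro.
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  inline const char* backendName() // hypothetical example function
  {
#ifdef MGONGPUCPP_GPUIMPL
    return "GPU (CUDA or HIP)";
#else
    return "CPU (C++, possibly SIMD)";
#endif
  }
}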
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
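[The gpuMemcpy/gpuLaunchKernel calls introduced in the hunks below come from the new GpuAbstraction.h header, which is not itself part of this excerpt. A sketch of plausible definitions consistent with those call sites, assuming the standard CUDA runtime under nvcc and HIP under hipcc, with checkGpu error checking omitted for brevity:]

// Sketch only: one plausible shape of GpuAbstraction.h behind the call sites below.
#ifdef __CUDACC__
#define gpuMemcpy cudaMemcpy
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#include "hip/hip_runtime.h"
#define gpuMemcpy hipMemcpy
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif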
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
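[BridgeKernels.h above keeps a host class and a device class behind one base interface, compiled in mutually exclusive MGONGPUCPP_GPUIMPL blocks, so callers only ever see the variant matching the build. A simplified sketch of that split (hypothetical KernelBase/computeMatrixElements names; the real classes also take memory buffers in their constructors):]

// Sketch of the compile-time host/device class split used in BridgeKernels.h.
class KernelBase
{
public:
  virtual ~KernelBase() {}
  virtual void computeMatrixElements() = 0;
};
#ifndef MGONGPUCPP_GPUIMPL
class KernelHost final : public KernelBase
{
public:
  void computeMatrixElements() override { /* C++/SIMD loop over events */ }
};
#else
class KernelDevice final : public KernelBase
{
public:
  void computeMatrixElements() override { /* gpuLaunchKernel( ... ) */ }
};
#endif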
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
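[checkGpu and gpuLaunchKernelSharedMem, used in the sigmaKin launches above, are also supplied by the abstraction layer. A sketch modelled on the assertCuda helper deleted from CudaRuntime.h earlier in this patch, written with CUDA spellings (a HIP build would substitute hipError_t, hipGetErrorString, hipSuccess):]

// Sketch only: backend-neutral error check plus shared-memory launch wrapper.
#include <cassert>
#include <cstdio>
inline void assertGpu( cudaError_t code, const char* file, int line, bool abort = true )
{
  if( code != cudaSuccess )
  {
    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
    if( abort ) assert( code == cudaSuccess );
  }
}
#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
// The sigmaKin launch with a dynamic shared-memory size would then expand as:
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  kernel<<<( blocks ), ( threads ), ( sharedMem )>>>( __VA_ARGS__ )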
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h index d65c9d6e04..85c3c9ed1c 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h 
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
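[The one-event-per-thread indexing visible in the MemoryAccessHelpers.h hunk above is the essential difference between the two access modes: on the GPU each thread derives its own event index from the grid, while on the CPU an explicit event loop supplies it. A minimal sketch of the two paths (hypothetical scaleMEs example):]

// Sketch: how an event index is obtained in each backend.
#ifdef MGONGPUCPP_GPUIMPL
__global__ void scaleMEs( double* allMEs, double factor )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
  allMEs[ievt] *= factor;
}
#else
void scaleMEs( double* allMEs, double factor, int nevt )
{
  for( int ievt = 0; ievt < nevt; ievt++ ) // explicit event loop on the CPU
    allMEs[ievt] *= factor;
}
#endif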
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
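[The neppM constant discussed in MemoryAccessMomenta.h above fixes the AOSOA layout momenta[npagM][npar][np4][neppM]: choosing neppM as a power of two spanning a 32-byte cacheline keeps global-memory loads coalesced on GPUs. A worked sketch of the index arithmetic (names follow the comment above; the helper itself is illustrative, not the library's API):]

// Sketch of AOSOA indexing for momenta[npagM][npar][np4][neppM].
// With fptype=double and a 32-byte cacheline, neppM = 32/8 = 4 is one natural choice.
constexpr int np4 = 4;   // E, px, py, pz
constexpr int npar = 3;  // e.g. g g > h has three external particles
constexpr int neppM = 4; // events per page (assumption for this example)
inline int momentaIndex( int ievt, int ipar, int ip4 )
{
  const int ipagM = ievt / neppM; // page holding this event
  const int ieppM = ievt % neppM; // position of the event inside the page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}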
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h index 8109470148..78004e66cc 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_heft.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events 
template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
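[gpuMalloc/gpuFree and gpuMallocHost/gpuFreeHost in the buffer classes above again come from the abstraction header. A standalone RAII sketch in the spirit of DeviceBufferBase, assuming the macros expand to cudaMalloc/cudaFree under nvcc (or hipMalloc/hipFree under hipcc):]

// Sketch only: RAII device buffer analogous to DeviceBufferBase above.
#include <cstddef>
#include <cstdlib>
template<typename T>
class DeviceArray
{
public:
  explicit DeviceArray( std::size_t size ) : m_size( size ), m_data( nullptr )
  {
    // gpuMalloc in the patch; abort on failure for brevity instead of checkGpu
    if( cudaMalloc( (void**)&m_data, bytes() ) != cudaSuccess ) std::abort();
  }
  ~DeviceArray() { cudaFree( m_data ); } // gpuFree in the patch
  DeviceArray( const DeviceArray& ) = delete;
  DeviceArray& operator=( const DeviceArray& ) = delete;
  T* data() { return m_data; }
  std::size_t bytes() const { return m_size * sizeof( T ); }
private:
  const std::size_t m_size;
  T* m_data;
};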
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc index 526bd7d296..3b6085c784 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_heft.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu //__device__ const fptype* cIPD = nullptr; // unused as nparam=0 __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //__device__ __constant__ fptype* cIPD = nullptr; // unused as nparam=0 __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace 
mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -268,7 +269,7 @@ namespace mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 2 } }; // 2-D array[1][1] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -325,7 +326,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -384,7 +385,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -419,8 +420,8 @@ namespace mg5amcCpu { -1, 1, 0 }, { 1, -1, 0 }, { 1, 1, 0 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -459,9 +460,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory //const fptype tIPD[0] = { ... }; // nparam=0 //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - //checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 0 * sizeof( fptype ) ) ); // nparam=0 - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + //gpuMemcpyToSymbol( cIPD, tIPD, 0 * sizeof( fptype ) ); // nparam=0 + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else //memcpy( cIPD, tIPD, 0 * sizeof( fptype ) ); // nparam=0 //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -495,7 +496,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -560,12 +561,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -586,7 +587,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -712,9 +713,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -738,7 +739,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -758,7 +759,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -772,9 +773,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -802,7 +806,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1012,7 +1016,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h index dbc5aa0e4e..e1caef360b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? 
// [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR!
CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
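# Usage sketch (illustrative commands, not part of the generated makefile): the
# Option 1-3 backend selection above is driven entirely by CUDA_HOME and HIP_HOME,
# so with both toolkits installed CUDA wins unless CUDA_HOME is made invalid:
#   CUDA_HOME=disabled HIP_HOME=/opt/rocm make   # force the hipcc (Option 2) branch
#   REQUIRE_CUDA=1 make                          # CI guard: fail if nvcc is not found (#443)
#   REQUIRE_HIP=1 make                           # CI guard: fail if hipcc is not found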
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc index a1c3cdc238..688cb8167b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h index eae9ff5242..dbff117235 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc index e5442756b1..d3d6058b46 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. 
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h index 790485fee0..c2be5bba97 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h @@ -28,7 +28,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -94,7 +94,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -230,7 +230,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -247,7 +247,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -298,7 +298,7 @@ namespace mg5amcCpu // End non-SM (e.g. EFT) implementation - special handling of vectors of floats (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk b/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk index 0bd815c9b3..998d3c84fa 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

#-------------------------------------------------------------------------------

@@ -45,13 +45,13 @@ endif

#-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)

-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))

#-------------------------------------------------------------------------------

-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)

# Enable ccache if USECCACHE=1
ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
###$(info OMPFLAGS=$(OMPFLAGS))
CXXFLAGS += $(OMPFLAGS)

+# Add the correct compiler-specific GPUFLAGS when compiling for CUDA (nvcc) or HIP (hipcc)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
# Set the build flags appropriate to each AVX choice (example: "make AVX=none")
# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
$(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@

#-------------------------------------------------------------------------------

cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_heft.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_objects=$(addprefix $(BUILDDIR)/, Parameters_heft_cu.o)
endif

# Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
else
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h
index b247654dcf..da4ba36ad8 100644
--- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit)
#endif

@@ -145,7 +168,7 @@ using mgOnGpu::fptype;
using mgOnGpu::fptype2;

// C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
#undef MGONGPU_CPPSIMD
#elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
#ifdef MGONGPU_FPTYPE_DOUBLE
@@ -175,9 +198,9 @@ using mgOnGpu::fptype2;
#undef MGONGPU_CPPSIMD
#endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
-#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
+#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -189,8 +212,8 @@ using mgOnGpu::fptype2;
#define mgDebugFinalise() { /*noop*/ }
#endif /* clang-format on */

-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
#define __global__
#define __host__
#define __device__
diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h
index ca9a9f00c0..5532e22fa1 100644
--- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
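
The mgOnGpuConfig.h hunks above introduce the single backend macro that the rest of this patch relies on: __CUDACC__ or __HIPCC__ sets MGONGPUCPP_GPUIMPL, and every source file then selects the mg5amcGpu or mg5amcCpu namespace from it. A condensed, self-contained illustration of the dispatch (the backend() helper and the main() driver are illustrative only, not part of the patch):

  #include <iostream>

  // Backend selection as defined in mgOnGpuConfig.h above
  #ifdef __CUDACC__
  #define MGONGPUCPP_GPUIMPL cuda
  #elif defined __HIPCC__
  #define MGONGPUCPP_GPUIMPL hip
  #endif

  // NB: the same source compiles into different namespaces for GPU and CPU builds
  #ifdef MGONGPUCPP_GPUIMPL
  namespace mg5amcGpu
  #else
  namespace mg5amcCpu
  #endif
  {
    inline const char* backend()
    {
  #ifdef MGONGPUCPP_GPUIMPL
      return "gpu"; // nvcc or hipcc compilation
  #else
      return "cpp"; // plain C++ compilation
  #endif
    }
  }

  int main()
  {
  #ifdef MGONGPUCPP_GPUIMPL
    std::cout << mg5amcGpu::backend() << std::endl;
  #else
    std::cout << mg5amcCpu::backend() << std::endl;
  #endif
    return 0;
  }
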
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu 
return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/heft_gg_h.sa/src/rambo.h b/epochX/cudacpp/heft_gg_h.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/rambo.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index ff161c336f..bfc7dc3052 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005424976348876953  +DEBUG: model prefixing takes 0.005826234817504883  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -172,7 +172,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.030 s +5 processes with 7 diagrams generated in 0.031 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. @@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.139 s +13 processes with 76 diagrams generated in 0.144 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -378,7 +378,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.876 s +65 processes with 1119 diagrams generated in 1.923 s Total: 83 processes with 1202 diagrams output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -497,7 +497,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -514,7 +514,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -531,7 +531,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -548,7 +548,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -565,7 +565,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -582,7 +582,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -599,7 +599,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -616,7 +616,7 @@ INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -633,7 +633,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -650,7 +650,7 @@ INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -667,7 +667,7 @@ INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -684,7 +684,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -701,7 +701,7 @@ INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -718,7 +718,7 @@ INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -735,7 +735,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -752,7 +752,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -769,7 +769,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -786,7 +786,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -801,15 +801,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.312 s -Wrote files for 810 helas calls in 3.308 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.345 s +Wrote files for 810 helas calls in 3.264 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.342 s +ALOHA: aloha creates 5 routines in 0.333 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -817,7 +817,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.321 s +ALOHA: aloha creates 10 routines in 0.310 s VVV1 VVV1 FFV1 @@ -1028,9 +1028,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m9.073s -user 0m8.514s -sys 0m0.464s +real 0m9.088s +user 0m8.481s +sys 0m0.545s Code generation completed in 9 seconds ************************************************************ * * @@ -1057,7 +1057,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -1087,7 +1087,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT b/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT +++ b/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != 
m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
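
In the Bridge.h hunks above, every direct CUDA runtime call becomes a backend-neutral wrapper: checkCuda( cudaMemcpy( ... ) ) turns into gpuMemcpy( ... ), and the kernel<<<blocks, threads>>> launch syntax turns into gpuLaunchKernel( kernel, blocks, threads, ... ). A plausible CUDA-side mapping is sketched below; this is an assumption about GpuAbstraction.h, whose full content is not shown in this excerpt, and a HIP build would map onto hipMemcpy and hipLaunchKernelGGL instead.

  // Sketch of the CUDA branch of such an abstraction header (assumed, not verbatim)
  #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
  #define gpuMemcpy( dst, src, bytes, dir ) checkGpu( cudaMemcpy( dst, src, bytes, dir ) )
  #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )

Expressing the kernel launch as a plain macro call is what lets the same Bridge.h line compile under nvcc, hipcc and plain C++ without scattering <<<...>>> syntax through the sources.
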
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h
similarity index 62%
rename from epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h
rename to epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h
index 64ce52f4b3..93579ef08b 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h
@@ -1,49 +1,50 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
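The new GpuAbstraction.h above reduces every vendor API call to a gpu* macro, so a single translation unit builds unchanged with either nvcc or hipcc. The sketch below is an illustration, not part of the patch: it assumes a GPU build in which MGONGPUCPP_GPUIMPL is defined (so that checkGpu from GpuRuntime.h, just below, is available); the kernel scaleArray and the sizes are invented for the example.

// Illustration only, not repository code
#include "GpuRuntime.h" // pulls in GpuAbstraction.h and defines checkGpu/assertGpu

__global__ void scaleArray( double* data, double factor )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x; // one element per thread
  data[i] *= factor;
}

int main()
{
  const int blocks = 4, threads = 256;
  double* devData = nullptr;
  gpuMalloc( (void**)&devData, blocks * threads * sizeof( double ) ); // cudaMalloc or hipMalloc
  gpuLaunchKernel( scaleArray, blocks, threads, devData, 2.0 );       // scaleArray<<<blocks, threads>>>( devData, 2.0 )
  checkGpu( gpuDeviceSynchronize() );                                 // wait and surface any kernel error
  gpuFree( devData );
  return 0;
}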
-#ifndef MG5AMC_CUDARUNTIME_H
-#define MG5AMC_CUDARUNTIME_H 1
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
 
 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
 // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
 
-#include
+#include "GpuAbstraction.h"
+
 #include
 
 //--------------------------------------------------------------------------
 
 // See https://stackoverflow.com/a/14038590
-#ifdef __CUDACC__ /* clang-format off */
-#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
-inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
 {
-  if( code != cudaSuccess )
+  if( code != gpuSuccess )
   {
-    printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
-    if( abort ) assert( code == cudaSuccess );
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
   }
 }
 #endif /* clang-format on */
 
 //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
   // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
-  struct CudaRuntime final
+  struct GpuRuntime final
   {
-    CudaRuntime( const bool debug = true )
+    GpuRuntime( const bool debug = true )
       : m_debug( debug ) { setUp( m_debug ); }
-    ~CudaRuntime() { tearDown( m_debug ); }
-    CudaRuntime( const CudaRuntime& ) = delete;
-    CudaRuntime( CudaRuntime&& ) = delete;
-    CudaRuntime& operator=( const CudaRuntime& ) = delete;
-    CudaRuntime& operator=( CudaRuntime&& ) = delete;
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
     bool m_debug;
 
     // Set up CUDA application
@@ -62,8 +63,8 @@ namespace mg5amcGpu
     */
     // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
     // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
-    if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl;
-    checkCuda( cudaSetDevice( 0 ) ); // SLOW!
+    if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl;
+    checkGpu( gpuSetDevice( 0 ) ); // SLOW!
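The checkGpu/assertGpu wrapper and the GpuRuntime RAII struct above are designed to be used together: every gpu* allocation or copy macro aborts with a file:line message on failure, and device setup/teardown are tied to the lifetime of a stack object. A minimal sketch of the intended call pattern (illustration only, not repository code; assumes a GPU build defining MGONGPUCPP_GPUIMPL):

// Illustration only, not repository code
#include "GpuRuntime.h"

int main()
{
  mg5amcGpu::GpuRuntime gpuRuntime( /*debug=*/true ); // gpuSetDevice(0) now, gpuDeviceReset() at scope exit
  float* devBuf = nullptr;
  gpuMalloc( (void**)&devBuf, 1000 * sizeof( float ) ); // on failure: assertGpu prints file:line and asserts
  gpuFree( devBuf );
  return 0; // the booked gpuDeviceReset() helps leak checking in cuda-memcheck / compute-sanitizer
}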
 }
 
     // Tear down CUDA application (call cudaDeviceReset)
@@ -72,14 +73,13 @@
     // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
     static void tearDown( const bool debug = true )
     {
-      if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl;
-      checkCuda( cudaDeviceReset() );
+      if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl;
+      checkGpu( gpuDeviceReset() );
     }
   };
-
 }
 #endif
 
 //--------------------------------------------------------------------------
 
-#endif // MG5AMC_CUDARUNTIME_H
+#endif // MG5AMC_GPURUNTIME_H
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h
index ef40624c88..a64c05c26a 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MADGRAPHTEST_H_
 #define MADGRAPHTEST_H_ 1
@@ -22,7 +22,7 @@
 #include
 #include
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using mg5amcGpu::CPPProcess;
 #else
 using mg5amcCpu::CPPProcess;
 #endif
@@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam
 // Since we link both the CPU-only and GPU tests into the same executable, we prevent
 // a multiply defined symbol by only compiling this in the non-CUDA phase:
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 
 /// Compare momenta and matrix elements.
 /// This uses an implementation of TestDriverBase to run a madgraph workflow,
@@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME )
   }
 }
 
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
 
 #endif /* MADGRAPHTEST_H_ */
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc
index 74b5239ebf..81699dfea9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc
@@ -1,12 +1,12 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #include "MatrixElementKernels.h"
 
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation
 #include "MemoryAccessMomenta.h"
 #include "MemoryBuffers.h"
@@ -14,7 +14,7 @@
 
 //============================================================================
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu
 {
@@ -150,7 +150,7 @@ namespace mg5amcCpu
 
 //============================================================================
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
@@ -209,13 +209,13 @@ namespace mg5amcGpu
     PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
     DeviceBufferHelicityMask devIsGoodHel( ncomb );
     // ... 0d1. Compute good helicity mask on the device
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
+    checkGpu( gpuPeekAtLastError() );
     // ... 0d2. Copy back good helicity mask to the host
     copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
     // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -226,19 +226,19 @@
   void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
   {
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
     constexpr unsigned int sharedMemSize = 0;
 #else
     constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
-    checkCuda( cudaDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() );
+    checkGpu( gpuDeviceSynchronize() );
   }
 
   //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
index 23e84757a2..72bd8f195b 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
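These two hunks are the heart of the migration: every CUDA-only triple-chevron launch becomes a gpuLaunchKernel or gpuLaunchKernelSharedMem call, and every checkCuda becomes checkGpu. A sketch of the pattern in isolation (illustration only, not repository code; mulKernel, nblk and ntpb are invented names):

// Illustration only, not repository code
#include "GpuRuntime.h" // gpu* macros and checkGpu, assuming a GPU build

__global__ void mulKernel( const double* in, double* out )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  out[i] = 2.0 * in[i];
}

void runMulKernel( const double* in, double* out, const int nblk, const int ntpb )
{
  // before this patch: mulKernel<<<nblk, ntpb>>>( in, out );
  gpuLaunchKernel( mulKernel, nblk, ntpb, in, out );
  // with dynamic shared memory, as for sigmaKin above:
  //   gpuLaunchKernelSharedMem( mulKernel, nblk, ntpb, sharedMemSize, in, out );
  checkGpu( gpuPeekAtLastError() );   // catches launch-configuration errors
  checkGpu( gpuDeviceSynchronize() ); // surfaces errors from the kernel itself
}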
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
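The KernelAccessHelper change above keeps the established indexing convention under the new macro: in a GPU build each thread handles exactly one event, while the CPU build loops over events (and may process a SIMD page of neppV events per iteration). A reduced sketch of that convention (illustration only, not repository code; assumes mgOnGpuConfig.h defines MGONGPUCPP_GPUIMPL for GPU builds, and the SIMD detail is simplified away):

// Illustration only, not repository code
#include "mgOnGpuConfig.h" // assumed to define MGONGPUCPP_GPUIMPL on GPU builds

#ifdef MGONGPUCPP_GPUIMPL
__global__ void processEvents( double* buffer )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
  buffer[ievt] *= 2.0;
}
#else
void processEvents( double* buffer, const int nevt )
{
  for( int ievt = 0; ievt < nevt; ievt++ ) // all events in one host call
    buffer[ievt] *= 2.0;
}
#endif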
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
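All of the MemoryAccess* headers above repeat the same guard, and the NB comment in each explains why: the plugin compiles the same sources twice, and the namespaces mg5amcGpu and mg5amcCpu may define identically-named types in different ways (see #318 and #725), so keeping them apart lets GPU and CPU objects link into one executable without ODR clashes. A compact sketch of the idiom (illustration only, not repository code, with an invented type):

// Illustration only, not repository code
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  // invented example: the "same" type, defined differently per build,
  // lives in a different namespace in each of the two compilations
  struct EventSlot
  {
#ifdef MGONGPUCPP_GPUIMPL
    double value;    // GPU build: plain scalar, one event per thread
#else
    double value[4]; // CPU build: e.g. a SIMD page of events (illustrative size)
#endif
  };
}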
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of 
events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
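The buffer classes above all follow the same RAII discipline: allocation in the constructor through the gpu* macros (pinned host memory via gpuMallocHost, device memory via gpuMalloc) and release in the destructor, so checkGpu guards every allocation and no call site frees memory by hand. A reduced sketch of the idiom (illustration only, not repository code; the real classes are also templated over size per event and alignment, omitted here):

// Illustration only, not repository code
#include "GpuRuntime.h" // checkGpu + gpu* macros, assuming a GPU build

template<typename T>
class ToyDeviceBuffer
{
public:
  explicit ToyDeviceBuffer( const size_t size )
    : m_size( size ), m_data( nullptr )
  {
    gpuMalloc( (void**)&m_data, bytes() ); // checkGpu aborts on allocation failure
  }
  ~ToyDeviceBuffer() { gpuFree( m_data ); } // no manual cleanup at call sites
  T* data() { return m_data; }
  size_t bytes() const { return m_size * sizeof( T ); }
private:
  const size_t m_size;
  T* m_data;
};

// usage mirroring the copy helpers later in MemoryBuffers.h (host side simplified):
//   ToyDeviceBuffer<double> dev( 1024 );
//   double* hst = new double[1024];
//   gpuMemcpy( hst, dev.data(), dev.bytes(), gpuMemcpyDeviceToHost );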
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc index 7f14b5e299..40d8bdea5f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < 
nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -302,7 +303,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -359,7 +360,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -418,7 +419,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -465,8 +466,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -506,9 +507,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -544,7 +545,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -609,12 +610,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -635,7 +636,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -761,9 +762,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -787,7 +788,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -807,7 +808,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -821,9 +822,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -851,7 +855,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1061,7 +1065,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h index 448175be9d..f8a20b77fc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
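Throughout check_sa.cc the patch now distinguishes three regimes: generic GPU-vs-CPU logic tests MGONGPUCPP_GPUIMPL, while __CUDACC__ and __HIPCC__ are consulted only where vendor behaviour genuinely differs (curand defaults, nvcc pragmas, the CUD:/HIP:/CPP: report labels). The macro's definition is not shown in this patch; presumably it is set centrally, roughly along these lines (assumption for illustration, not repository code):

// Assumption, not shown in this patch: e.g. in mgOnGpuConfig.h
#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1
#endif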
bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
     else if( arg == "--curdev" )
     {
 #ifndef __CUDACC__
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
 #elif defined MGONGPU_HAS_NO_CURAND
       throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
 #else
@@ -198,7 +201,7 @@
     }
     else if( arg == "--rmbdev" )
     {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       rmbsmp = RamboSamplingMode::RamboDevice;
 #else
       throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@
     return usage( argv[0] );
   }
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
   ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
   // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
   // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@
 
   // === STEP 0 - INITIALISE
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 
-  // --- 00. Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime gpuRuntime( debug );
 #endif
 
   // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc index 20496eaa70..5f57cf55f3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ WEIGHTED<=2 // Process: s s~ > t t~ WEIGHTED<=2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 
+202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -279,7 +280,7 @@ namespace mg5amcCpu { 9, 3 }, { 3, 9 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -336,7 +337,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -395,7 +396,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -442,8 +443,8 @@ namespace mg5amcCpu { -1, 1, -1, -1 }, { -1, 1, 1, 1 }, { -1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -483,9 +484,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -521,7 +522,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -586,12 +587,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -612,7 +613,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -738,9 +739,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -764,7 +765,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -784,7 +785,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -798,9 +799,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -828,7 +832,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1038,7 +1042,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h index e166fa1652..6498b91441 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index afeebde3c6..0e4d5d1157 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -505,7 +506,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -562,7 +563,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -621,7 +622,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -684,8 +685,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -726,9 +727,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -765,7 +766,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -830,12 +831,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -856,7 +857,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -982,9 +983,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1008,7 +1009,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1028,7 +1029,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1042,9 +1043,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1072,7 +1076,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1282,7 +1286,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 37d6ebe981..11f562273e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
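Editor's note: the MGONGPU_CUCXTYPE_* / MGONGPU_CPPCXTYPE_* branches below tag which complex-number implementation the build selected. A hedged sketch of the kind of mapping these macros imply; the real selection lives in the mgOnGpu headers and is more involved, and mgOnGpu::cxsmpl is assumed from the codebase, while the rest is standard CUDA/Thrust/C++:

    // Illustrative only: one plausible cxtype selection matching the macros in this patch.
    #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST
    #include <thrust/complex.h>
    typedef thrust::complex<double> cxtype;
    #elif defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX
    #include <cuComplex.h>
    typedef cuDoubleComplex cxtype;
    #elif defined __HIPCC__ and defined MGONGPU_CUCXTYPE_CXSMPL
    typedef mgOnGpu::cxsmpl<double> cxtype; // "CXS:" in-house simple complex class (assumed)
    #else
    #include <complex>
    typedef std::complex<double> cxtype; // "STX:"
    #endif

This also explains why the HIP branch below only admits "CXS:": cuComplex and Thrust are CUDA-side libraries, so HIP builds fall back on the in-house emulation.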
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index b7e3475679..e098c03e3a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ d WEIGHTED<=3 @1 // Process: g s > t t~ s WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ 
-203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, -1, 1, 1, 1 }, { 1, -1, 1, -1, -1 }, { 1, -1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
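Editor's note: the hunks above replace checkCuda( cudaMemcpyToSymbol( ... ) ) with a neutral gpuMemcpyToSymbol( ... ), and elsewhere in this patch each subprocess directory gains a GpuAbstraction.h symlink. A minimal sketch of the kind of mapping such a header can provide, assuming plain CUDA/HIP runtime spellings; the real GpuAbstraction.h may differ in names and error handling:

    // Illustrative excerpt: one neutral gpu* vocabulary over the CUDA and HIP runtimes.
    #if defined __CUDACC__
    #include <cuda_runtime.h>
    #define gpuMemcpyToSymbol( symbol, src, bytes ) cudaMemcpyToSymbol( symbol, src, bytes )
    #define gpuDeviceSynchronize cudaDeviceSynchronize
    #elif defined __HIPCC__
    #include <hip/hip_runtime.h>
    #define gpuMemcpyToSymbol( symbol, src, bytes ) hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes )
    #define gpuDeviceSynchronize hipDeviceSynchronize
    #endif

With this layer in place, generated code like CPPProcess.cc can stay backend-agnostic and only the abstraction header knows about the two runtimes.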
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +819,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +845,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +865,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +879,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index bf037c6c28..ce22572055 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
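Editor's note: throughout these check_sa.cc hunks, each buffer is a plain HostBuffer* type on CPU-only builds and a PinnedHostBuffer* type otherwise, because page-locked host memory speeds up host-device copies; the patch only widens the guard from __CUDACC__ to MGONGPUCPP_GPUIMPL so that HIP builds also get pinned memory. A toy sketch of a pinned buffer, assuming CUDA (the real templated buffer classes in this codebase are richer):

    #include <cuda_runtime.h>
    #include <cstddef>
    #include <stdexcept>

    // Minimal sketch of a pinned (page-locked) host buffer, assuming CUDA spellings.
    template<typename T>
    class PinnedHostBuffer
    {
    public:
      explicit PinnedHostBuffer( std::size_t n ) : m_size( n ), m_data( nullptr )
      {
        if( cudaMallocHost( (void**)&m_data, n * sizeof( T ) ) != cudaSuccess )
          throw std::runtime_error( "cudaMallocHost failed" );
      }
      ~PinnedHostBuffer() { cudaFreeHost( m_data ); }
      T* data() { return m_data; }
      std::size_t size() const { return m_size; }
    private:
      std::size_t m_size;
      T* m_data;
    };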
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
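Editor's note: the "00 GpuInit" hunk earlier in these check_sa.cc changes describes GpuRuntime as an RAII guard, setting the device in the constructor and booking a reset in the destructor. A minimal sketch with CUDA spellings (a HIP build would substitute hipSetDevice/hipDeviceReset; the real GpuRuntime.h added by this patch may differ):

    #include <cuda_runtime.h>
    #include <iostream>
    #include <stdexcept>

    // Illustrative sketch only, not the actual GpuRuntime.h from this patch series.
    class GpuRuntime
    {
    public:
      GpuRuntime( bool debug = false ) : m_debug( debug )
      {
        if( m_debug ) std::cout << "GpuRuntime: setting device 0" << std::endl;
        if( cudaSetDevice( 0 ) != cudaSuccess ) throw std::runtime_error( "cudaSetDevice failed" );
      }
      ~GpuRuntime()
      {
        if( m_debug ) std::cout << "GpuRuntime: resetting device" << std::endl;
        cudaDeviceReset(); // never throw from a destructor
      }
    private:
      bool m_debug;
    };

Declaring one such object at the top of main, as the hunk does, guarantees the device reset runs on every exit path.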
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 0f999663da..7308f8a2c7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ d~ WEIGHTED<=3 @1 // Process: g s~ > t t~ s~ WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, -1 }, { 1, 1, 1, -1, 1 }, { 1, 1, 1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
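Editor's note: the cHel/tIPD hunks above copy host-side tables into __device__ __constant__ arrays via the new gpuMemcpyToSymbol spelling. A self-contained toy version of that pattern, using plain CUDA names and made-up sizes:

    #include <cuda_runtime.h>
    #include <cstdio>

    constexpr int NCOMB = 4; // made-up sizes for illustration only
    constexpr int NPAR = 2;
    __device__ __constant__ short cHel[NCOMB][NPAR];

    __global__ void printFirstHel()
    {
      printf( "cHel[0][0]=%d\n", (int)cHel[0][0] );
    }

    int main()
    {
      const short tHel[NCOMB][NPAR] = { { -1, -1 }, { -1, 1 }, { 1, -1 }, { 1, 1 } };
      cudaMemcpyToSymbol( cHel, tHel, NCOMB * NPAR * sizeof( short ) );
      printFirstHel<<<1, 1>>>();
      cudaDeviceSynchronize();
      return 0;
    }

On the C++ path the same tables are simply memcpy'd into file-scope static arrays, which is why each copy site carries the paired #ifdef/#else seen in these hunks.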
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +819,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +845,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +865,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +879,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index 0f49f5247b..46c4347506 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
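Editor's note: on the C++ path the usage text above points at OMP_NUM_THREADS, and the hunks just below call ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ) before printing omp_get_max_threads(). Judging only from the name and call site, the helper presumably defaults to a single thread when the variable is unset; a guessed sketch:

    #include <cstdlib>
    #include <iostream>
    #ifdef _OPENMP
    #include <omp.h>
    #endif

    // Assumed semantics, inferred from the helper's name: this is not the repository code.
    inline void ompnumthreadsNotSetMeansOneThreadSketch( int verbosity )
    {
    #ifdef _OPENMP
      if( std::getenv( "OMP_NUM_THREADS" ) == nullptr )
      {
        if( verbosity > 0 ) std::cout << "OMP_NUM_THREADS not set: using 1 thread" << std::endl;
        omp_set_num_threads( 1 );
      }
      else if( verbosity > 0 )
      {
        std::cout << "OMP threads = " << omp_get_max_threads() << std::endl;
      }
    #else
      (void)verbosity; // no OpenMP in this build
    #endif
    }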
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
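Editor's note: the hunks above lean on a timer map where start( key ) both opens the named section and returns the time spent in the previous one, as in rambtime += timermap.start( gKey ) and wv3atime += timermap.stop(). A sketch of that assumed semantics (the repository's actual timer class is not shown in this patch):

    #include <chrono>
    #include <map>
    #include <string>

    // Sketch of the assumed TimerMap behaviour: start() closes the running section,
    // returns its elapsed time, and opens the named one.
    class TimerMapSketch
    {
    public:
      float start( const std::string& key )
      {
        const float elapsed = stop(); // close the previous section, if any
        m_current = key;
        m_t0 = std::chrono::high_resolution_clock::now();
        m_running = true;
        return elapsed;
      }
      float stop()
      {
        if( !m_running ) return 0;
        const auto t1 = std::chrono::high_resolution_clock::now();
        const float elapsed = std::chrono::duration<float>( t1 - m_t0 ).count();
        m_totals[m_current] += elapsed;
        m_running = false;
        return elapsed;
      }
    private:
      std::map<std::string, float> m_totals;
      std::string m_current;
      std::chrono::high_resolution_clock::time_point m_t0;
      bool m_running = false;
    };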
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index 87830582d7..b37df5d33f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ g WEIGHTED<=3 @1 // Process: s s~ > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { -1, 1, 1, 1, 1 }, { -1, 1, 1, -1, -1 }, { -1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
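[Editor's note: the checkCuda( cudaMemcpyToSymbol( ... ) ) calls above become plain gpuMemcpyToSymbol( ... ) calls provided by the new GpuAbstraction.h header, whose contents are not shown in this patch. Below is a minimal sketch of the kind of mapping it supplies; checkGpu is a hypothetical error-checking wrapper, not a name taken from the repository.]

// Illustrative sketch only: one gpu* spelling that compiles to the CUDA or the HIP runtime call.
#if defined __CUDACC__
#define gpuMemcpyToSymbol( dst, src, bytes ) checkGpu( cudaMemcpyToSymbol( dst, src, bytes ) )
#elif defined __HIPCC__
#define gpuMemcpyToSymbol( dst, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( dst ), src, bytes ) )
#endif

Centralising these spellings in one header is what lets the generated CPPProcess.cc stay identical across CUDA and HIP builds.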
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +819,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +845,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +865,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +879,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h index f8bdb38aee..fc7c0d8196 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
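[Editor's note: throughout this patch the old #ifdef __CUDACC__ guards become #ifdef MGONGPUCPP_GPUIMPL, meaning "any GPU build" rather than "a CUDA build". The macro is defined in mgOnGpuConfig.h, which is not part of this hunk; the sketch below captures the essential idea but is an assumption and may differ from the real definition in detail.]

// Illustrative sketch only: a single "GPU implementation" switch for both vendor compilers,
// so that most generated code no longer needs to know which toolchain is in use.
#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1
#endif

Guards that genuinely depend on the vendor (curand, the complex-number types, the CUD:/HIP: workflow tags) keep testing __CUDACC__ or __HIPCC__ explicitly, as the hunks in this file show.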
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
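[Editor's note on the MIX+/DBL+/FLT+ tags just above: they encode the floating-point precision of the build. The sketch below shows the two typedefs behind them; the macro names are the ones tested in this patch, while the typedefs are an assumed illustration, not a quote from mgOnGpuFptypes.h.]

// Illustrative sketch only: "MIX" builds keep double precision for the amplitudes
// but run the colour algebra in single precision (issue #537 referenced above).
#if defined MGONGPU_FPTYPE_DOUBLE
typedef double fptype;  // main floating-point type
#elif defined MGONGPU_FPTYPE_FLOAT
typedef float fptype;
#endif
#if defined MGONGPU_FPTYPE2_FLOAT
typedef float fptype2;  // second type: float in MIX and FLT builds
#else
typedef fptype fptype2; // otherwise the same as the main type
#endif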
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc index 9051b3108d..b4df38fb35 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both 
dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -2417,7 +2418,7 @@ namespace mg5amcCpu { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -2474,7 +2475,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -2533,7 +2534,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -2628,8 +2629,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -2671,9 +2672,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -2711,7 +2712,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -2776,12 +2777,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -2802,7 +2803,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -2928,9 +2929,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -2954,7 +2955,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -2974,7 +2975,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -2988,9 +2989,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -3018,7 +3022,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -3228,7 +3232,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h index 9f43559181..511b053c2a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
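[Editor's note: the GpuRuntime object instantiated a few hunks below replaces the CUDA-only CudaRuntime. Its implementation arrives only as a GpuRuntime.h symlink in this patch, so the class below is an assumed sketch of the RAII pattern it stands for; the cudaSetDevice(0)-in-constructor and cudaDeviceReset()-in-destructor behaviour is taken from the patch's own comments, and the HIP calls are the natural equivalents.]

// Illustrative sketch only, not the repository class.
struct GpuRuntime
{
  GpuRuntime( bool debug ) : m_debug( debug )
  {
#if defined __CUDACC__
    cudaSetDevice( 0 ); // grab device 0 up front, before any buffer is allocated
#elif defined __HIPCC__
    hipSetDevice( 0 );  // HIP equivalent
#endif
  }
  ~GpuRuntime()
  {
#if defined __CUDACC__
    cudaDeviceReset(); // booked for the end of main(); also flushes profiler output
#elif defined __HIPCC__
    hipDeviceReset();
#endif
  }
  const bool m_debug; // kept for parity with the debug flag passed in check_sa.cc
};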
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc index 866433ae8b..bc38d1f109 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g g > t t~ d d~ WEIGHTED<=4 @2 // Process: g g > t t~ s s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 +929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, -1 }, { 1, 1, 1, -1, 1, 1 }, { 1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
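// [For reference: a minimal sketch of the guard that this patch substitutes for
// __CUDACC__ throughout, assuming it is defined centrally in mgOnGpuConfig.h;
// the actual definition may carry additional conditions.]
#if defined __CUDACC__ or defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1 // a GPU backend (CUDA or HIP) is being compiled
#endif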
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1171,12 +1172,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1324,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1350,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1370,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1384,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1417,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1627,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h index f26b60c5bb..c411623fc8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
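// [For reference: the RandomNumberMode defaults in the hunk below rely on HIP
// builds never having curand. A sketch of that invariant, assuming it is enforced
// in mgOnGpuConfig.h as the new #error message suggests.]
#if defined __HIPCC__ and !defined MGONGPU_HAS_NO_CURAND
#define MGONGPU_HAS_NO_CURAND 1 // curand is CUDA-only: AMD builds default to CommonRandom
#endif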
   bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
     else if( arg == "--curdev" )
     {
 #ifndef __CUDACC__
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
 #elif defined MGONGPU_HAS_NO_CURAND
       throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
 #else
@@ -198,7 +201,7 @@
     }
     else if( arg == "--rmbdev" )
     {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       rmbsmp = RamboSamplingMode::RamboDevice;
 #else
       throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@ main( int argc, char** argv )
     return usage( argv[0] );
   }
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
   ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
   // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
   // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@ main( int argc, char** argv )
 
   // === STEP 0 - INITIALISE
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 
-  // --- 00. Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime gpuRuntime( debug );
 #endif
 
   // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index 1be98364ee..a17bd3518e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ g d WEIGHTED<=4 @2 // Process: g s > t t~ g s WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { 0, -2, -6, -2, -2, 6, 16, 0, 6, 16, 48, 16 }, { -2, 0, -2, -6, 6, -2, 0, 16, 16, 6, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 +929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { 1, -1, 1, -1, -1, 1 }, { 1, -1, 1, -1, 1, -1 }, { 1, -1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
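// [For reference: a minimal sketch of the GpuAbstraction.h layer behind the
// gpuMemcpyToSymbol calls above. The checkGpu error wrapper and the extra aliases
// are assumptions; the real header maps many more gpu* names onto cuda*/hip* calls.]
#ifdef __CUDACC__
#define gpuMemcpyToSymbol( symbol, src, count ) checkGpu( cudaMemcpyToSymbol( symbol, src, count ) )
#define gpuSetDevice( dev ) checkGpu( cudaSetDevice( dev ) )
#define gpuDeviceReset() checkGpu( cudaDeviceReset() )
#elif defined __HIPCC__
#include "hip/hip_runtime.h"
#define gpuMemcpyToSymbol( symbol, src, count ) checkGpu( hipMemcpyToSymbol( symbol, src, count ) )
#define gpuSetDevice( dev ) checkGpu( hipSetDevice( dev ) )
#define gpuDeviceReset() checkGpu( hipDeviceReset() )
#endif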
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1171,12 +1172,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1324,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1350,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1370,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1384,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1417,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1627,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h index 853175b477..9c820a5ddb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
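// [For reference, ahead of the "00 GpuInit" step below: a minimal sketch of the
// GpuRuntime helper that replaces CudaRuntime, assuming only the behaviour stated
// in the new comments (set device 0 on construction, book a reset for destruction)
// and the gpuSetDevice/gpuDeviceReset aliases sketched earlier; the real
// GpuRuntime.h may differ.]
struct GpuRuntime final
{
  GpuRuntime( const bool debug = true )
    : m_debug( debug ) { gpuSetDevice( 0 ); } // pick the first GPU when main starts
  ~GpuRuntime() { gpuDeviceReset(); }         // the "booked" reset runs when main ends
  GpuRuntime( const GpuRuntime& ) = delete;   // RAII handle: not copyable
  GpuRuntime& operator=( const GpuRuntime& ) = delete;
  const bool m_debug;
};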
   bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
     else if( arg == "--curdev" )
     {
 #ifndef __CUDACC__
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
 #elif defined MGONGPU_HAS_NO_CURAND
       throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
 #else
@@ -198,7 +201,7 @@
     }
     else if( arg == "--rmbdev" )
     {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       rmbsmp = RamboSamplingMode::RamboDevice;
 #else
       throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@ main( int argc, char** argv )
     return usage( argv[0] );
   }
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
   ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
   // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
   // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@ main( int argc, char** argv )
 
   // === STEP 0 - INITIALISE
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 
-  // --- 00. Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime gpuRuntime( debug );
 #endif
 
   // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc index dfb05016f5..6a53d09c8e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ g d~ WEIGHTED<=4 @2 // Process: g s~ > t t~ g s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { -2, 0, 0, 16, -2, -6, 6, -2, 16, 6, 48, 16 }, { 0, -2, 16, 0, -6, -2, -2, 6, 6, 16, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 +929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, -1 }, { 1, 1, 1, -1, 1, 1 }, { 1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
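// A minimal sketch of the GpuAbstraction.h mapping that the gpuMemcpyToSymbol
// calls above rely on, assuming it wraps the CUDA or HIP runtime API with the
// old checkCuda-style error check folded in (the checkGpu helper and the
// HIP_SYMBOL usage are illustrative assumptions; the real header is not shown
// in this hunk):
//
//   #ifdef __CUDACC__
//   #define gpuMemcpyToSymbol( symbol, src, count ) checkGpu( cudaMemcpyToSymbol( symbol, src, count ) )
//   #elif defined __HIPCC__
//   #define gpuMemcpyToSymbol( symbol, src, count ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, count ) )
//   #endif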
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1171,12 +1172,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1324,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1350,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1370,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1384,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1417,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1627,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h index e60cb5b6d7..a5a285b22d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
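// Worked example of how these workflow tags compose (using only the branches
// shown in this patch): a HIP build in double precision with the cxsmpl
// complex emulation starts the string as "HIP:DBL+CXS:", while a plain C++
// build with std::complex starts as "CPP:DBL+STX:"; the random-number,
// sampling and matrix-element tags that follow (e.g. "RMBDEV+", "MESDEV")
// are then appended.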
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc index ecef3e57ca..fedf955b6a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -51,7 +50,7 @@ // Process: c s > t t~ c s WEIGHTED<=4 @2 // Process: d s > t t~ d s WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -85,7 +84,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -95,7 +94,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -123,13 +122,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -156,7 +155,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -192,7 +191,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -205,8 +204,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -387,7 +388,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -444,7 +445,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -503,7 +504,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -598,8 +599,8 @@ namespace mg5amcCpu { -1, -1, 1, -1, -1, 1 }, { -1, -1, 1, -1, 1, -1 }, { -1, -1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -641,9 +642,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -681,7 +682,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
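// On the helicity/colour denominator used below: the final |M|^2 is averaged
// over initial-state spins and colours, so each initial quark contributes
// 2 spins x 3 colours = 6 and each initial gluon 2 spins x 8 colours = 16.
// For u c > t t~ u c this gives 6 x 6 = 36 (the value below), while the
// g u~ > t t~ g u~ process above uses 16 x 6 = 96; there is no extra
// identical-particle factor in either process.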
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -746,12 +747,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -772,7 +773,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -898,9 +899,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -924,7 +925,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -944,7 +945,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -958,9 +959,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -988,7 +992,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1198,7 +1202,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h index 5329710b87..8c84687f8a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -112,7 +112,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -125,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -155,7 +155,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
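// Summary of the default random-number mode per build, from the ladder above:
//   build without curand      -> CommonRandom (the only supported mode)
//   HIP build                 -> compile-time #error unless MGONGPU_HAS_NO_CURAND is set
//   CUDA build with curand    -> CurandDevice
//   C++ build with curand     -> CurandHost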
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
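// Design note on the "Complex type" reporting further below: the nested
// #ifdef __CUDACC__ / #else structure is flattened into a single four-way
// ladder (CUCOMPLEX / THRUST / CXSMPL / fallback), so one block serves CUDA,
// HIP and C++ builds alike, and the explicit "???" fallback makes any
// unforeseen configuration visible in the printout instead of silently
// omitting the field.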
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc index e4f9dee3a2..fc99b3bfae 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -57,7 +56,7 @@ // Process: s c~ > t t~ s c~ WEIGHTED<=4 @2 // Process: s d~ > t t~ s d~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -91,7 +90,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -101,7 +100,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -129,13 +128,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -162,7 +161,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -198,7 +197,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -211,8 +210,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -393,7 +394,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -450,7 +451,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -509,7 +510,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -604,8 +605,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, -1 }, { -1, 1, 1, -1, 1, 1 }, { -1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -647,9 +648,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -687,7 +688,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
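// For context, MGONGPUCPP_GPUIMPL is assumed to be defined in mgOnGpuConfig.h
// whenever a GPU back-end is being compiled, along these (illustrative) lines:
//
//   #if defined __CUDACC__ || defined __HIPCC__
//   #define MGONGPUCPP_GPUIMPL // GPU implementation (CUDA or HIP)
//   #endif
//
// which is why this block tests __NVCC__ (compiler identity) rather than the
// back-end macro when reporting the compiler version.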
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -752,12 +753,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -778,7 +779,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -904,9 +905,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -930,7 +931,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -950,7 +951,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -964,9 +965,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -994,7 +998,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1204,7 +1208,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h index 391789dc81..da747c3465 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -118,7 +118,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -131,7 +131,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -161,7 +161,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 
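//--------------------------------------------------------------------------
// [Editorial note, not part of the patch] The mode-120000 hunks above replace
// the per-subprocess CudaRuntime.h symlink with GpuAbstraction.h and
// GpuRuntime.h symlinks into the parent SubProcesses directory, so all P2_*
// directories share one copy of each header. For reference only, the
// equivalent relinking as a self-contained C++17 sketch:
#include <filesystem>
inline void relinkSharedHeaders() // run inside one P2_* subprocess directory
{
  namespace fs = std::filesystem;
  fs::remove( "CudaRuntime.h" );                                   // the deleted symlink
  fs::create_symlink( "../GpuAbstraction.h", "GpuAbstraction.h" ); // new symlink
  fs::create_symlink( "../GpuRuntime.h", "GpuRuntime.h" );         // new symlink
}
//--------------------------------------------------------------------------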
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
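//--------------------------------------------------------------------------
// [Editorial note, not part of the patch] GpuRuntime.h, included above,
// replaces the deleted CudaRuntime.h; STEP 0 below instantiates it once at
// the top of main(). A minimal sketch of the RAII idiom it provides, written
// with CUDA spellings (a HIP build would use hipSetDevice/hipDeviceReset;
// the class name here is hypothetical):
#include <cuda_runtime.h>
struct GpuRuntimeSketch
{
  GpuRuntimeSketch() { cudaSetDevice( 0 ); }  // grab device 0 on construction
  ~GpuRuntimeSketch() { cudaDeviceReset(); }  // release device resources on destruction
};
//--------------------------------------------------------------------------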
bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
     else if( arg == "--curdev" )
     {
 #ifndef __CUDACC__
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
 #elif defined MGONGPU_HAS_NO_CURAND
       throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
 #else
@@ -198,7 +201,7 @@ main( int argc, char** argv )
     }
     else if( arg == "--rmbdev" )
     {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       rmbsmp = RamboSamplingMode::RamboDevice;
 #else
       throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@ main( int argc, char** argv )
     return usage( argv[0] );
   }
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
   ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
   // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
   // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@ main( int argc, char** argv )
   // === STEP 0 - INITIALISE
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
-  // --- 00. Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime GpuRuntime( debug );
 #endif
   // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
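//--------------------------------------------------------------------------
// [Editorial note, not part of the patch] The branches below tag which
// complex type the build selected; the new __HIPCC__ branch can only report
// CXS, the extended simple complex class that HIP builds use in this
// codebase. The underlying type selection follows the same shape, sketched
// here assuming mgOnGpuCxtypes.h is the home of mgOnGpu::cxsmpl (as the
// MGONGPU_CUCXTYPE_CXSMPL macro suggests):
#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST
#include <thrust/complex.h>
typedef thrust::complex<double> cxtype_sketch;
#elif defined __HIPCC__
#include "mgOnGpuCxtypes.h" // assumed location of mgOnGpu::cxsmpl
typedef mgOnGpu::cxsmpl<double> cxtype_sketch; // the only option under HIP
#else
#include <complex>
typedef std::complex<double> cxtype_sketch;
#endif
//--------------------------------------------------------------------------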
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc index 302d63e31d..97912e5855 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d > t t~ d d WEIGHTED<=4 @2 // Process: s s > t t~ s s WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -497,7 +498,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -554,7 +555,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -613,7 +614,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -708,8 +709,8 @@ namespace mg5amcCpu { -1, -1, 1, -1, -1, 1 }, { -1, -1, 1, -1, 1, -1 }, { -1, -1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -751,9 +752,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -791,7 +792,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
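//--------------------------------------------------------------------------
// [Editorial note, not part of the patch] gpuMemcpyToSymbol, used in the
// hunks above in place of checkCuda( cudaMemcpyToSymbol( ... ) ), comes from
// the GpuAbstraction.h introduced by this patch. That header's contents are
// not shown here; a minimal sketch of the idea, with a hypothetical checkGpu
// error-checking helper:
#ifdef __CUDACC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( cudaMemcpyToSymbol( symbol, src, bytes ) )
#elif defined __HIPCC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( hipMemcpyToSymbol( symbol, src, bytes ) )
#endif
//--------------------------------------------------------------------------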
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -856,12 +857,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -882,7 +883,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1008,9 +1009,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1034,7 +1035,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1054,7 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1068,9 +1069,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1098,7 +1102,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1308,7 +1312,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h index 2d95f4b170..d8232ea652 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
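//--------------------------------------------------------------------------
// [Editorial note, not part of the patch] Further down this file, each host
// buffer switches between HostBuffer* (pageable memory, CPU-only build) and
// PinnedHostBuffer* (page-locked memory) when MGONGPUCPP_GPUIMPL is defined,
// because pinned pages speed up host<->device copies. The allocation
// difference, as a sketch in CUDA spelling (HIP uses hipHostMalloc):
#include <cstddef>
#include <cuda_runtime.h>
template<typename T>
T* allocatePinnedSketch( std::size_t n ) // page-locked host memory
{
  void* ptr = nullptr;
  cudaMallocHost( &ptr, n * sizeof( T ) ); // vs plain 'new T[n]' for pageable memory
  return static_cast<T*>( ptr );
}
//--------------------------------------------------------------------------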
bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
     else if( arg == "--curdev" )
     {
 #ifndef __CUDACC__
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
 #elif defined MGONGPU_HAS_NO_CURAND
       throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
 #else
@@ -198,7 +201,7 @@ main( int argc, char** argv )
     }
     else if( arg == "--rmbdev" )
     {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       rmbsmp = RamboSamplingMode::RamboDevice;
 #else
       throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@ main( int argc, char** argv )
     return usage( argv[0] );
   }
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
   ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
   // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
   // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@ main( int argc, char** argv )
   // === STEP 0 - INITIALISE
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
-  // --- 00. Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime GpuRuntime( debug );
 #endif
   // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
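//--------------------------------------------------------------------------
// [Editorial note, not part of the patch] The CUD:/HIP:/CPP: prefix built
// above is the first field of the workflow summary tag. Its compiler
// detection pattern, restated as a self-contained sketch:
#include <string>
inline std::string backendTagSketch()
{
#ifdef __CUDACC__
  return "CUD:"; // compiled as CUDA
#elif defined __HIPCC__
  return "HIP:"; // compiled as HIP
#else
  return "CPP:"; // plain C++ build
#endif
}
//--------------------------------------------------------------------------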
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc index d0be5131af..be2315b035 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -57,7 +56,7 @@ // Process: s s~ > t t~ c c~ WEIGHTED<=4 @2 // Process: s s~ > t t~ d d~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -91,7 +90,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -101,7 +100,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -129,13 +128,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -162,7 +161,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -198,7 +197,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -211,8 +210,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -393,7 +394,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -450,7 +451,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -509,7 +510,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -604,8 +605,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, -1 }, { -1, 1, 1, -1, 1, 1 }, { -1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -647,9 +648,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -687,7 +688,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
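//--------------------------------------------------------------------------
// [Editorial note, not part of the patch] The cHel/cIPD copies above complete
// a pattern visible throughout these files: data declared
// __device__ __constant__ in GPU builds is filled via gpuMemcpyToSymbol,
// while the C++ build emulates constant memory with file-scope static
// storage filled via plain memcpy. The pattern in isolation (sketch):
#ifdef MGONGPUCPP_GPUIMPL
__device__ __constant__ int cExampleSketch[4]; // device constant memory, set with gpuMemcpyToSymbol
#else
static int cExampleSketch[4]; // C++ emulation, set with memcpy
#endif
//--------------------------------------------------------------------------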
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -752,12 +753,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -778,7 +779,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -904,9 +905,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -930,7 +931,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -950,7 +951,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -964,9 +965,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -994,7 +998,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1204,7 +1208,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h index 14490d782f..71fdc6e547 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -118,7 +118,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -131,7 +131,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -161,7 +161,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
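The MGONGPUCPP_GPUIMPL guard used throughout these hunks comes from the new GPU abstraction layer; GpuAbstraction.h itself enters this patch only as a symlink, so its definitions are not visible here. A plausible minimal sketch, offered as an assumption rather than the project's actual header (the checkHip helper is hypothetical, mirroring the existing checkCuda):

#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL // building a GPU implementation (CUDA or HIP)
#endif
#ifdef __CUDACC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkCuda( cudaMemcpyToSymbol( symbol, src, bytes ) )
#elif defined __HIPCC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkHip( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes ) )
#endif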
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
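In the complex-number ladder that follows, the patch adds a dedicated __HIPCC__ branch: CUDA builds may use the cuComplex or thrust::complex wrappers, HIP builds only handle the simple-complex class (MGONGPU_CUCXTYPE_CXSMPL, tagged "CXS:"), and host builds fall back to std::complex ("STX:"). Schematically:

#ifdef __CUDACC__
// NVidia: MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_THRUST
#elif defined __HIPCC__
// AMD: only MGONGPU_CUCXTYPE_CXSMPL is handled, hence wrkflwtxt += "CXS:";
#else
// host: e.g. MGONGPU_CPPCXTYPE_STDCOMPLEX, hence wrkflwtxt += "STX:";
#endif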
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index 3a2178d534..c83b7be449 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ g g WEIGHTED<=4 @2 // Process: s s~ > t t~ g g WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { 16, -2, 0, 0, 0, 0, -2, 16, 16, 6, 48, 16 }, { 0, 0, 16, -2, -2, 16, 0, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 +929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, 1 }, { -1, 1, 1, -1, 1, -1 }, { -1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1171,12 +1172,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1324,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1350,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1370,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1384,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1417,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1627,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h index 1543c29649..e9a24f516d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
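In the STEP 0 hunk below, the CudaRuntime object becomes a GpuRuntime which, per the rewritten comment, calls cudaSetDevice(0) at construction and books a cudaDeviceReset() for destruction. A minimal sketch of the CUDA path of such a RAII class; the real GpuRuntime.h is added as a symlink and not shown in this patch, and a HIP path would presumably mirror it with the hip* equivalents:

struct GpuRuntime
{
  GpuRuntime( const bool debug = true ) : m_debug( debug ) { checkCuda( cudaSetDevice( 0 ) ); } // select the first device
  ~GpuRuntime() { cudaDeviceReset(); } // "booked" at construction, runs automatically when main() exits
  const bool m_debug;
};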
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
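The HostBuffer/PinnedHostBuffer split in the allocation hunks above always follows one pattern: a GPU build pairs a page-locked host buffer with a device buffer (e.g. the devMomenta passed to RamboSamplingKernelDevice), while a CPU build uses a single pageable host buffer. A condensed sketch of the pattern; the buffer classes themselves are defined elsewhere in the codebase:

#ifndef MGONGPUCPP_GPUIMPL
HostBufferMomenta hstMomenta( nevt ); // pageable host memory only (CPU build)
#else
PinnedHostBufferMomenta hstMomenta( nevt ); // page-locked host memory, faster host/device copies
DeviceBufferMomenta devMomenta( nevt ); // device-side buffer consumed by the GPU kernels
#endif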
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc index 70fbbee59f..3ecdb48914 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ d d~ WEIGHTED<=4 @2 // Process: s s~ > t t~ s s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -497,7 +498,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -554,7 +555,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -613,7 +614,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -708,8 +709,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, -1 }, { -1, 1, 1, -1, 1, 1 }, { -1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -751,9 +752,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -791,7 +792,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -856,12 +857,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -882,7 +883,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1008,9 +1009,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1034,7 +1035,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1054,7 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1068,9 +1069,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1098,7 +1102,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1308,7 +1312,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h index 58cece5c62..d8d3d481ea 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
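// The new #error guard above makes the per-backend random-number defaults explicit. The
// decision ladder encoded by that preprocessor block, written out as a sketch:
//
//   MGONGPU_HAS_NO_CURAND -> RandomNumberMode::CommonRandom (only option without curand)
//   __HIPCC__             -> compile-time error: curand is an NVidia library, so HIP
//                            builds are expected to define MGONGPU_HAS_NO_CURAND
//   __CUDACC__            -> RandomNumberMode::CurandDevice (generate on the device)
//   otherwise (C++ build) -> RandomNumberMode::CurandHost (curand host API on the CPU)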
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
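// The wrkflwtxt string assembled in the hunks above is a one-line fingerprint of the build
// configuration. Illustrative shapes only (assumed, not verbatim output, since not all tag
// branches are shown here):
//
//   CUDA build: "CUD:DBL+..."     // double precision on the NVidia backend
//   HIP build : "HIP:DBL+CXS:..." // cxsmpl complex type, see the hunk that follows
//   C++ build : "CPP:DBL+STX:..." // std::complex on the host
//
// The second field is the floating-point choice (DBL/FLT/MIX), the third the
// complex-number implementation.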
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index 7df13a2341..e21d1f0c48 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -51,7 +50,7 @@ // Process: c~ s~ > t t~ c~ s~ WEIGHTED<=4 @2 // Process: d~ s~ > t t~ d~ s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -85,7 +84,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -95,7 +94,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -123,13 +122,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -156,7 +155,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -192,7 +191,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both 
dependent and independent couplings @@ -205,8 +204,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -387,7 +388,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -444,7 +445,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -503,7 +504,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -598,8 +599,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -641,9 +642,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -681,7 +682,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
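// The hunks above replace checkCuda( cudaMemcpyToSymbol( ... ) ) with a vendor-neutral
// gpuMemcpyToSymbol. A hypothetical sketch of the mapping provided by the symlinked
// GpuAbstraction.h (not shown in this patch); checkGpu is an assumed error-checking
// wrapper in the same spirit as the old checkCuda:
//
//   #if defined __CUDACC__
//   #define gpuMemcpyToSymbol( dst, src, count ) checkGpu( cudaMemcpyToSymbol( dst, src, count ) )
//   #elif defined __HIPCC__
//   #define gpuMemcpyToSymbol( dst, src, count ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( dst ), src, count ) )
//   #endif
//
// Folding the error check into the macro keeps the call sites identical for both backends.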
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -746,12 +747,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -772,7 +773,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -898,9 +899,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -924,7 +925,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -944,7 +945,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -958,9 +959,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -988,7 +992,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1198,7 +1202,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h index 6bd3135c3c..901c6dfcc9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -112,7 +112,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -125,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -155,7 +155,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc index 
3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
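// In the "00 GpuInit" hunk below (the same change already applied to P2_uux_ttxuux above),
// CudaRuntime becomes GpuRuntime. A minimal sketch of the RAII pattern that the new comment
// describes, assuming the CUDA backend (the real class lives in the symlinked GpuRuntime.h;
// checkGpu is an assumed error-checking wrapper):
//
//   struct GpuRuntime
//   {
//     GpuRuntime( bool debug = true ) : m_debug( debug ) { checkGpu( cudaSetDevice( 0 ) ); } // grab device 0 up front
//     ~GpuRuntime() { cudaDeviceReset(); } // tear the context down on every exit path from main
//     bool m_debug;
//   };
//
// Tying setup and teardown to a stack object in main guarantees the reset runs even on
// early returns.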
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
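// The complex-number branch in the hunk that follows gains a dedicated __HIPCC__ case.
// What each macro selects per backend, as a sketch (only the CXS:/STX: tags are verbatim
// from these hunks):
//
//   CUDA: MGONGPU_CUCXTYPE_CUCOMPLEX (cuComplex) or MGONGPU_CUCXTYPE_THRUST (thrust::complex)
//   HIP : MGONGPU_CUCXTYPE_CXSMPL -> "CXS:", the plugin's own cxsmpl class, since neither
//         cuComplex nor thrust::complex is available under hipcc
//   C++ : MGONGPU_CPPCXTYPE_STDCOMPLEX -> "STX:" (std::complex on the host)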
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index f464c27160..527b1d3c8f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d~ d~ > t t~ d~ d~ WEIGHTED<=4 @2 // Process: s~ s~ > t t~ s~ s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both 
dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -497,7 +498,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -554,7 +555,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -613,7 +614,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -708,8 +709,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -751,9 +752,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -791,7 +792,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
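// With this rename three different guards now coexist in these sources; a quick reference:
//
//   __NVCC__           : nvcc drives the compilation, even for a plain .cc file
//   __CUDACC__         : nvcc is compiling CUDA code proper ('nvcc -x cu')
//   MGONGPUCPP_GPUIMPL : the plugin's portable "building a GPU backend" switch (CUDA or HIP)
//
// Compiler-version reporting keys on __NVCC__, NVidia-only pragmas stay on __CUDACC__, and
// everything backend-agnostic moves to MGONGPUCPP_GPUIMPL.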
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -856,12 +857,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -882,7 +883,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1008,9 +1009,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1034,7 +1035,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1054,7 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1068,9 +1069,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1098,7 +1102,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1308,7 +1312,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h index 4e53fa1250..c2ca443c0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc index 
3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
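// In the memory-buffer hunks further below (as in the two other check_sa.cc files above),
// GPU builds select PinnedHostBuffer* where C++ builds use HostBuffer*. Pinned
// (page-locked) host memory is what enables fast, and asynchronous, host<->device copies.
// A minimal sketch of the underlying allocation, assuming CUDA (checkGpu is an assumed
// wrapper; the real buffer classes wrap this in RAII):
//
//   fptype* hst = nullptr;
//   checkGpu( cudaMallocHost( (void**)&hst, nevt * sizeof( fptype ) ) ); // page-locked allocation
//   // ... fill hst, then copy with cudaMemcpy/cudaMemcpyAsync to the device buffer ...
//   checkGpu( cudaFreeHost( hst ) );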
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
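The gpuLaunchKernel calls in the RamboSamplingKernels.cc hunks above replace the CUDA-only kernel<<<blocks,threads>>> launch syntax, and the 'NB' comment in RandomNumberKernels.h relies on mgOnGpuConfig.h defining __global__ away in CPU-only builds. Since GpuAbstraction.h itself is only linked, not shown, in this patch, the following is a plausible sketch of such an abstraction layer, written here as an assumption for illustration:

#if defined __CUDACC__ // CUDA build: forward to the native triple-chevron launch
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#define gpuDeviceReset cudaDeviceReset
#elif defined __HIPCC__ // HIP build: forward to hipLaunchKernelGGL
#include <hip/hip_runtime.h>
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#define gpuDeviceReset hipDeviceReset
#else // C++-only build: neutralise the GPU keyword so shared headers compile
#define __global__
#endif

With a wrapper of this shape, kernels like getMomentaInitialDevice compile unchanged for both GPU vendors, which is exactly what the RamboSamplingKernels.cc changes exploit.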
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
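Regarding the USE_NVTX question just above: NVTX is NVIDIA's profiler-annotation API, so -DUSE_NVTX lets timed sections (e.g. the "00 GpuInit" step in check_sa.cc) appear as named ranges in Nsight profiles; ROCm's rocTX offers a broadly similar facility but is not wired up here. A minimal illustration of this kind of instrumentation, not project code (may additionally require linking -lnvToolsExt depending on the CUDA version):

#include "nvToolsExt.h" // CUDA toolkit header, enabled in these builds via -DUSE_NVTX
void timedSection()
{
  nvtxRangePushA( "00 GpuInit" ); // open a named range, visible in Nsight Systems timelines
  // ... the work being profiled goes here ...
  nvtxRangePop(); // close the innermost open range
}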
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edge case for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# (the -x cu flag of the old %_cu.o rule is now provided via CCBUILDRULEFLAGS above) # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edge case for HIP compilation (hipcc accepts -fno-fast-math directly, without the -Xcompiler prefix) ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment...
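Why the no-fast-math exception above matters (see #117 and #516): -ffast-math implies -ffinite-math-only, under which the compiler may assume that NaNs and infinities never occur and fold NaN checks to a constant, so abnormal matrix elements could go undetected. A standalone illustration of the failure mode, not project code:

#include <cmath>
#include <cstdio>
// Built with -ffast-math, this predicate may be optimised to 'return false',
// because the compiler is then allowed to assume finite math only.
static bool isAbnormal( double me ) { return std::isnan( me ) || std::isinf( me ); }
int main()
{
  volatile double zero = 0.0;    // volatile prevents compile-time folding
  const double me = zero / zero; // produces a NaN at runtime
  std::printf( "abnormal ME detected: %s\n", isAbnormal( me ) ? "yes" : "no" );
  return 0;
}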
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
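Looking back at the fbridge.cc hunks above: the lifecycle that Fortran MadEvent drives is create, run one or more sequences, delete, with GpuRuntime::setUp and tearDown bracketing GPU use. The same sequence can be sketched from C++; the declarations below mirror the signatures shown in the diff, while the dimension values are placeholders:

extern "C"
{
  struct CppObjectInFortran; // opaque handle, as in fbridge.cc
  void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F );
  void fbridgedelete_( CppObjectInFortran** ppbridge );
}
int main()
{
  CppObjectInFortran* bridge = nullptr;
  const int nevt = 16, npar = 4, np4 = 4;        // placeholder dimensions
  fbridgecreate_( &bridge, &nevt, &npar, &np4 ); // calls GpuRuntime::setUp() in GPU builds
  // ... fbridgesequence_( &bridge, momenta, gs, ... ) would compute the MEs here ...
  fbridgedelete_( &bridge );                     // calls GpuRuntime::tearDown() in GPU builds
  return 0;
}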
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. 
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the correct compile flags (-fPIC, -c and, for nvcc, -x cu) depending on whether GPUCC is nvcc or hipcc +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h index 80032e528b..55d03f1252 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e.
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex<fptype> cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== //
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace 
mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/pp_tt012j.mad/src/rambo.h b/epochX/cudacpp/pp_tt012j.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/rambo.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. 
) From 464703b6f6e96f7b3585663e41ec435b709e2cc5 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 18:08:40 +0100 Subject: [PATCH 508/509] [jt774] *** COMPLETE SYNC OF JTHIP24 AND JT774 *** regenerate all processes - add to repo Gpu*.h when missing *** NB Now all processes in the repo are the same as in jthip24 (including codegen logs as I copied those of jt774 to jthip24) *** *** NB Now jthip24 is identical to jt774, except that jthip24 also contains extra files in .github/workflows and in tools for CI and profiling *** --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 22 +- epochX/cudacpp/ee_mumu.mad/COPYRIGHT | 1 + .../cudacpp/ee_mumu.mad/SubProcesses/Bridge.h | 32 +-- .../ee_mumu.mad/SubProcesses/BridgeKernels.cc | 9 +- .../ee_mumu.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../SubProcesses/EventStatistics.h | 4 +- .../ee_mumu.mad/SubProcesses/GpuAbstraction.h | 71 ++++++ .../{CudaRuntime.h => GpuRuntime.h} | 54 ++-- .../ee_mumu.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../ee_mumu.mad/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../ee_mumu.mad/SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P1_epem_mupmum/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_epem_mupmum/CPPProcess.h | 10 +- .../SubProcesses/P1_epem_mupmum/CudaRuntime.h | 1 - .../P1_epem_mupmum/GpuAbstraction.h | 1 + .../SubProcesses/P1_epem_mupmum/GpuRuntime.h | 1 + .../SubProcesses/P1_epem_mupmum/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../ee_mumu.mad/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../ee_mumu.mad/SubProcesses/fbridge.cc | 16 +- .../ee_mumu.mad/SubProcesses/fsampler.cc | 8 +- .../ee_mumu.mad/SubProcesses/runTest.cc | 12 +- .../ee_mumu.mad/SubProcesses/testmisc.cc | 8 +- .../ee_mumu.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/ee_mumu.mad/src/Parameters_sm.cc | 4 +- .../cudacpp/ee_mumu.mad/src/Parameters_sm.h | 10 +- epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk | 23 +- .../cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 73 ++++-- .../cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/ee_mumu.mad/src/rambo.h | 8 +- .../CODEGEN_cudacpp_ee_mumu_log.txt | 10 +- epochX/cudacpp/ee_mumu.sa/COPYRIGHT | 1 + .../cudacpp/ee_mumu.sa/SubProcesses/Bridge.h | 32 +-- .../ee_mumu.sa/SubProcesses/BridgeKernels.cc | 9 +- .../ee_mumu.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- 
.../SubProcesses/CrossSectionKernels.h | 6 +- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../ee_mumu.sa/SubProcesses/EventStatistics.h | 4 +- .../ee_mumu.sa/SubProcesses/GpuAbstraction.h | 71 ++++++ .../SubProcesses/GpuRuntime.h} | 54 ++-- .../ee_mumu.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../ee_mumu.sa/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../ee_mumu.sa/SubProcesses/MemoryBuffers.h | 64 ++--- .../P1_Sigma_sm_epem_mupmum/CPPProcess.cc | 62 ++--- .../P1_Sigma_sm_epem_mupmum/CPPProcess.h | 10 +- .../P1_Sigma_sm_epem_mupmum/CudaRuntime.h | 1 - .../P1_Sigma_sm_epem_mupmum/GpuAbstraction.h | 1 + .../P1_Sigma_sm_epem_mupmum/GpuRuntime.h | 1 + .../P1_Sigma_sm_epem_mupmum/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../ee_mumu.sa/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../ee_mumu.sa/SubProcesses/fbridge.cc | 16 +- .../ee_mumu.sa/SubProcesses/fsampler.cc | 8 +- .../ee_mumu.sa/SubProcesses/runTest.cc | 12 +- .../ee_mumu.sa/SubProcesses/testmisc.cc | 8 +- .../ee_mumu.sa/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h | 4 +- .../cudacpp/ee_mumu.sa/src/Parameters_sm.cc | 4 +- epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h | 10 +- epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk | 23 +- epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h | 73 ++++-- .../cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/ee_mumu.sa/src/rambo.h | 8 +- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 14 +- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 12 +- epochX/cudacpp/gg_tt.sa/COPYRIGHT | 1 + epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h | 32 +-- .../gg_tt.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gg_tt.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gg_tt.sa/SubProcesses/EventStatistics.h | 4 +- .../gg_tt.sa/SubProcesses/GpuAbstraction.h | 71 ++++++ .../SubProcesses/GpuRuntime.h} | 54 ++-- .../gg_tt.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_tt.sa/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- 
.../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_tt.sa/SubProcesses/MemoryBuffers.h | 64 ++--- .../P1_Sigma_sm_gg_ttx/CPPProcess.cc | 62 ++--- .../P1_Sigma_sm_gg_ttx/CPPProcess.h | 10 +- .../P1_Sigma_sm_gg_ttx/CudaRuntime.h | 1 - .../P1_Sigma_sm_gg_ttx/GpuAbstraction.h | 1 + .../P1_Sigma_sm_gg_ttx/GpuRuntime.h | 1 + .../P1_Sigma_sm_gg_ttx/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../cudacpp/gg_tt.sa/SubProcesses/fbridge.cc | 16 +- .../cudacpp/gg_tt.sa/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gg_tt.sa/SubProcesses/runTest.cc | 12 +- .../cudacpp/gg_tt.sa/SubProcesses/testmisc.cc | 8 +- .../cudacpp/gg_tt.sa/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h | 10 +- epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h | 73 ++++-- epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h | 28 +-- epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h | 12 +- epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_tt.sa/src/rambo.h | 8 +- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 28 +-- epochX/cudacpp/gg_tt01g.mad/COPYRIGHT | 1 + .../gg_tt01g.mad/SubProcesses/Bridge.h | 32 +-- .../SubProcesses/BridgeKernels.cc | 9 +- .../gg_tt01g.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../SubProcesses/EventStatistics.h | 4 +- .../SubProcesses/GpuAbstraction.h | 71 ++++++ .../{CudaRuntime.h => GpuRuntime.h} | 54 ++-- .../gg_tt01g.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_tt01g.mad/SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttx/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttx/GpuAbstraction.h | 1 + .../SubProcesses/P1_gg_ttx/GpuRuntime.h | 1 + .../SubProcesses/P1_gg_ttx/check_sa.cc | 111 +++++---- .../SubProcesses/P2_gg_ttxg/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_gg_ttxg/CPPProcess.h | 10 +- .../SubProcesses/P2_gg_ttxg/CudaRuntime.h | 1 - .../SubProcesses/P2_gg_ttxg/GpuAbstraction.h | 1 + .../SubProcesses/P2_gg_ttxg/GpuRuntime.h | 1 + .../SubProcesses/P2_gg_ttxg/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_tt01g.mad/SubProcesses/cudacpp.mk | 232 +++++++++++------- 
.../gg_tt01g.mad/SubProcesses/fbridge.cc | 16 +- .../gg_tt01g.mad/SubProcesses/fsampler.cc | 8 +- .../gg_tt01g.mad/SubProcesses/runTest.cc | 12 +- .../gg_tt01g.mad/SubProcesses/testmisc.cc | 8 +- .../gg_tt01g.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_tt01g.mad/src/Parameters_sm.cc | 4 +- .../cudacpp/gg_tt01g.mad/src/Parameters_sm.h | 10 +- .../cudacpp/gg_tt01g.mad/src/cudacpp_src.mk | 23 +- .../cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h | 73 ++++-- .../cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_tt01g.mad/src/rambo.h | 8 +- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 22 +- epochX/cudacpp/gg_ttg.mad/COPYRIGHT | 1 + .../cudacpp/gg_ttg.mad/SubProcesses/Bridge.h | 32 +-- .../gg_ttg.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttg.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttg.mad/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gg_ttg.mad/SubProcesses/EventStatistics.h | 4 +- .../gg_ttg.mad/SubProcesses/GpuAbstraction.h | 71 ++++++ .../gg_ttg.mad/SubProcesses/GpuRuntime.h | 85 +++++++ .../gg_ttg.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_ttg.mad/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_ttg.mad/SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gg_ttxg/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttxg/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttxg/GpuAbstraction.h | 1 + .../SubProcesses/P1_gg_ttxg/GpuRuntime.h | 1 + .../SubProcesses/P1_gg_ttxg/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttg.mad/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../gg_ttg.mad/SubProcesses/fbridge.cc | 16 +- .../gg_ttg.mad/SubProcesses/fsampler.cc | 8 +- .../gg_ttg.mad/SubProcesses/runTest.cc | 12 +- .../gg_ttg.mad/SubProcesses/testmisc.cc | 8 +- .../gg_ttg.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_ttg.mad/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h | 10 +- epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h | 73 ++++-- .../cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_ttg.mad/src/rambo.h | 8 +- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 18 +- epochX/cudacpp/gg_ttg.sa/COPYRIGHT | 1 + .../cudacpp/gg_ttg.sa/SubProcesses/Bridge.h | 32 +-- 
.../gg_ttg.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttg.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttg.sa/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gg_ttg.sa/SubProcesses/EventStatistics.h | 4 +- .../gg_ttg.sa/SubProcesses/GpuAbstraction.h | 71 ++++++ .../gg_ttg.sa/SubProcesses/GpuRuntime.h | 85 +++++++ .../gg_ttg.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_ttg.sa/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_ttg.sa/SubProcesses/MemoryBuffers.h | 64 ++--- .../P1_Sigma_sm_gg_ttxg/CPPProcess.cc | 62 ++--- .../P1_Sigma_sm_gg_ttxg/CPPProcess.h | 10 +- .../P1_Sigma_sm_gg_ttxg/CudaRuntime.h | 1 - .../P1_Sigma_sm_gg_ttxg/GpuAbstraction.h | 1 + .../P1_Sigma_sm_gg_ttxg/GpuRuntime.h | 1 + .../P1_Sigma_sm_gg_ttxg/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc | 16 +- .../gg_ttg.sa/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gg_ttg.sa/SubProcesses/runTest.cc | 12 +- .../gg_ttg.sa/SubProcesses/testmisc.cc | 8 +- .../cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h | 10 +- epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h | 73 ++++-- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h | 28 +-- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h | 12 +- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_ttg.sa/src/rambo.h | 8 +- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 28 +-- epochX/cudacpp/gg_ttgg.mad/COPYRIGHT | 1 + .../cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h | 32 +-- .../gg_ttgg.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttgg.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttgg.mad/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../SubProcesses/EventStatistics.h | 4 +- .../gg_ttgg.mad/SubProcesses/GpuAbstraction.h | 32 +-- .../gg_ttgg.mad/SubProcesses/GpuRuntime.h | 85 +++++++ .../gg_ttgg.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 22 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- 
.../gg_ttgg.mad/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_ttgg.mad/SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P1_gg_ttxgg/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gg_ttxgg/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttxgg/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttxgg/GpuAbstraction.h | 1 + .../SubProcesses/P1_gg_ttxgg/GpuRuntime.h | 1 + .../SubProcesses/P1_gg_ttxgg/check_sa.cc | 107 ++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttgg.mad/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../gg_ttgg.mad/SubProcesses/fbridge.cc | 16 +- .../gg_ttgg.mad/SubProcesses/fsampler.cc | 8 +- .../gg_ttgg.mad/SubProcesses/runTest.cc | 10 +- .../gg_ttgg.mad/SubProcesses/testmisc.cc | 8 +- .../gg_ttgg.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_ttgg.mad/src/Parameters_sm.cc | 4 +- .../cudacpp/gg_ttgg.mad/src/Parameters_sm.h | 10 +- epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk | 23 +- .../cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h | 71 ++++-- .../cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h | 20 +- epochX/cudacpp/gg_ttgg.mad/src/rambo.h | 8 +- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 18 +- epochX/cudacpp/gg_ttgg.sa/COPYRIGHT | 1 + .../cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h | 32 +-- .../gg_ttgg.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttgg.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttgg.sa/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gg_ttgg.sa/SubProcesses/EventStatistics.h | 4 +- .../gg_ttgg.sa/SubProcesses/GpuAbstraction.h | 71 ++++++ .../gg_ttgg.sa/SubProcesses/GpuRuntime.h | 85 +++++++ .../gg_ttgg.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_ttgg.sa/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_ttgg.sa/SubProcesses/MemoryBuffers.h | 64 ++--- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.cc | 62 ++--- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.h | 10 +- .../P1_Sigma_sm_gg_ttxgg/CudaRuntime.h | 1 - .../P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h | 1 + .../P1_Sigma_sm_gg_ttxgg/GpuRuntime.h | 1 + .../P1_Sigma_sm_gg_ttxgg/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- 
.../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttgg.sa/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../gg_ttgg.sa/SubProcesses/fbridge.cc | 16 +- .../gg_ttgg.sa/SubProcesses/fsampler.cc | 8 +- .../gg_ttgg.sa/SubProcesses/runTest.cc | 12 +- .../gg_ttgg.sa/SubProcesses/testmisc.cc | 8 +- .../gg_ttgg.sa/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_ttgg.sa/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h | 10 +- epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h | 73 ++++-- .../cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_ttgg.sa/src/rambo.h | 8 +- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 26 +- epochX/cudacpp/gg_ttggg.mad/COPYRIGHT | 1 + .../gg_ttggg.mad/SubProcesses/Bridge.h | 32 +-- .../SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttggg.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttggg.mad/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../SubProcesses/EventStatistics.h | 4 +- .../SubProcesses/GpuAbstraction.h | 71 ++++++ .../gg_ttggg.mad/SubProcesses/GpuRuntime.h | 85 +++++++ .../gg_ttggg.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_ttggg.mad/SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P1_gg_ttxggg/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gg_ttxggg/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttxggg/CudaRuntime.h | 1 - .../P1_gg_ttxggg/GpuAbstraction.h | 1 + .../SubProcesses/P1_gg_ttxggg/GpuRuntime.h | 1 + .../SubProcesses/P1_gg_ttxggg/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttggg.mad/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../gg_ttggg.mad/SubProcesses/fbridge.cc | 16 +- .../gg_ttggg.mad/SubProcesses/fsampler.cc | 8 +- .../gg_ttggg.mad/SubProcesses/runTest.cc | 12 +- .../gg_ttggg.mad/SubProcesses/testmisc.cc | 8 +- .../gg_ttggg.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_ttggg.mad/src/Parameters_sm.cc | 4 +- .../cudacpp/gg_ttggg.mad/src/Parameters_sm.h | 10 +- .../cudacpp/gg_ttggg.mad/src/cudacpp_src.mk | 23 +- .../cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h | 73 ++++-- .../cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_ttggg.mad/src/rambo.h | 8 +- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 16 +- 
epochX/cudacpp/gg_ttggg.sa/COPYRIGHT | 1 + .../cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h | 32 +-- .../gg_ttggg.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttggg.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttggg.sa/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../SubProcesses/EventStatistics.h | 4 +- .../gg_ttggg.sa/SubProcesses/GpuAbstraction.h | 71 ++++++ .../gg_ttggg.sa/SubProcesses/GpuRuntime.h | 85 +++++++ .../gg_ttggg.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_ttggg.sa/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_ttggg.sa/SubProcesses/MemoryBuffers.h | 64 ++--- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.cc | 62 ++--- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.h | 10 +- .../P1_Sigma_sm_gg_ttxggg/CudaRuntime.h | 1 - .../P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h | 1 + .../P1_Sigma_sm_gg_ttxggg/GpuRuntime.h | 1 + .../P1_Sigma_sm_gg_ttxggg/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttggg.sa/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../gg_ttggg.sa/SubProcesses/fbridge.cc | 16 +- .../gg_ttggg.sa/SubProcesses/fsampler.cc | 8 +- .../gg_ttggg.sa/SubProcesses/runTest.cc | 12 +- .../gg_ttggg.sa/SubProcesses/testmisc.cc | 8 +- .../gg_ttggg.sa/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_ttggg.sa/src/Parameters_sm.cc | 4 +- .../cudacpp/gg_ttggg.sa/src/Parameters_sm.h | 10 +- epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk | 23 +- .../cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h | 73 ++++-- .../cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_ttggg.sa/src/rambo.h | 8 +- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 30 +-- epochX/cudacpp/gq_ttq.mad/COPYRIGHT | 1 + .../cudacpp/gq_ttq.mad/SubProcesses/Bridge.h | 32 +-- .../gq_ttq.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gq_ttq.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gq_ttq.mad/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gq_ttq.mad/SubProcesses/EventStatistics.h | 4 +- .../gq_ttq.mad/SubProcesses/GpuAbstraction.h | 71 ++++++ .../gq_ttq.mad/SubProcesses/GpuRuntime.h | 85 +++++++ .../gq_ttq.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- 
.../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gq_ttq.mad/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gq_ttq.mad/SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gu_ttxu/CPPProcess.h | 10 +- .../SubProcesses/P1_gu_ttxu/CudaRuntime.h | 1 - .../SubProcesses/P1_gu_ttxu/GpuAbstraction.h | 1 + .../SubProcesses/P1_gu_ttxu/GpuRuntime.h | 1 + .../SubProcesses/P1_gu_ttxu/check_sa.cc | 111 +++++---- .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gux_ttxux/CPPProcess.h | 10 +- .../SubProcesses/P1_gux_ttxux/CudaRuntime.h | 1 - .../P1_gux_ttxux/GpuAbstraction.h | 1 + .../SubProcesses/P1_gux_ttxux/GpuRuntime.h | 1 + .../SubProcesses/P1_gux_ttxux/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gq_ttq.mad/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../gq_ttq.mad/SubProcesses/fbridge.cc | 16 +- .../gq_ttq.mad/SubProcesses/fsampler.cc | 8 +- .../gq_ttq.mad/SubProcesses/runTest.cc | 12 +- .../gq_ttq.mad/SubProcesses/testmisc.cc | 8 +- .../gq_ttq.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/gq_ttq.mad/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h | 10 +- epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h | 73 ++++-- .../cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gq_ttq.mad/src/rambo.h | 8 +- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 16 +- epochX/cudacpp/gq_ttq.sa/COPYRIGHT | 1 + .../cudacpp/gq_ttq.sa/SubProcesses/Bridge.h | 32 +-- .../gq_ttq.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gq_ttq.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gq_ttq.sa/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gq_ttq.sa/SubProcesses/EventStatistics.h | 4 +- .../gq_ttq.sa/SubProcesses/GpuAbstraction.h | 71 ++++++ .../gq_ttq.sa/SubProcesses/GpuRuntime.h | 85 +++++++ .../gq_ttq.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gq_ttq.sa/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gq_ttq.sa/SubProcesses/MemoryBuffers.h | 64 ++--- 
.../P1_Sigma_sm_gu_ttxu/CPPProcess.cc | 62 ++--- .../P1_Sigma_sm_gu_ttxu/CPPProcess.h | 10 +- .../P1_Sigma_sm_gu_ttxu/CudaRuntime.h | 1 - .../P1_Sigma_sm_gu_ttxu/GpuAbstraction.h | 1 + .../P1_Sigma_sm_gu_ttxu/GpuRuntime.h | 1 + .../P1_Sigma_sm_gu_ttxu/check_sa.cc | 111 +++++---- .../P1_Sigma_sm_gux_ttxux/CPPProcess.cc | 62 ++--- .../P1_Sigma_sm_gux_ttxux/CPPProcess.h | 10 +- .../P1_Sigma_sm_gux_ttxux/CudaRuntime.h | 1 - .../P1_Sigma_sm_gux_ttxux/GpuAbstraction.h | 1 + .../P1_Sigma_sm_gux_ttxux/GpuRuntime.h | 1 + .../P1_Sigma_sm_gux_ttxux/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc | 16 +- .../gq_ttq.sa/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gq_ttq.sa/SubProcesses/runTest.cc | 12 +- .../gq_ttq.sa/SubProcesses/testmisc.cc | 8 +- .../cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h | 10 +- epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h | 73 ++++-- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h | 28 +-- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h | 12 +- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gq_ttq.sa/src/rambo.h | 8 +- .../CODEGEN_cudacpp_heft_gg_h_log.txt | 12 +- epochX/cudacpp/heft_gg_h.sa/COPYRIGHT | 1 + .../heft_gg_h.sa/SubProcesses/Bridge.h | 32 +-- .../SubProcesses/BridgeKernels.cc | 9 +- .../heft_gg_h.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../heft_gg_h.sa/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../SubProcesses/EventStatistics.h | 4 +- .../SubProcesses/GpuAbstraction.h | 71 ++++++ .../heft_gg_h.sa/SubProcesses/GpuRuntime.h | 85 +++++++ .../heft_gg_h.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../heft_gg_h.sa/SubProcesses/MemoryBuffers.h | 64 ++--- .../P1_Sigma_heft_gg_h/CPPProcess.cc | 62 ++--- .../P1_Sigma_heft_gg_h/CPPProcess.h | 10 +- .../P1_Sigma_heft_gg_h/CudaRuntime.h | 1 - .../P1_Sigma_heft_gg_h/GpuAbstraction.h | 1 + .../P1_Sigma_heft_gg_h/GpuRuntime.h | 1 + .../P1_Sigma_heft_gg_h/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../heft_gg_h.sa/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../heft_gg_h.sa/SubProcesses/fbridge.cc | 16 +- .../heft_gg_h.sa/SubProcesses/fsampler.cc | 8 +- .../heft_gg_h.sa/SubProcesses/runTest.cc | 
12 +- .../heft_gg_h.sa/SubProcesses/testmisc.cc | 8 +- .../heft_gg_h.sa/SubProcesses/testxxx.cc | 14 +- .../cudacpp/heft_gg_h.sa/src/HelAmps_heft.h | 4 +- .../heft_gg_h.sa/src/Parameters_heft.cc | 4 +- .../heft_gg_h.sa/src/Parameters_heft.h | 10 +- .../cudacpp/heft_gg_h.sa/src/cudacpp_src.mk | 23 +- .../cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h | 73 ++++-- .../cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/heft_gg_h.sa/src/rambo.h | 8 +- .../CODEGEN_mad_pp_tt012j_log.txt | 64 ++--- epochX/cudacpp/pp_tt012j.mad/COPYRIGHT | 1 + .../pp_tt012j.mad/SubProcesses/Bridge.h | 32 +-- .../SubProcesses/BridgeKernels.cc | 9 +- .../SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../pp_tt012j.mad/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../SubProcesses/EventStatistics.h | 4 +- .../SubProcesses/GpuAbstraction.h | 71 ++++++ .../pp_tt012j.mad/SubProcesses/GpuRuntime.h | 85 +++++++ .../pp_tt012j.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P0_gg_ttx/CPPProcess.cc | 62 ++--- .../SubProcesses/P0_gg_ttx/CPPProcess.h | 10 +- .../SubProcesses/P0_gg_ttx/CudaRuntime.h | 1 - .../SubProcesses/P0_gg_ttx/GpuAbstraction.h | 1 + .../SubProcesses/P0_gg_ttx/GpuRuntime.h | 1 + .../SubProcesses/P0_gg_ttx/check_sa.cc | 111 +++++---- .../SubProcesses/P0_uux_ttx/CPPProcess.cc | 62 ++--- .../SubProcesses/P0_uux_ttx/CPPProcess.h | 10 +- .../SubProcesses/P0_uux_ttx/CudaRuntime.h | 1 - .../SubProcesses/P0_uux_ttx/GpuAbstraction.h | 1 + .../SubProcesses/P0_uux_ttx/GpuRuntime.h | 1 + .../SubProcesses/P0_uux_ttx/check_sa.cc | 111 +++++---- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gg_ttxg/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttxg/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttxg/GpuAbstraction.h | 1 + .../SubProcesses/P1_gg_ttxg/GpuRuntime.h | 1 + .../SubProcesses/P1_gg_ttxg/check_sa.cc | 111 +++++---- .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gu_ttxu/CPPProcess.h | 10 +- .../SubProcesses/P1_gu_ttxu/CudaRuntime.h | 1 - .../SubProcesses/P1_gu_ttxu/GpuAbstraction.h | 1 + .../SubProcesses/P1_gu_ttxu/GpuRuntime.h | 1 + .../SubProcesses/P1_gu_ttxu/check_sa.cc | 111 +++++---- .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gux_ttxux/CPPProcess.h | 10 +- .../SubProcesses/P1_gux_ttxux/CudaRuntime.h | 1 - .../P1_gux_ttxux/GpuAbstraction.h | 1 + .../SubProcesses/P1_gux_ttxux/GpuRuntime.h | 1 + .../SubProcesses/P1_gux_ttxux/check_sa.cc | 111 +++++---- .../SubProcesses/P1_uux_ttxg/CPPProcess.cc | 62 ++--- 
.../SubProcesses/P1_uux_ttxg/CPPProcess.h | 10 +- .../SubProcesses/P1_uux_ttxg/CudaRuntime.h | 1 - .../SubProcesses/P1_uux_ttxg/GpuAbstraction.h | 1 + .../SubProcesses/P1_uux_ttxg/GpuRuntime.h | 1 + .../SubProcesses/P1_uux_ttxg/check_sa.cc | 111 +++++---- .../SubProcesses/P2_gg_ttxgg/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_gg_ttxgg/CPPProcess.h | 10 +- .../SubProcesses/P2_gg_ttxgg/CudaRuntime.h | 1 - .../SubProcesses/P2_gg_ttxgg/GpuAbstraction.h | 1 + .../SubProcesses/P2_gg_ttxgg/GpuRuntime.h | 1 + .../SubProcesses/P2_gg_ttxgg/check_sa.cc | 111 +++++---- .../SubProcesses/P2_gg_ttxuux/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_gg_ttxuux/CPPProcess.h | 10 +- .../SubProcesses/P2_gg_ttxuux/CudaRuntime.h | 1 - .../P2_gg_ttxuux/GpuAbstraction.h | 1 + .../SubProcesses/P2_gg_ttxuux/GpuRuntime.h | 1 + .../SubProcesses/P2_gg_ttxuux/check_sa.cc | 111 +++++---- .../SubProcesses/P2_gu_ttxgu/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_gu_ttxgu/CPPProcess.h | 10 +- .../SubProcesses/P2_gu_ttxgu/CudaRuntime.h | 1 - .../SubProcesses/P2_gu_ttxgu/GpuAbstraction.h | 1 + .../SubProcesses/P2_gu_ttxgu/GpuRuntime.h | 1 + .../SubProcesses/P2_gu_ttxgu/check_sa.cc | 111 +++++---- .../SubProcesses/P2_gux_ttxgux/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_gux_ttxgux/CPPProcess.h | 10 +- .../SubProcesses/P2_gux_ttxgux/CudaRuntime.h | 1 - .../P2_gux_ttxgux/GpuAbstraction.h | 1 + .../SubProcesses/P2_gux_ttxgux/GpuRuntime.h | 1 + .../SubProcesses/P2_gux_ttxgux/check_sa.cc | 111 +++++---- .../SubProcesses/P2_uc_ttxuc/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_uc_ttxuc/CPPProcess.h | 10 +- .../SubProcesses/P2_uc_ttxuc/CudaRuntime.h | 1 - .../SubProcesses/P2_uc_ttxuc/GpuAbstraction.h | 1 + .../SubProcesses/P2_uc_ttxuc/GpuRuntime.h | 1 + .../SubProcesses/P2_uc_ttxuc/check_sa.cc | 111 +++++---- .../SubProcesses/P2_ucx_ttxucx/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_ucx_ttxucx/CPPProcess.h | 10 +- .../SubProcesses/P2_ucx_ttxucx/CudaRuntime.h | 1 - .../P2_ucx_ttxucx/GpuAbstraction.h | 1 + .../SubProcesses/P2_ucx_ttxucx/GpuRuntime.h | 1 + .../SubProcesses/P2_ucx_ttxucx/check_sa.cc | 111 +++++---- .../SubProcesses/P2_uu_ttxuu/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_uu_ttxuu/CPPProcess.h | 10 +- .../SubProcesses/P2_uu_ttxuu/CudaRuntime.h | 1 - .../SubProcesses/P2_uu_ttxuu/GpuAbstraction.h | 1 + .../SubProcesses/P2_uu_ttxuu/GpuRuntime.h | 1 + .../SubProcesses/P2_uu_ttxuu/check_sa.cc | 111 +++++---- .../SubProcesses/P2_uux_ttxccx/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_uux_ttxccx/CPPProcess.h | 10 +- .../SubProcesses/P2_uux_ttxccx/CudaRuntime.h | 1 - .../P2_uux_ttxccx/GpuAbstraction.h | 1 + .../SubProcesses/P2_uux_ttxccx/GpuRuntime.h | 1 + .../SubProcesses/P2_uux_ttxccx/check_sa.cc | 111 +++++---- .../SubProcesses/P2_uux_ttxgg/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_uux_ttxgg/CPPProcess.h | 10 +- .../SubProcesses/P2_uux_ttxgg/CudaRuntime.h | 1 - .../P2_uux_ttxgg/GpuAbstraction.h | 1 + .../SubProcesses/P2_uux_ttxgg/GpuRuntime.h | 1 + .../SubProcesses/P2_uux_ttxgg/check_sa.cc | 111 +++++---- .../SubProcesses/P2_uux_ttxuux/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_uux_ttxuux/CPPProcess.h | 10 +- .../SubProcesses/P2_uux_ttxuux/CudaRuntime.h | 1 - .../P2_uux_ttxuux/GpuAbstraction.h | 1 + .../SubProcesses/P2_uux_ttxuux/GpuRuntime.h | 1 + .../SubProcesses/P2_uux_ttxuux/check_sa.cc | 111 +++++---- .../P2_uxcx_ttxuxcx/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h | 10 +- .../P2_uxcx_ttxuxcx/CudaRuntime.h | 1 - .../P2_uxcx_ttxuxcx/GpuAbstraction.h | 1 + 
.../SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h | 1 + .../SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc | 111 +++++---- .../P2_uxux_ttxuxux/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_uxux_ttxuxux/CPPProcess.h | 10 +- .../P2_uxux_ttxuxux/CudaRuntime.h | 1 - .../P2_uxux_ttxuxux/GpuAbstraction.h | 1 + .../SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h | 1 + .../SubProcesses/P2_uxux_ttxuxux/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../pp_tt012j.mad/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../pp_tt012j.mad/SubProcesses/fbridge.cc | 16 +- .../pp_tt012j.mad/SubProcesses/fsampler.cc | 8 +- .../pp_tt012j.mad/SubProcesses/runTest.cc | 12 +- .../pp_tt012j.mad/SubProcesses/testmisc.cc | 8 +- .../pp_tt012j.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h | 4 +- .../pp_tt012j.mad/src/Parameters_sm.cc | 4 +- .../cudacpp/pp_tt012j.mad/src/Parameters_sm.h | 10 +- .../cudacpp/pp_tt012j.mad/src/cudacpp_src.mk | 23 +- .../cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h | 73 ++++-- .../pp_tt012j.mad/src/mgOnGpuCxtypes.h | 28 +-- .../pp_tt012j.mad/src/mgOnGpuFptypes.h | 12 +- .../pp_tt012j.mad/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/pp_tt012j.mad/src/rambo.h | 8 +- 859 files changed, 11337 insertions(+), 8348 deletions(-) create mode 100644 epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h rename epochX/cudacpp/ee_mumu.mad/SubProcesses/{CudaRuntime.h => GpuRuntime.h} (62%) delete mode 120000 epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h create mode 120000 epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuAbstraction.h create mode 120000 epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuRuntime.h create mode 100644 epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h rename epochX/cudacpp/{gg_tt.sa/SubProcesses/CudaRuntime.h => ee_mumu.sa/SubProcesses/GpuRuntime.h} (62%) delete mode 120000 epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h create mode 120000 epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuAbstraction.h create mode 120000 epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuRuntime.h create mode 100644 epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h rename epochX/cudacpp/{ee_mumu.sa/SubProcesses/CudaRuntime.h => gg_tt.sa/SubProcesses/GpuRuntime.h} (62%) delete mode 120000 epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuRuntime.h create mode 100644 epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h rename epochX/cudacpp/gg_tt01g.mad/SubProcesses/{CudaRuntime.h => GpuRuntime.h} (62%) delete mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h create 
mode 100644 epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h create mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h create mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h delete mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h create mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h create mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h delete mode 100644 epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h create mode 100644 
epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h create mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuAbstraction.h create mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuRuntime.h delete mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h create mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuAbstraction.h create mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuRuntime.h delete mode 100644 epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h create mode 120000 epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuAbstraction.h create mode 120000 epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuRuntime.h delete mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuRuntime.h delete mode 120000 
epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index 36b42987c5..dd0f31341f 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005498409271240234  +DEBUG: model prefixing takes 0.005403280258178711  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -174,7 +174,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,19 +191,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.102 s +Wrote files for 8 helas calls in 0.098 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.203 s +ALOHA: aloha creates 3 routines in 0.200 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.260 s +ALOHA: aloha creates 7 routines in 0.537 s FFV1 FFV1 FFV2 @@ -248,9 +248,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.900s -user 0m1.697s -sys 0m0.195s +real 0m2.147s +user 0m1.627s +sys 0m0.231s Code generation completed in 2 seconds ************************************************************ * * @@ -277,7 +277,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -307,7 +307,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. 
Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT +++ b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple 
of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // hipHostMalloc is preferred over the deprecated hipMallocHost
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h
similarity index 62%
rename from epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h
rename to epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h
index 64ce52f4b3..93579ef08b 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h
@@ -1,49 +1,50 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
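For orientation, a minimal usage sketch of the new gpu* macros (illustrative only, not part of this patch: the kernel name, grid shape and buffer size are hypothetical, and checkGpu/assertGpu come from the GpuRuntime.h shown next, assuming a build where MGONGPUCPP_GPUIMPL is defined):

#include "GpuAbstraction.h"
#include "GpuRuntime.h" // for checkGpu / assertGpu

__global__ void scaleByTwo( double* d, int n ) // hypothetical kernel
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) d[i] *= 2.;
}

void sketch()
{
  const int nevt = 1024;
  double* hst = nullptr;
  double* dev = nullptr;
  gpuMallocHost( &hst, nevt * sizeof( double ) ); // pinned host memory; the error check is inside the macro
  gpuMalloc( &dev, nevt * sizeof( double ) );
  gpuMemcpy( dev, hst, nevt * sizeof( double ), gpuMemcpyHostToDevice );
  gpuLaunchKernel( scaleByTwo, nevt / 256, 256, dev, nevt ); // scaleByTwo<<<4, 256>>>( dev, nevt ) on CUDA
  checkGpu( gpuPeekAtLastError() );
  checkGpu( gpuDeviceSynchronize() );
  gpuMemcpy( hst, dev, nevt * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuFree( dev );
  gpuFreeHost( hst );
}

The same source compiles unchanged with nvcc (__CUDACC__) or hipcc (__HIPCC__); only the macro expansions differ.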
-#ifndef MG5AMC_CUDARUNTIME_H
-#define MG5AMC_CUDARUNTIME_H 1
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
 
 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
 // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
-#include
+#include "GpuAbstraction.h"
+
 #include
 
 //--------------------------------------------------------------------------
 
 // See https://stackoverflow.com/a/14038590
-#ifdef __CUDACC__ /* clang-format off */
-#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
-inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
 {
-  if( code != cudaSuccess )
+  if( code != gpuSuccess )
   {
-    printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
-    if( abort ) assert( code == cudaSuccess );
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
   }
 }
 #endif /* clang-format on */
 
 //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
   // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
-  struct CudaRuntime final
+  struct GpuRuntime final
   {
-    CudaRuntime( const bool debug = true )
+    GpuRuntime( const bool debug = true )
       : m_debug( debug ) { setUp( m_debug ); }
-    ~CudaRuntime() { tearDown( m_debug ); }
-    CudaRuntime( const CudaRuntime& ) = delete;
-    CudaRuntime( CudaRuntime&& ) = delete;
-    CudaRuntime& operator=( const CudaRuntime& ) = delete;
-    CudaRuntime& operator=( CudaRuntime&& ) = delete;
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
     bool m_debug;
 
     // Set up CUDA application
@@ -62,8 +63,8 @@ namespace mg5amcGpu
     */
     // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
    // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
-    if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl;
-    checkCuda( cudaSetDevice( 0 ) ); // SLOW!
+    if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl;
+    checkGpu( gpuSetDevice( 0 ) ); // SLOW!
} // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
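A side note on the post-launch idiom visible in MatrixElementKernelDevice above (illustrative sketch; someKernel and its arguments are hypothetical): the two checks catch different failure modes.

gpuLaunchKernel( someKernel, gpublocks, gputhreads, devBuffer, nevt );
checkGpu( gpuPeekAtLastError() );   // synchronous launch errors, e.g. an invalid grid/block configuration
checkGpu( gpuDeviceSynchronize() ); // waits for completion and surfaces asynchronous execution errors, e.g. an illegal memory access inside the kernel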
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
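The recurring edit in all of these headers follows a single pattern, sketched below; the assumption (consistent with the mgOnGpuConfig.h changes listed in the diffstat) is that MGONGPUCPP_GPUIMPL is defined for any GPU backend, CUDA or HIP, whereas the old __CUDACC__ guard was only set by nvcc:

#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU build (CUDA via __CUDACC__ or HIP via __HIPCC__)
#else
namespace mg5amcCpu // CPU build, including the SIMD-vectorized paths
#endif
{
  // identical source, compiled once into the GPU namespace and once into the CPU namespace
}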
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template 
class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
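// NB (sketch): GpuAbstraction.h is added in this patch only as a symlink, so its contents
// are not shown here. The gpu* wrappers used above presumably map onto the CUDA or HIP
// runtime depending on the active compiler; a minimal sketch, assuming the standard
// __CUDACC__/__HIPCC__ macros, with names inferred from the call sites in this patch
// (checkGpu is an assumed error-checking helper replacing the old checkCuda):
#include <cstdio>
#include <cstdlib>
#define checkGpu( code ) \
  do { if( ( code ) != gpuSuccess ) { std::printf( "GPU runtime error at %s:%d\n", __FILE__, __LINE__ ); std::exit( 1 ); } } while( 0 )
#ifdef __CUDACC__ // CUDA backend
#include <cuda_runtime.h>
#define gpuSuccess cudaSuccess
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( cudaMemcpyToSymbol( sym, src, bytes ) )
#elif defined __HIPCC__ // HIP backend
#include <hip/hip_runtime.h>
#define gpuSuccess hipSuccess
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // NB: hipMallocHost is deprecated
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( hipMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( sym ), src, bytes ) )
#endif
// With one such layer, the buffer classes above stay backend-agnostic: only the
// macro definitions change between NVidia and AMD builds, not the call sites.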
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 9193aa2382..83e5b15013 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MZ, (fptype)Parameters_sm::mdl_WZ }; __device__ const fptype cIPC[6] = { (fptype)Parameters_sm::GC_3.real(), (fptype)Parameters_sm::GC_3.imag(), (fptype)Parameters_sm::GC_50.real(), (fptype)Parameters_sm::GC_50.imag(), (fptype)Parameters_sm::GC_59.real(), (fptype)Parameters_sm::GC_59.imag() }; #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype cIPC[6]; #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef 
__CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -286,7 +287,7 @@ namespace mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -343,7 +344,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -402,7 +403,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -449,8 +450,8 @@ namespace mg5amcCpu { 1, -1, 1, 1 }, { 1, -1, -1, -1 }, { 1, -1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -490,9 +491,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; const cxtype tIPC[3] = { cxmake( m_pars->GC_3 ), cxmake( m_pars->GC_50 ), cxmake( m_pars->GC_59 ) }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + gpuMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ); #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); memcpy( cIPC, tIPC, 3 * sizeof( cxtype ) ); @@ -529,7 +530,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -594,12 +595,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -620,7 +621,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -746,9 +747,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -772,7 +773,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -792,7 +793,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -806,9 +807,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -836,7 +840,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1046,7 +1050,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index 77b610753c..0b29ffb3ff 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h index 9fa30cfd7f..e878fcd28e 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc index 0b4be4d5ed..cffc5d3bff 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h index 64d0b8e761..2a6d960581 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -235,7 +235,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -259,7 +259,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct -DHIP_LATFORM when compiling for HIP +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 80032e528b..55d03f1252 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
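To summarize the switch introduced in mgOnGpuConfig.h above: MGONGPUCPP_GPUIMPL is defined for both CUDA (__CUDACC__) and HIP (__HIPCC__) builds and replaces __CUDACC__ as the generic "GPU implementation" test. A minimal sketch of the namespace-selection pattern that this patch applies throughout the headers:

#include "mgOnGpuConfig.h"

// NB: namespaces mg5amcGpu and mg5amcCpu include types which are defined in
// different ways for CPU and GPU builds (see #318 and #725)
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  // code here is compiled once for GPU builds (CUDA or HIP) and once for C++ builds
}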
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return 
mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/ee_mumu.mad/src/rambo.h b/epochX/cudacpp/ee_mumu.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/rambo.h +++ b/epochX/cudacpp/ee_mumu.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 636fab0372..20d35a4a26 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00569605827331543  +DEBUG: model prefixing takes 0.005757331848144531  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -181,7 +181,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.271 s +ALOHA: aloha creates 4 routines in 0.267 s FFV1 FFV1 FFV2 @@ -201,6 +201,6 @@ INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu quit real 0m0.662s -user 0m0.604s -sys 0m0.052s +user 0m0.596s +sys 0m0.051s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT +++ b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h similarity index 62% rename from epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h rename to epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h index 64ce52f4b3..93579ef08b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h @@ -1,49 +1,50 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
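A minimal usage sketch for the GpuAbstraction.h macros above; the kernel and buffer names are illustrative, and the same source builds with nvcc (__CUDACC__) or hipcc (__HIPCC__). Note that gpuLaunchKernel( kernel, blocks, threads, ... ) expands to the kernel<<<blocks, threads>>>( ... ) launch syntax:

#include "GpuRuntime.h" // pulls in GpuAbstraction.h and defines checkGpu/assertGpu (see below)

#include <cstddef>

// Illustrative kernel: scale n doubles in device memory
__global__ void devScale( double* data, int n )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) data[i] *= 2.;
}

void exampleSequence( const double* hstData, double* devData, std::size_t bytes, int gpublocks, int gputhreads, int n )
{
  gpuMemcpy( devData, hstData, bytes, gpuMemcpyHostToDevice ); // the macro already wraps the call in checkGpu
  gpuLaunchKernel( devScale, gpublocks, gputhreads, devData, n );
  checkGpu( gpuDeviceSynchronize() ); // plain function-name aliases must be checked explicitly
}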
-#ifndef MG5AMC_CUDARUNTIME_H
-#define MG5AMC_CUDARUNTIME_H 1
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1

 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
 // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
-#include <cassert>
+#include "GpuAbstraction.h"
+
 #include <iostream>

 //--------------------------------------------------------------------------

 // See https://stackoverflow.com/a/14038590
-#ifdef __CUDACC__ /* clang-format off */
-#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
-inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
 {
-  if( code != cudaSuccess )
+  if( code != gpuSuccess )
   {
-    printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
-    if( abort ) assert( code == cudaSuccess );
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
   }
 }
 #endif /* clang-format on */

 //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
   // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
-  struct CudaRuntime final
+  struct GpuRuntime final
   {
-    CudaRuntime( const bool debug = true )
+    GpuRuntime( const bool debug = true )
       : m_debug( debug ) { setUp( m_debug ); }
-    ~CudaRuntime() { tearDown( m_debug ); }
-    CudaRuntime( const CudaRuntime& ) = delete;
-    CudaRuntime( CudaRuntime&& ) = delete;
-    CudaRuntime& operator=( const CudaRuntime& ) = delete;
-    CudaRuntime& operator=( CudaRuntime&& ) = delete;
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;

     bool m_debug;

     // Set up CUDA application
@@ -62,8 +63,8 @@ namespace mg5amcGpu
     */
     // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
     // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
-    if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl;
-    checkCuda( cudaSetDevice( 0 ) ); // SLOW!
+    if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
+    checkGpu( gpuSetDevice( 0 ) ); // SLOW!
} // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
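The MatrixElementKernels.cc hunks above all follow the same launch-then-check discipline, which is worth spelling out once: GPU kernel launches are asynchronous, so configuration errors and execution errors surface at two different points. A minimal sketch of the pattern (the kernel is hypothetical):

__global__ void dummyKernel( double* out ) { out[blockDim.x * blockIdx.x + threadIdx.x] = 1.; }

void launchAndCheck( double* devOut, int gpublocks, int gputhreads )
{
  gpuLaunchKernel( dummyKernel, gpublocks, gputhreads, devOut );
  checkGpu( gpuPeekAtLastError() );   // catches launch errors (e.g. an invalid grid configuration)
  checkGpu( gpuDeviceSynchronize() ); // catches errors raised while the kernel was executing
}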
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
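The KernelAccessHelper hunk above keeps the usual one-thread-one-event convention, now under the portable guard: in a GPU build each thread derives its event index from the grid coordinates, while in a C++ build the event loop is explicit. A sketch of the device-side convention (double stands in for the project's fptype typedef):

__global__ void perEventKernel( double* buffer )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
  buffer[ievt] = 0.; // each thread touches only its own event's slot
}
// On the host side, the launch is sized so that gpublocks * gputhreads == nevt.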
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
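The neppM comment above refers to the AOSOA (array-of-struct-of-arrays) momenta layout; the index arithmetic below is a sketch consistent with the AOSOA[npagM][npar][np4][neppM] convention used by these MemoryAccess classes (the helper name is ours, not the patch's):

// Locate momentum component ip4 of particle ipar in event ievt.
inline size_t momentaIndex( size_t ievt, size_t ipar, size_t ip4,
                            size_t npar, size_t np4, size_t neppM )
{
  const size_t ipagM = ievt / neppM; // memory page: a block of neppM events
  const size_t ieppM = ievt % neppM; // position of the event within its page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}

Keeping neppM events contiguous in the fastest-moving index is what makes GPU accesses coalesced and CPU accesses SIMD-friendly.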
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class 
HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
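These typedef blocks repeat one pattern per physics quantity: a plain host buffer for CPU-only builds, and a pinned-host/device pair for GPU builds (pinned memory makes the gpuMemcpy transfers DMA-friendly). A sketch of the intended pairing in a GPU build, using the momenta typedefs above together with the copy helpers defined later in this file:

PinnedHostBufferMomenta hstMomenta( nevt ); // page-locked host memory (gpuMallocHost)
DeviceBufferMomenta devMomenta( nevt );     // device memory (gpuMalloc)
// ... fill hstMomenta on the host ...
copyDeviceFromHost( devMomenta, hstMomenta ); // gpuMemcpy( ..., gpuMemcpyHostToDevice )
// ... launch kernels on devMomenta ...
copyHostFromDevice( hstMomenta, devMomenta ); // gpuMemcpy( ..., gpuMemcpyDeviceToHost )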
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 87bcecccd9..13429436af 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MZ, (fptype)Parameters_sm::mdl_WZ }; __device__ const fptype cIPC[6] = { (fptype)Parameters_sm::GC_3.real(), (fptype)Parameters_sm::GC_3.imag(), (fptype)Parameters_sm::GC_50.real(), (fptype)Parameters_sm::GC_50.imag(), (fptype)Parameters_sm::GC_59.real(), (fptype)Parameters_sm::GC_59.imag() }; #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype cIPC[6]; #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: 
ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -284,7 +285,7 @@ namespace mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -341,7 +342,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -400,7 +401,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -447,8 +448,8 @@ namespace mg5amcCpu { 1, -1, 1, 1 }, { 1, -1, -1, -1 }, { 1, -1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -488,9 +489,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; const cxtype tIPC[3] = { cxmake( m_pars->GC_3 ), cxmake( m_pars->GC_50 ), cxmake( m_pars->GC_59 ) }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + gpuMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ); #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); memcpy( cIPC, tIPC, 3 * sizeof( cxtype ) ); @@ -527,7 +528,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -592,12 +593,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -618,7 +619,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -744,9 +745,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -770,7 +771,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -790,7 +791,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -804,9 +805,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -834,7 +838,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1044,7 +1048,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL 
if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h index 77b610753c..0b29ffb3ff 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc 
b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
  bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
      else if( arg == "--curdev" )
      {
 #ifndef __CUDACC__
-        throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+        throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
 #elif defined MGONGPU_HAS_NO_CURAND
        throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
 #else
@@ -198,7 +201,7 @@
      }
      else if( arg == "--rmbdev" )
      {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
        rmbsmp = RamboSamplingMode::RamboDevice;
 #else
        throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@
        return usage( argv[0] );
      }
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
  ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
  // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
  // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
  // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@
  // === STEP 0 - INITIALISE
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
-  // --- 00. Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
  timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime gpuRuntime( debug );
 #endif

  // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
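In fbridgecreate_ and fbridgedelete_ above, the Bridge lifetime is now bracketed by GpuRuntime::setUp and GpuRuntime::tearDown rather than by the CUDA-specific CudaRuntime calls. The GpuRuntime class itself is not shown in this patch; a minimal sketch of what such a vendor-neutral helper might look like (an assumption for illustration, reusing the checkGpu/gpuDeviceReset aliases that do appear in the runTest.cc hunk below):

  // Illustrative sketch only: GpuRuntime as a thin vendor-neutral wrapper,
  // assuming gpuFree/gpuDeviceReset/checkGpu are aliased to the cuda* or hip*
  // equivalents in a shared header.
  struct GpuRuntime final
  {
    static void setUp() { checkGpu( gpuFree( 0 ) ); }           // force early GPU context initialisation
    static void tearDown() { checkGpu( gpuDeviceReset() ); }    // e.g. for cuda-memcheck --leak-check full
  };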
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h index 9fa30cfd7f..e878fcd28e 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc index 0b4be4d5ed..cffc5d3bff 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. 
Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h index 64d0b8e761..2a6d960581 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -235,7 +235,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -259,7 +259,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the correct -fPIC and -c build flags (plus -x cu for nvcc) depending on the GPU compiler +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h index b247654dcf..da4ba36ad8 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, use curand, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
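The mgOnGpuConfig.h hunks above are the heart of this patch: MGONGPUCPP_GPUIMPL is defined whenever either GPU compiler is active, and every other file now branches on that single macro instead of on __CUDACC__. Condensed into one snippet (restating the definitions from this header, not new behaviour):

  // Condensed restatement of the dispatch pattern introduced by this patch.
  #ifdef __CUDACC__
  #define MGONGPUCPP_GPUIMPL cuda // nvcc: the GPU implementation is CUDA
  #elif defined __HIPCC__
  #define MGONGPUCPP_GPUIMPL hip // hipcc: the GPU implementation is HIP
  #else
  #undef MGONGPUCPP_GPUIMPL // plain C++: no GPU implementation
  #endif

  #ifdef MGONGPUCPP_GPUIMPL
  namespace mg5amcGpu { /* CUDA or HIP kernels and types */ }
  #else
  namespace mg5amcCpu { /* C++/SIMD kernels and types */ }
  #define __global__ // empty declaration specifiers so GPU code parses as plain C++
  #define __host__
  #define __device__
  #endif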
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } 
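A note on the recurring hunk pattern above (an illustrative sketch, not part of the patch): every header in this series swaps its namespace guard from __CUDACC__, which only nvcc defines, to the new MGONGPUCPP_GPUIMPL flag, so that a single source tree defines its types once per build mode, in mg5amcGpu for any GPU backend and in mg5amcCpu otherwise:

    #ifdef MGONGPUCPP_GPUIMPL
    namespace mg5amcGpu // CUDA or HIP build
    #else
    namespace mg5amcCpu // C++ build (including SIMD)
    #endif
    {
      // types such as cxtype and fptype_sv are defined here, once per build mode
    }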
-#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/ee_mumu.sa/src/rambo.h b/epochX/cudacpp/ee_mumu.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/rambo.h +++ b/epochX/cudacpp/ee_mumu.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 360771ac98..75c84e12fb 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005492210388183594  +DEBUG: model prefixing takes 0.005261659622192383  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,11 +191,11 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.101 s +Wrote files for 10 helas calls in 0.100 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.145 s +ALOHA: aloha creates 2 routines in 0.144 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.713s -user 0m1.482s -sys 0m0.227s +real 0m1.690s +user 0m1.458s +sys 0m0.220s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 0db09949ad..5542e5323b 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005459308624267578  +DEBUG: model prefixing takes 0.005713224411010742  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -180,7 +180,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.146 s +ALOHA: aloha creates 2 routines in 0.145 s VVV1 FFV1 FFV1 @@ -195,7 +195,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
quit -real 0m0.545s -user 0m0.487s -sys 0m0.049s +real 0m0.623s +user 0m0.466s +sys 0m0.061s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.sa/COPYRIGHT b/epochX/cudacpp/gg_tt.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_tt.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( 
m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
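The Bridge hunks above show the mechanical pattern of this patch: checkCuda( cudaMemcpy( ... ) ) becomes gpuMemcpy( ... ) and triple-chevron launches become gpuLaunchKernel( ... ). As an illustrative sketch only (the buffer names hstMomenta, devMomentaF, devMomentaC and the sizes nbytes, gpublocks, gputhreads, nevt are hypothetical; the gpu* macros are the patch's own, defined in GpuAbstraction.h below):

    // One spelling per call site, compiled for either CUDA or HIP.
    fptype* devMomentaF = nullptr;
    gpuMalloc( (void**)&devMomentaF, nbytes );                           // cudaMalloc or hipMalloc
    gpuMemcpy( devMomentaF, hstMomenta, nbytes, gpuMemcpyHostToDevice ); // cudaMemcpy or hipMemcpy
    gpuLaunchKernel( dev_transposeMomentaF2C, gpublocks, gputhreads,     // kernel<<<gpublocks, gputhreads>>>( ... )
                     devMomentaF, devMomentaC, nevt );
    gpuFree( devMomentaF );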
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. 
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h
similarity index 62%
rename from epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h
rename to epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h
index 64ce52f4b3..93579ef08b 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h
@@ -1,49 +1,50 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
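Concretely, the two launch macros defined in GpuAbstraction.h above expand to the same triple-chevron syntax on both backends. With a trivial hypothetical kernel kadd (illustration only, not part of the patch):

    __global__ void kadd( double* d, int n ) // hypothetical kernel
    {
      const int i = blockDim.x * blockIdx.x + threadIdx.x;
      if( i < n ) d[i] += 1.;
    }
    // gpuLaunchKernel( kadd, 4, 256, dbuf, n )                expands to  kadd<<<4, 256>>>( dbuf, n )
    // gpuLaunchKernelSharedMem( kadd, 4, 256, 1024, dbuf, n ) expands to  kadd<<<4, 256, 1024>>>( dbuf, n )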
-#ifndef MG5AMC_CUDARUNTIME_H
-#define MG5AMC_CUDARUNTIME_H 1
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
 
 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
 // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
-#include
+#include "GpuAbstraction.h"
+
 #include
 
 //--------------------------------------------------------------------------
 
 // See https://stackoverflow.com/a/14038590
-#ifdef __CUDACC__ /* clang-format off */
-#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
-inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
 {
-  if( code != cudaSuccess )
+  if( code != gpuSuccess )
   {
-    printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
-    if( abort ) assert( code == cudaSuccess );
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
   }
 }
 #endif /* clang-format on */
 
 //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
-  // Instantiate a CudaRuntime at the beginning of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
   // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
-  struct CudaRuntime final
+  struct GpuRuntime final
   {
-    CudaRuntime( const bool debug = true )
+    GpuRuntime( const bool debug = true )
       : m_debug( debug ) { setUp( m_debug ); }
-    ~CudaRuntime() { tearDown( m_debug ); }
-    CudaRuntime( const CudaRuntime& ) = delete;
-    CudaRuntime( CudaRuntime&& ) = delete;
-    CudaRuntime& operator=( const CudaRuntime& ) = delete;
-    CudaRuntime& operator=( CudaRuntime&& ) = delete;
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
     bool m_debug;
 
     // Set up CUDA application
@@ -62,8 +63,8 @@ namespace mg5amcGpu
       */
      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
-      if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl;
-      checkCuda( cudaSetDevice( 0 ) ); // SLOW!
+      if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
} // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
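The sigmaKin hunks above all follow one launch-and-check idiom; reduced to a sketch (mykernel and args are hypothetical, the macros are the patch's own): gpuPeekAtLastError surfaces launch-configuration errors immediately, while gpuDeviceSynchronize blocks until the kernel finishes and surfaces asynchronous execution errors.

    gpuLaunchKernel( mykernel, m_gpublocks, m_gputhreads, args );
    checkGpu( gpuPeekAtLastError() );   // launch errors (bad grid configuration, ...)
    checkGpu( gpuDeviceSynchronize() ); // asynchronous execution errors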
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 
@@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
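The kernelAccessRecord hunk above keeps the one-thread-per-event convention: on the device, accessors do not take an event index, they derive it from the grid coordinates. A sketch of that mapping (eventIndex is hypothetical, not the plugin's API):

    __device__ inline int eventIndex()
    {
      return blockDim.x * blockIdx.x + threadIdx.x; // ievt in [0, gpublocks * gputhreads)
    }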
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
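For reference, the neppM comment above describes an AOSOA layout, momenta[npagM][npar][np4][neppM] with ievt = ipagM * neppM + ieppM; coalesced access follows because consecutive threads (events) read consecutive fptype's within one page. A hypothetical decoding helper, consistent with that layout (sketch only, readMomentum is not in the patch):

    inline __host__ __device__ fptype
    readMomentum( const fptype* buffer, int ievt, int ipar, int ip4, int npar, int np4, int neppM )
    {
      const int ipagM = ievt / neppM; // index of the event page
      const int ieppM = ievt % neppM; // index of the event within the page
      return buffer[( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM];
    }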
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : 
public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
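The buffer hunks above all apply one RAII pattern: allocate in the constructor, release in the destructor, delete copies, with the backend hidden behind the gpu* macros. Stripped to a sketch (DeviceArray is hypothetical; DeviceBufferBase is the patch's real class):

    template<typename T>
    struct DeviceArray // minimal RAII device buffer in the style of DeviceBufferBase
    {
      explicit DeviceArray( size_t n ) : m_size( n ) { gpuMalloc( (void**)&m_data, n * sizeof( T ) ); }
      ~DeviceArray() { gpuFree( m_data ); }
      DeviceArray( const DeviceArray& ) = delete;
      DeviceArray& operator=( const DeviceArray& ) = delete;
      T* m_data = nullptr;
      size_t m_size;
    };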
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index d390883453..e7dbb05570 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity 
= 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -299,7 +300,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -356,7 +357,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -415,7 +416,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -462,8 +463,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -503,9 +504,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -541,7 +542,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -606,12 +607,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -632,7 +633,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -758,9 +759,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -784,7 +785,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -804,7 +805,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -818,9 +819,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -848,7 +852,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1058,7 +1062,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index 3ebd92c038..4a88a07226 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
 bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
     else if( arg == "--curdev" )
     {
 #ifndef __CUDACC__
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
 #elif defined MGONGPU_HAS_NO_CURAND
       throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
 #else
@@ -198,7 +201,7 @@ main( int argc, char** argv )
     }
     else if( arg == "--rmbdev" )
     {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       rmbsmp = RamboSamplingMode::RamboDevice;
 #else
       throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@ main( int argc, char** argv )
     return usage( argv[0] );
   }

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
   ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
   // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
   // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@ main( int argc, char** argv )

   // === STEP 0 - INITIALISE

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL

-  // --- 00. Initialise cuda
-  // Instantiate a CudaRuntime at the beginning of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime GpuRuntime( debug );
 #endif

   // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
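For orientation before the #ifdef chain that follows: the complex type behind these workflow tags is chosen per backend from the MGONGPU_CUCXTYPE_* macros. The real selection lives in the mgOnGpu headers, which are not part of this diff; a simplified sketch (double precision only, with a stand-in for the plugin's home-grown 'cxsmpl' class):

  // Sketch only, not the actual mgOnGpu header: one 'cxtype' per backend.
  #if defined __CUDACC__ && defined MGONGPU_CUCXTYPE_THRUST
  #include <thrust/complex.h>
  using cxtype = thrust::complex<double>; // CUDA: thrust::complex
  #elif defined __CUDACC__ && defined MGONGPU_CUCXTYPE_CUCOMPLEX
  #include <cuComplex.h>
  using cxtype = cuDoubleComplex; // CUDA: cuComplex
  #elif defined __HIPCC__
  struct cxsmpl { double r, i; }; // illustrative stand-in: the plugin defines its own simple complex class
  using cxtype = cxsmpl; // HIP: neither thrust nor cuComplex is available
  #else
  #include <complex>
  using cxtype = std::complex<double>; // plain C++
  #endif

This is also why the HIP branch below can only ever report the CXS tag: thrust and cuComplex are CUDA-specific, so the simple in-house class is the single supported option there.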
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
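The include-order note in RandomNumberKernels.h above works because, in CPU-only builds, the GPU keywords must be defined away before any shared header uses them. A sketch of the emulation that mgOnGpuConfig.h is assumed to provide (the macro name is the one used throughout this patch):

  // Assumed sketch of the C++ emulation referred to above: with no GPU
  // compiler active, the CUDA keywords expand to nothing, so __global__
  // kernels degrade to ordinary host functions.
  #ifndef MGONGPUCPP_GPUIMPL
  #define __global__
  #define __device__
  #define __host__
  #endif

Any header that spells __global__ before this point would fail to parse in the C++ build, hence the "must come AFTER mgOnGpuConfig.h" warning.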
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

 # Dependency on src directory
 MG5AMC_COMMONLIB = mg5amc_common
@@ -121,24 +121,46 @@ endif

 #-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler
+#=== Configure the GPU compiler (CUDA or HIP)

-# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505)
-# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME.
+# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc.
+# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths.
+# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505)
+# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below
 ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
-  $(warning CUDA builds are not supported for multi-word CXX "$(CXX)")
+  $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)")
   override CUDA_HOME=disabled
+  override HIP_HOME=disabled
 endif

-# If CUDA_HOME is not set, try to set it from the location of nvcc
+# If CUDA_HOME is not set, try to set it from the path to nvcc
 ifndef CUDA_HOME
   CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
   $(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
 endif

-# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists
+# If HIP_HOME is not set, try to set it from the path to hipcc
+ifndef HIP_HOME
+  HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH))
+  $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+endif
+
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP),
+# builds are performed for HIP only if CUDA is not found in the path.
+# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME.
+# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+#--- Option 1: CUDA exists -> use CUDA
+
+# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists
 ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
-  NVCC = $(CUDA_HOME)/bin/nvcc
+
+  GPUCC = $(CUDA_HOME)/bin/nvcc
   USE_NVTX ?=-DUSE_NVTX
   # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
   # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
@@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
   CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
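On the open question in the comment above: ROCm's roctx markers (shipped with the roctracer package) are the closest HIP analogue to NVTX. A hedged sketch of a backend-neutral range helper; USE_ROCTX is a hypothetical flag introduced here for illustration, not one defined by this makefile:

  // Sketch only: nvtxRangePushA/nvtxRangePop come from CUDA's nvToolsExt,
  // roctxRangePushA/roctxRangePop from ROCm's roctracer; USE_ROCTX is a
  // hypothetical switch, assumed for this example.
  #if defined __CUDACC__ && defined USE_NVTX
  #include "nvToolsExt.h"
  inline void profileRangePush( const char* name ) { nvtxRangePushA( name ); }
  inline void profileRangePop() { nvtxRangePop(); }
  #elif defined __HIPCC__ && defined USE_ROCTX
  #include <roctracer/roctx.h>
  inline void profileRangePush( const char* name ) { roctxRangePushA( name ); }
  inline void profileRangePop() { roctxRangePop(); }
  #else
  inline void profileRangePush( const char* ) {}
  inline void profileRangePop() {}
  #endif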
+  HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+  HIPINC = -I$(HIP_HOME)/include/
+  # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP
+  # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+  GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+  ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+  GPUFLAGS += -std=c++17
+  ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?)
+  CUBUILDRULEFLAGS = -fPIC -c
+  CCBUILDRULEFLAGS = -fPIC -c
+
+else ifneq ($(origin REQUIRE_HIP),undefined)
+
+  # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443)
+  $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
+
+#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP
+
 else
-  # No cuda. Switch cuda compilation off and go to common random numbers in C++
+
+  # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++
   $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
+  $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
-  override NVCC=
+  override GPUCC=
   override USE_NVTX=
   override CUINC=
   override CURANDLIBFLAGS=
-endif
-export NVCC
-export CUFLAGS
-
-# Set the host C++ compiler for nvcc via "-ccbin "
-# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
-# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-CUFLAGS += -allow-unsupported-compiler
 endif

+# Export GPUCC (so that it can also be used in cudacpp_src.mk?)
+export GPUCC
+export GPUFLAGS

 #-------------------------------------------------------------------------------

-#=== Configure ccache for C++ and CUDA builds
+#=== Configure ccache for C++ and CUDA/HIP builds

 # Enable ccache if USECCACHE=1
 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -201,15 +260,15 @@ endif
 #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
 #  override AR:=ccache $(AR)
 #endif
-ifneq ($(NVCC),)
-  ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
-    override NVCC:=ccache $(NVCC)
+ifneq ($(GPUCC),)
+  ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
+    override GPUCC:=ccache $(GPUCC)
   endif
 endif

 #-------------------------------------------------------------------------------

-#=== Configure PowerPC-specific compiler flags for C++ and CUDA
+#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP

 # PowerPC-specific CXX compiler flags (being reviewed)
 ifeq ($(UNAME_P),ppc64le)
@@ -225,9 +284,9 @@ else
   ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto)
 endif

-# PowerPC-specific CUDA compiler flags (to be reviewed!)
+# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
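The generic %.o and %_cu.o build rules further up compile the same .cc sources twice: once with $(CXX) as plain C++ and once with $(GPUCC) as GPU code (nvcc needs the explicit '-x cu' carried in CCBUILDRULEFLAGS, while the HIP rule passes no such flag). A standalone sketch of the source-level pattern that makes this double compilation work:

  // Standalone sketch: compile once with "g++ -c" and once with
  // "nvcc -x cu -c" (or hipcc -c) to obtain the two namespaced objects.
  #include <cstdio>
  #if !defined __CUDACC__ && !defined __HIPCC__
  #define __global__ // plain C++ build: kernels become ordinary functions
  #endif
  #if defined __CUDACC__ || defined __HIPCC__
  namespace mg5amcGpu
  #else
  namespace mg5amcCpu
  #endif
  {
    __global__ void oneKernel() {} // one source file, two object files
    void report()
    {
  #if defined __CUDACC__ || defined __HIPCC__
      std::printf( "built as GPU code\n" );
  #else
      std::printf( "built as C++ code\n" );
  #endif
    }
  }

In the real sources the switch is MGONGPUCPP_GPUIMPL rather than the raw compiler macros, as the CPPProcess.cc hunks above suggest; the effect is the same, since that macro is presumably only set when a GPU compiler is active.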
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
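For orientation, the fbridge entry points above keep their Fortran-facing signatures and only swap the CUDA-specific runtime calls for the generic GpuRuntime ones. The sketch below is illustrative only and not part of the patch: it drives the bridge from C++ using the fbridgecreate_ signature shown above (the fbridgedelete_ signature and the event counts are assumptions), with CppObjectInFortran forward-declared here although its real declaration lives in the plugin sources.

// Illustrative sketch: lifecycle of the Fortran bridge, per fbridge.cc above.
class CppObjectInFortran; // assumption: opaque handle type declared in the plugin sources
extern "C"
{
  void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F );
  void fbridgedelete_( CppObjectInFortran** ppbridge ); // assumed counterpart signature
}
int main()
{
  CppObjectInFortran* pbridge = nullptr;
  const int nevt = 16384, npar = 4, np4 = 4; // hypothetical values for a 2->2 process
  fbridgecreate_( &pbridge, &nevt, &npar, &np4 ); // on GPU builds this now calls GpuRuntime::setUp()
  // ... fill momenta and couplings, call fbridgesequence_, read back the matrix elements ...
  fbridgedelete_( &pbridge ); // on GPU builds this now calls GpuRuntime::tearDown()
  return 0;
}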
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h index 55f43bb43a..add8fce575 100644 --- a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc index a9bc93ff98..c5dd6e7e4c 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h index 932f123fea..5f2f4391b9 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add compiler-specific GPUFLAGS when compiling for CUDA (nvcc) or HIP (hipcc) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h index b247654dcf..da4ba36ad8 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
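Note that the simplified %_cu.o recipe above, together with the separate %.o rule, still compiles each .cc source twice: once as plain C++ with $(CXX) and once as GPU code with $(GPUCC) (nvcc with -x cu, or hipcc). A minimal sketch of why the two object files can coexist in one library, assuming the MGONGPUCPP_GPUIMPL macro introduced in the mgOnGpuConfig.h changes that follow; the helper function is hypothetical:

// Illustrative sketch: one translation unit, two distinct symbols per build mode.
#include "mgOnGpuConfig.h"
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  int backendTag() { return 0; } // hypothetical: mg5amcGpu::backendTag in foo_cu.o, mg5amcCpu::backendTag in foo.o
}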
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
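The empty-specifier block above is what lets shared helper code carry GPU qualifiers unconditionally. A minimal sketch, assuming mgOnGpuConfig.h is on the include path (the helper itself is hypothetical, not from the patch):

// In C++-only builds __host__ and __device__ expand to nothing (see the #define
// block above), so this one definition compiles under nvcc, hipcc and plain C++.
#include "mgOnGpuConfig.h"
inline __host__ __device__ double fpsquare( const double x ) { return x * x; }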
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef 
__CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt.sa/src/rambo.h b/epochX/cudacpp/gg_tt.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt.sa/src/rambo.h +++ b/epochX/cudacpp/gg_tt.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index b3d319e039..f38b6ec6e6 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005671977996826172  +DEBUG: model prefixing takes 0.005505561828613281  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -185,7 +185,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -202,7 +202,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 2 subprocesses (19 diagrams) in 0.044 s -Wrote files for 46 helas calls in 0.247 s +Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s +Wrote files for 46 helas calls in 0.243 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.330 s +ALOHA: aloha creates 5 routines in 0.324 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.316 s +ALOHA: aloha creates 10 routines in 0.308 s VVV1 VVV1 FFV1 @@ -283,10 +283,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.334s -user 0m2.083s -sys 0m0.238s -Code generation completed in 2 seconds +real 0m2.484s +user 0m2.030s +sys 0m0.256s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -312,7 +312,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -342,7 +342,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
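The constructor checks above tie the GPU grid directly to the requested number of events. A small worked sketch of that invariant with illustrative numbers (not from the patch):

// The Bridge requires nevt to be a multiple of s_gputhreadsmin, then keeps
// nevt == gpublocks * gputhreads: e.g. 16384 events at the default of
// 256 threads per block give 64 blocks.
const unsigned int nevt = 16384;         // illustrative event count
const int gputhreads = 256;              // default set in the Bridge constructor
const int gpublocks = nevt / gputhreads; // 64, so nevt == gpublocks * gputhreads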
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // hipHostMalloc preferred (hipMallocHost is deprecated) +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h similarity index 62% rename from epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h rename to epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h index 64ce52f4b3..93579ef08b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h @@ -1,49 +1,50 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
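With the GpuAbstraction.h macro table above, device-side code can be written once and compiled for either vendor. A minimal round-trip sketch, assuming compilation with nvcc or hipcc; the kernel, buffer size and example() wrapper are hypothetical, and checkGpu comes from GpuRuntime.h, whose diff continues just below:

// Illustrative sketch: a host-to-device round trip using only the gpu* macros above.
#include "mgOnGpuConfig.h" // defines MGONGPUCPP_GPUIMPL, which gates checkGpu in GpuRuntime.h
#include "GpuRuntime.h"    // pulls in GpuAbstraction.h and defines checkGpu/assertGpu
__global__ void scaleByTwo( double* data ) { data[threadIdx.x] *= 2.; }
void example()
{
  constexpr int n = 256;
  double host[n] = {};
  double* dev = nullptr;
  gpuMalloc( (void**)&dev, n * sizeof( double ) );                     // cudaMalloc or hipMalloc, checkGpu-wrapped
  gpuMemcpy( dev, host, n * sizeof( double ), gpuMemcpyHostToDevice ); // checked host-to-device copy
  gpuLaunchKernel( scaleByTwo, 1, n, dev );                            // expands to scaleByTwo<<<1, n>>>( dev )
  gpuMemcpy( host, dev, n * sizeof( double ), gpuMemcpyDeviceToHost ); // checked device-to-host copy
  gpuFree( dev );                                                      // checked free
}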
-#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api -#include +#include "GpuAbstraction.h" + +#include //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) { - if( code != cudaSuccess ) + if( code != gpuSuccess ) { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); } } #endif /* clang-format on */ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final + struct GpuRuntime final { - CudaRuntime( const bool debug = true ) + GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; bool m_debug; // Set up CUDA application @@ -62,8 +63,8 @@ namespace mg5amcGpu */ // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! + if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW!
} // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; #endif @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1.
Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
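The transformation applied to every launch site in MatrixElementKernels.cc above (and later in check_sa.cc) is purely mechanical: the kernel name and grid dimensions move inside the macro argument list, and the kernel arguments follow. Schematically, with the real names from this file (the trailing arguments are elided here):

  // CUDA-only spelling (removed):
  //   sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), ... );
  // Backend-neutral spelling (added):
  //   gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), ... );

On both backends the macro currently expands back to the triple-chevron launch, so the change is syntactic today; the value is that every launch now goes through a single macro that a future backend could in principle redefine (for example in terms of hipLaunchKernelGGL). Note also the error-checking idiom that survives the rename: a launch itself returns no status, so each one is followed by checkGpu( gpuPeekAtLastError() ), and checkGpu( gpuDeviceSynchronize() ) is added only where results are needed immediately.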
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu include types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu include types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu include types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu include types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu include types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu include types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
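The KernelAccessHelper hunk above is the heart of the single-source model: in a GPU build each thread derives its own event index from the grid coordinates, while in a C++ build the caller passes an explicit event index and loops. A schematic sketch of the convention (process, buffer and nevt are illustrative placeholders; neppV is the real SIMD page size used throughout the plugin):

  #ifdef MGONGPUCPP_GPUIMPL
    // one GPU thread per event: the index comes from the grid
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
    process( buffer, ievt );
  #else
    // CPU build: an explicit loop over events, one SIMD page of neppV events at a time
    for( int ievt0 = 0; ievt0 < nevt; ievt0 += neppV )
      process( buffer, ievt0 );
  #endif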
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu include types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu include types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
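The one-line hunk repeated across all these MemoryAccess*.h headers encodes the build model of the whole series: __CUDACC__ is defined only when a file is compiled as CUDA, so it could never select the GPU namespace in a HIP build, whereas MGONGPUCPP_GPUIMPL is set whenever any GPU implementation (CUDA or HIP) is being compiled. The idiom it guards, shown schematically rather than quoted from any one header:

  // One source file, two type universes (see #318 and #725):
  #ifdef MGONGPUCPP_GPUIMPL
  namespace mg5amcGpu // GPU build: device-flavoured types
  #else
  namespace mg5amcCpu // CPU build: scalar or SIMD types
  #endif
  {
    // identical code; fptype_sv and friends resolve differently per build
  }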
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu include types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase { public: PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase { public: DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events
template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for couplings constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT!
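MemoryBuffers.h now pairs each event-data buffer with three flavours: a plain host buffer in CPU builds, and a pinned (page-locked) host buffer plus a device buffer in GPU builds. Pinning matters because of the PR #45 comment further down: an unpinned host array forces gpuMemcpy through an intermediate staging copy. The allocation pattern that check_sa.cc builds on, sketched with the real type names from this header (the fill step is illustrative):

  #ifndef MGONGPUCPP_GPUIMPL
    HostBufferMomenta hstMomenta( nevt );       // plain C++ host allocation
  #else
    PinnedHostBufferMomenta hstMomenta( nevt ); // gpuMallocHost: page-locked host memory
    DeviceBufferMomenta devMomenta( nevt );     // gpuMalloc: device global memory
  #endif
    // ... fill hstMomenta on the host, then in GPU builds:
  #ifdef MGONGPUCPP_GPUIMPL
    copyDeviceFromHost( devMomenta, hstMomenta ); // gpuMemcpy H2D, after a size cross-check (helper defined just below)
  #endif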
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 18052b6676..f20c229897 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity <
nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -302,7 +303,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -359,7 +360,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -418,7 +419,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -465,8 +466,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -506,9 +507,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -544,7 +545,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -609,12 +610,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -635,7 +636,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -761,9 +762,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -787,7 +788,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -807,7 +808,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -821,9 +822,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -851,7 +855,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1061,7 +1065,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 3ebd92c038..4a88a07226 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ 
b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index bfab81142d..3c7715b235 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -505,7 +506,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -562,7 +563,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -621,7 +622,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -684,8 +685,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -726,9 +727,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -765,7 +766,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
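// Note: the gpuMemcpyToSymbol calls introduced above are part of the new
// GpuAbstraction.h, whose body is not shown in this patch. A plausible minimal
// sketch, assuming a checkGpu error-checking wrapper like the one visible in
// runTest.cc further below (HIP_SYMBOL is the standard HIP macro for passing
// symbol arguments):
#if defined __CUDACC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( cudaMemcpyToSymbol( symbol, src, bytes ) )
#elif defined __HIPCC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes ) )
#endif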
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -830,12 +831,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -856,7 +857,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -982,9 +983,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1008,7 +1009,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1028,7 +1029,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1042,9 +1043,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1072,7 +1076,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1282,7 +1286,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h index 3901ddcb20..d4b3c0445c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin <host-compiler>" below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin <host-compiler>" below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <host-compiler>" from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
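# Note: a worked example of the detection above, assuming nvcc is installed
# under /usr/local/cuda (an illustrative path, not part of the patch):
#   $(shell which nvcc 2>/dev/null)  ->  /usr/local/cuda/bin/nvcc
#   CUDA_HOME                        ->  /usr/local/cuda/ (the patsubst keeps the trailing slash)
# so the wildcard test finds $(CUDA_HOME)/bin/nvcc and selects Option 1. As the
# FIXME explains, pointing CUDA_HOME at an invalid path makes that test fail:
#   make CUDA_HOME=disabled   # skip the CUDA build even if nvcc is in the PATH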
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
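# Note: as the FIXME earlier explains, Option 2 (hipcc) is only reached when
# Option 1 (nvcc) fails. On a machine with both toolkits installed, the HIP
# build can therefore be forced along the lines of
#   CUDA_HOME=disabled HIP_HOME=/opt/rocm make   # /opt/rocm is an assumed install path
# which falls through to "GPUCC = $(HIP_HOME)/bin/hipcc" above.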
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
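# Note: a worked example of the twin build rules above for one source file,
# fbridge.cc, which the library targets below compile both ways:
#   $(CXX)   $(CPPFLAGS) $(CXXFLAGS) -fPIC -c fbridge.cc -o $(BUILDDIR)/fbridge.o                # C++ object
#   $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) fbridge.cc -o $(BUILDDIR)/fbridge_cu.o  # GPU object
# where CCBUILDRULEFLAGS carries "-x cu" for nvcc (compile the .cc file as CUDA)
# but not for hipcc, which accepts .cc sources directly.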
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
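# Note (illustrative): the gcheck.exe linked above is the executable quoted in
# the --maxrregcount throughput comments earlier, typically run as e.g.
#   ./build.none_d_inl0_hrd0/gcheck.exe -p 16384 32 12
# i.e. "-p #gpuBlocksPerGrid #gpuThreadsPerBlock #iterations" (the build
# directory tag depends on the AVX/FPTYPE/HELINL/HRDCOD choices above).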
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
 */
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using namespace mg5amcGpu;
 #else
 using namespace mg5amcCpu;
@@ -46,8 +46,8 @@ extern "C"
   */
  void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
  {
-#ifdef __CUDACC__
-    CudaRuntime::setUp();
+#ifdef MGONGPUCPP_GPUIMPL
+    GpuRuntime::setUp();
 #endif
     // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor)
     // FIXME: disable OMP in Bridge when called from Fortran
@@ -65,8 +65,8 @@ extern "C"
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" );
     delete pbridge;
-#ifdef __CUDACC__
-    CudaRuntime::tearDown();
+#ifdef MGONGPUCPP_GPUIMPL
+    GpuRuntime::tearDown();
 #endif
   }
 
@@ -96,7 +96,7 @@ extern "C"
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
     pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol );
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc
index 2fb445372d..3743934f41 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #include "mgOnGpuConfig.h"
 
@@ -13,7 +13,7 @@
 
 //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -40,7 +40,7 @@ namespace mg5amcCpu
   private:
     const int m_nevt; // The number of events in each iteration
     int m_iiter;      // The iteration counter (for random number seeding)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
     HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
     HostBufferMomenta m_hstMomenta;      // Memory buffers for momenta
     HostBufferWeights m_hstWeights;      // Memory buffers for sampling weights
@@ -105,7 +105,7 @@ namespace mg5amcCpu
 
 extern "C"
 {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   using namespace mg5amcGpu;
 #else
   using namespace mg5amcCpu;
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc
index d4a760a71b..de327f2321 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
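
Since the fbridge entry points above are plain extern "C" symbols, the create/delete lifecycle can be exercised from a C++ driver as well as from Fortran. A minimal sketch (illustration only: CppObjectInFortran is treated as an opaque handle here, the array arguments of fbridgesequence_ are omitted, and linking against the generated library is assumed):

```cpp
#include <cstdio>

struct CppObjectInFortran; // opaque handle type shared across the Fortran/C++ boundary

extern "C"
{
  // Signatures as shown in the fbridge.cc diff above
  void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F );
  void fbridgedelete_( CppObjectInFortran** ppbridge );
}

int main()
{
  const int nevt = 16384, npar = 4, np4 = 4; // illustrative sizes: 2->2 process, 4-momenta
  CppObjectInFortran* pbridge = nullptr;
  fbridgecreate_( &pbridge, &nevt, &npar, &np4 ); // also sets up the GPU runtime on CUDA/HIP builds
  // ... fbridgesequence_ would be called here once per batch, passing momenta,
  //     couplings, random numbers and the output matrix-element buffers ...
  fbridgedelete_( &pbridge ); // tears the GPU runtime back down
  std::printf( "bridge lifecycle complete\n" );
  return 0;
}
```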
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
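
The #ifdef/namespace pattern visible in every file touched here compiles one translation unit into either mg5amcGpu (when nvcc or hipcc compiles it) or mg5amcCpu (plain C++), so both backends can coexist in a single process without name clashes. A self-contained sketch of the idiom, with illustrative content rather than the plugin's actual classes:

```cpp
#include <iostream>

// One source file, two namespaces: the GPU compiler lands the code in
// mg5amcGpu, a plain C++ compiler lands it in mg5amcCpu.
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  const char* backend()
  {
#ifdef MGONGPUCPP_GPUIMPL
    return "GPU (CUDA or HIP)";
#else
    return "CPU (C++/SIMD)";
#endif
  }
}

int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  using namespace mg5amcGpu;
#else
  using namespace mg5amcCpu;
#endif
  std::cout << "Compiled for: " << backend() << std::endl;
  return 0;
}
```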
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h index 361b488401..0dd0f3ebba 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. 
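
The FPEhandler in testxxx.cc above turns a floating point exception into a readable report that names the offending event before the test aborts. A rough standalone sketch of the same mechanism, assuming glibc (feenableexcept is a GNU extension) and a deliberately simplified handler:

```cpp
#include <cfenv>   // feenableexcept (GNU extension, enabled by default under g++)
#include <csignal> // signal, SIGFPE
#include <cstdio>
#include <cstdlib>

static int currentIevt = -1; // set by the event loop, read by the handler

static void fpeHandler( int )
{
  // NB: a handler for a trapped FPE must not return; exiting here is the
  // pragmatic choice a test harness can make (the real test prints more context)
  std::fprintf( stderr, "Floating Point Exception at ievt=%d\n", currentIevt );
  std::exit( 1 );
}

int main()
{
  feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW ); // trap hard FPEs
  std::signal( SIGFPE, fpeHandler );
  currentIevt = 0;
  volatile double zero = 0;
  double x = 1 / zero; // raises SIGFPE now that traps are enabled
  (void)x;
  return 0;
}
```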
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
 
 #-------------------------------------------------------------------------------
 
@@ -45,13 +45,13 @@ endif
 
 #-------------------------------------------------------------------------------
 
-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)
 
-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))
 
 #-------------------------------------------------------------------------------
 
-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)
 
 # Enable ccache if USECCACHE=1
 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
 ###$(info OMPFLAGS=$(OMPFLAGS))
 CXXFLAGS += $(OMPFLAGS)
 
+# Set the compiler-specific GPUFLAGS when compiling with nvcc (CUDA) or hipcc (HIP)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
 # Set the build flags appropriate to each AVX choice (example: "make AVX=none")
 # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
 # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
 # Generic target and build rules: objects from CUDA compilation
 $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@
 
 #-------------------------------------------------------------------------------
 
 cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
 endif
 
 # Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
 	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
 else
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
 	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h
index 80032e528b..55d03f1252 100644
--- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
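
The mgOnGpuConfig.h hunks that follow introduce MGONGPUCPP_GPUIMPL as the single switch that all of the #ifdef __CUDACC__ replacements above key off: nvcc defines __CUDACC__, hipcc defines __HIPCC__, and a plain C++ compile leaves the macro undefined. A condensed sketch of the detection logic and a client-side #ifdef (mirroring the hunk below, with an illustrative printout):

```cpp
// Condensed sketch of the backend-detection switch from mgOnGpuConfig.h
#ifdef __CUDACC__
#define MGONGPUCPP_GPUIMPL cuda // nvcc is compiling: CUDA build
#elif defined __HIPCC__
#define MGONGPUCPP_GPUIMPL hip // hipcc is compiling: HIP build
#else
#undef MGONGPUCPP_GPUIMPL // plain C++ build
#endif

#include <cstdio>

int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  std::printf( "GPU build\n" ); // one code path covers both CUDA and HIP
#else
  std::printf( "CPU build\n" );
#endif
  return 0;
}
```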
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
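
Each backend gets its own complex type in the mgOnGpuCxtypes.h diff below (thrust::complex on CUDA, cxsmpl on HIP, std::complex or cxsmpl in C++), and cxsmpl exists precisely because a trivially-copyable, header-only complex works identically in all three compilers. A toy analogue (not the plugin's cxsmpl) showing the shape of such a type:

```cpp
#include <iostream>

// Toy analogue of cxsmpl: a trivially-copyable complex with no library
// dependency, usable unchanged in CUDA, HIP and plain C++ translation units.
template<typename FP>
struct simplecx
{
  FP r, i;
  constexpr simplecx( FP r_, FP i_ = 0 ) : r( r_ ), i( i_ ) {}
  constexpr FP real() const { return r; }
  constexpr FP imag() const { return i; }
};

template<typename FP>
constexpr simplecx<FP> operator*( const simplecx<FP>& a, const simplecx<FP>& b )
{
  return simplecx<FP>( a.r * b.r - a.i * b.i, a.r * b.i + a.i * b.r );
}

int main()
{
  simplecx<double> a( 1, 2 ), b( 3, -1 );
  simplecx<double> c = a * b;
  std::cout << c.real() << " + " << c.imag() << "i" << std::endl; // prints 5 + 5i
  return 0;
}
```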
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu 
return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt01g.mad/src/rambo.h b/epochX/cudacpp/gg_tt01g.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/rambo.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 37ba5c7297..00ae96c5fb 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005791187286376953  +DEBUG: model prefixing takes 0.0055010318756103516  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,14 +191,14 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg Generated helas calls for 1 subprocesses (16 diagrams) in 0.039 s -Wrote files for 36 helas calls in 0.153 s +Wrote files for 36 helas calls in 0.184 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.331 s +ALOHA: aloha creates 5 routines in 0.325 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.315 s +ALOHA: aloha creates 10 routines in 0.310 s VVV1 VVV1 FFV1 @@ -252,9 +252,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.208s -user 0m1.988s -sys 0m0.221s +real 0m2.571s +user 0m1.941s +sys 0m0.238s Code generation completed in 2 seconds ************************************************************ * * @@ -281,7 +281,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -311,7 +311,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) 
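
The dev_transposeMomentaF2C kernel declared in the Bridge.h hunk above re-layouts Fortran-ordered momenta into the C++ AOSOA on the device, one number per thread. A standalone CUDA sketch of the same index arithmetic, with hypothetical names, a hardcoded double type, and a plain launch (compile with nvcc):

```cpp
#include <cstdio>

// One thread per number: read from the Fortran-order input in(ip4,ipar,ievt)
// (column-major), write to the AOSOA output out[ipagM][ipar][ip4][ieppM]
// with ievt = ipagM*neppM + ieppM, matching AOSOA[npagM][npar][np4][neppM].
template<int neppM>
__global__ void transposeMomentaF2C( const double* in, double* out, int nevt, int npar, int np4 )
{
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if( idx >= nevt * npar * np4 ) return;
  const int ievt = idx / ( npar * np4 );
  const int ipar = ( idx / np4 ) % npar;
  const int ip4 = idx % np4;
  const int ipagM = ievt / neppM; // "page" of neppM events
  const int ieppM = ievt % neppM; // event within the page
  in += ip4 + np4 * ( ipar + npar * ievt );                       // Fortran linear index
  out += ieppM + neppM * ( ip4 + np4 * ( ipar + npar * ipagM ) ); // AOSOA linear index
  *out = *in;
}

int main()
{
  const int nevt = 8, npar = 4, np4 = 4, nelem = nevt * npar * np4;
  double *d_in, *d_out;
  cudaMalloc( &d_in, nelem * sizeof( double ) );
  cudaMalloc( &d_out, nelem * sizeof( double ) );
  transposeMomentaF2C<4><<<1, nelem>>>( d_in, d_out, nevt, npar, np4 ); // one thread per element
  cudaDeviceSynchronize();
  cudaFree( d_in );
  cudaFree( d_out );
  printf( "transpose launched\n" );
  return 0;
}
```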
@@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
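
The gpu_sequence hunk above skips the transpose kernel entirely when the Fortran and C++ layouts already coincide (neppM == 1 and FORTRANFPTYPE matching fptype), using if constexpr so the decision costs nothing at run time. A reduced sketch of that compile-time branching pattern, with illustrative names:

```cpp
#include <cstdio>
#include <type_traits>

// Reduced sketch: choose a direct copy or a conversion path at compile time.
template<typename FORTRANFPTYPE, typename fptype, int neppM>
void copyMomenta( const FORTRANFPTYPE* src, fptype* dst, int n )
{
  if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
  {
    for( int i = 0; i < n; i++ ) dst[i] = src[i]; // layouts match: plain copy (a single memcpy in the real code)
  }
  else
  {
    for( int i = 0; i < n; i++ ) dst[i] = static_cast<fptype>( src[i] ); // convert (and transpose in the real code)
  }
}

int main()
{
  double src[4] = { 1, 2, 3, 4 };
  float dst[4];
  copyMomenta<double, float, 4>( src, dst, 4 ); // instantiates the conversion branch
  std::printf( "%f\n", dst[0] );
  return 0;
}
```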
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
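A minimal usage sketch (an illustration only, not part of the patch; the toy kernel scaleByTwo and the helper runScaleByTwo are hypothetical): in a GPU build, where MGONGPUCPP_GPUIMPL is defined and checkGpu is available from GpuRuntime.h below, the same call site compiles unchanged under nvcc (CUDA) and hipcc (HIP):

  #include "GpuAbstraction.h"
  #include "GpuRuntime.h" // for checkGpu

  __global__ void scaleByTwo( double* data ) // hypothetical toy kernel
  {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    data[i] *= 2.;
  }

  void runScaleByTwo() // hypothetical helper
  {
    constexpr int gpublocks = 2, gputhreads = 32;
    double* devData = nullptr;
    gpuMalloc( &devData, gpublocks * gputhreads * sizeof( double ) ); // cudaMalloc or hipMalloc, wrapped in checkGpu
    gpuLaunchKernel( scaleByTwo, gpublocks, gputhreads, devData ); // expands to scaleByTwo<<<gpublocks, gputhreads>>>( devData )
    checkGpu( gpuPeekAtLastError() ); // catch launch errors
    checkGpu( gpuDeviceSynchronize() ); // wait for completion (the buffer is left uninitialized in this sketch)
    gpuFree( devData ); // cudaFree or hipFree, wrapped in checkGpu
  }

diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h
new file mode 100644
index 0000000000..93579ef08b
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.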
+
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include <cassert>
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != gpuSuccess )
+  {
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct GpuRuntime final
+  {
+    GpuRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
+    checkGpu( gpuPeekAtLastError() );
     // ... 0d2. Copy back good helicity mask to the host
     copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
     // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -226,19 +226,19 @@
   void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
   {
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
     constexpr unsigned int sharedMemSize = 0;
 #else
     constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
-    checkCuda( cudaDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() );
+    checkGpu( gpuDeviceSynchronize() );
   }

   //--------------------------------------------------------------------------

diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h
index 23e84757a2..72bd8f195b 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class 
HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
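For orientation, a sketch of how these per-quantity typedef triples are meant to be consumed (illustration only, not part of the patch; check_sa.cc below follows exactly this pattern, the helper name sketchBuffers is hypothetical, and namespace qualifiers are omitted):

  #include "MemoryBuffers.h"

  void sketchBuffers( const size_t nevt ) // hypothetical helper
  {
  #ifndef MGONGPUCPP_GPUIMPL
    HostBufferMomenta hstMomenta( nevt ); // plain host memory (C++ builds)
  #else
    PinnedHostBufferMomenta hstMomenta( nevt ); // page-locked host memory via gpuMallocHost
    DeviceBufferMomenta devMomenta( nevt ); // device memory via gpuMalloc
    // ... fill hstMomenta on the host, then stage it to the device ...
    copyDeviceFromHost( devMomenta, hstMomenta ); // gpuMemcpy with gpuMemcpyHostToDevice
    // ... launch kernels reading/writing devMomenta, then copy results back ...
    copyHostFromDevice( hstMomenta, devMomenta ); // gpuMemcpy with gpuMemcpyDeviceToHost
  #endif
  }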
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index afeebde3c6..0e4d5d1157 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < 
nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -505,7 +506,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -562,7 +563,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -621,7 +622,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -684,8 +685,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -726,9 +727,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -765,7 +766,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -830,12 +831,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -856,7 +857,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -982,9 +983,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1008,7 +1009,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1028,7 +1029,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1042,9 +1043,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1072,7 +1076,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1282,7 +1286,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 37d6ebe981..11f562273e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc +++ 
b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
     else if( arg == "--curdev" )
     {
 #ifndef __CUDACC__
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
 #elif defined MGONGPU_HAS_NO_CURAND
       throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
 #else
@@ -198,7 +201,7 @@
     }
     else if( arg == "--rmbdev" )
     {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       rmbsmp = RamboSamplingMode::RamboDevice;
 #else
       throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@
     return usage( argv[0] );
   }

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
   ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
   // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
   // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@

   // === STEP 0 - INITIALISE

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL

-  // --- 00. Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime GpuRuntime( debug );
 #endif

   // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
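
USE_NVTX above only injects a -DUSE_NVTX preprocessor define (and, as the comment notes, has no HIP equivalent yet). On the C++ side such a define typically gates NVTX range markers so that the annotations vanish entirely from non-CUDA builds. A hedged sketch of that pattern, with hypothetical TIMER_RANGE_* macro names (the plugin's own timer map may wrap NVTX differently):

#ifdef USE_NVTX
#include "nvtx3/nvToolsExt.h" // header-only NVTX v3, shipped with recent CUDA toolkits
#define TIMER_RANGE_PUSH( name ) nvtxRangePushA( name )
#define TIMER_RANGE_POP() nvtxRangePop()
#else
#define TIMER_RANGE_PUSH( name ) // compiled out on HIP and C++-only builds
#define TIMER_RANGE_POP()
#endif

void sampleAndCompute()
{
  TIMER_RANGE_PUSH( "RamboSampling" ); // shows up as a named range in Nsight Systems
  // ... sampling and matrix-element work ...
  TIMER_RANGE_POP();
}
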
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
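
The special-casing of CrossSectionKernels above (forcing -fno-fast-math, see #117 and #516) exists because cross-section accumulation is sensitive to floating-point reordering. A classic illustration of the failure mode, not necessarily the exact algorithm used in CrossSectionKernels.cc, is Kahan compensated summation: under -ffast-math the compiler may treat FP addition as associative and legally fold the correction term to zero, silently restoring naive summation.

#include <cstdio>

double kahanSum( const double* x, const int n )
{
  double sum = 0.0, c = 0.0;
  for( int i = 0; i < n; i++ )
  {
    const double y = x[i] - c; // input corrected by the running compensation
    const double t = sum + y;
    c = ( t - sum ) - y; // low-order bits lost in 'sum + y'; may fold to 0 under fast math
    sum = t;
  }
  return sum;
}

int main()
{
  const double x[4] = { 1.0e16, 1.0, 1.0, -1.0e16 };
  std::printf( "%.1f\n", kahanSum( x, 4 ) ); // prints 2.0; naive summation gives 0.0
  return 0;
}

Compiled with -O2 this prints 2.0; with -ffast-math the compensation can be optimized away and the result collapses to 0.0, which is why these objects are singled out for -fno-fast-math.
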
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
endif
# Use target gtestlibs to build only googletest
@@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1)
	ccache --version | head -1
endif
	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
endif
	@echo ""
	@echo CXX=$(CXX)
@@ -850,7 +916,7 @@ endif
# Target: check (run the C++ test executable)
# [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
check: runTest cmpFcheck cmpFGcheck
else
check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
#include "Bridge.h"
#include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
extern "C"
{
@@ -22,7 +22,7 @@ extern "C"
* Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
* The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
using namespace mg5amcGpu;
#else
using namespace mg5amcCpu;
#endif
@@ -46,8 +46,8 @@ extern "C"
*/
void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
{
-#ifdef __CUDACC__
- CudaRuntime::setUp();
+#ifdef MGONGPUCPP_GPUIMPL
+ GpuRuntime::setUp();
#endif
// (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor)
// FIXME: disable OMP in Bridge when called from Fortran
@@ -65,8 +65,8 @@ extern "C"
Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" );
delete pbridge;
-#ifdef __CUDACC__
- CudaRuntime::tearDown();
+#ifdef MGONGPUCPP_GPUIMPL
+ GpuRuntime::tearDown();
#endif
}
@@ -96,7 +96,7 @@ extern "C"
{
Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
// Use the device/GPU implementation in the CUDA library
// (there is also a host implementation in this library)
pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol );
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc
index 2fb445372d..3743934f41 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#include "mgOnGpuConfig.h"
@@ -13,7 +13,7 @@
//--------------------------------------------------------------------------
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -40,7 +40,7 @@ namespace mg5amcCpu
private:
const int m_nevt; // The number of events in each iteration
int m_iiter; // The iteration counter (for random number seeding)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
HostBufferMomenta m_hstMomenta; // Memory buffers for momenta
HostBufferWeights m_hstWeights; // Memory buffers for sampling weights
@@ -105,7 +105,7 @@ namespace mg5amcCpu
extern "C"
{
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
using namespace mg5amcGpu;
#else
using namespace mg5amcCpu;
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc
index d4a760a71b..de327f2321 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
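
fbridge.cc above exposes the templated C++ Bridge to Fortran through extern "C" entry points and an opaque CppObjectInFortran base pointer, which Fortran only ever stores as an address. A stripped-down sketch of that handle pattern, with Counter as an illustrative stand-in for the real Bridge<FORTRANFPTYPE>:

#include <stdexcept>

struct CppObjectInFortran // opaque base class: Fortran sees only a pointer to it
{
  virtual ~CppObjectInFortran() {}
};

struct Counter : public CppObjectInFortran // hypothetical stand-in for Bridge<FORTRANFPTYPE>
{
  int value = 0;
};

extern "C" // trailing underscores follow the usual Fortran name-mangling convention
{
  void countercreate_( CppObjectInFortran** ppobj ) { *ppobj = new Counter; }
  void counterdelete_( CppObjectInFortran** ppobj )
  {
    Counter* pobj = dynamic_cast<Counter*>( *ppobj ); // same validity check as fbridgedelete_
    if( pobj == 0 ) throw std::runtime_error( "counterdelete_: invalid address" );
    delete pobj;
  }
}

The dynamic_cast is what lets fbridgedelete_ and fbridgesequence_ reject a corrupted or mistyped handle instead of dereferencing it blindly.
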
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h index 361b488401..0dd0f3ebba 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. 
Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
#-------------------------------------------------------------------------------
@@ -45,13 +45,13 @@ endif
#-------------------------------------------------------------------------------
-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)
-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))
#-------------------------------------------------------------------------------
-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)
# Enable ccache if USECCACHE=1
ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
###$(info OMPFLAGS=$(OMPFLAGS))
CXXFLAGS += $(OMPFLAGS)
+# Add the correct build-rule flags for the GPU compiler (nvcc needs -x cu to compile .cc files as CUDA; hipcc does not)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
# Set the build flags appropriate to each AVX choice (example: "make AVX=none")
# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
# Generic target and build rules: objects from CUDA compilation
$(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@
#-------------------------------------------------------------------------------
cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
endif
# Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
else
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
index 80032e528b..55d03f1252 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
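
The tail of mgOnGpuConfig.h above defines __global__, __host__ and __device__ to nothing on CPU-only builds, which is what allows decorated CUDA/HIP source files to compile unchanged as plain C++. A self-contained sketch of the trick (the first block mirrors the MGONGPUCPP_GPUIMPL definition added earlier in this patch, repeated here only to make the sketch standalone):

#ifdef __CUDACC__
#define MGONGPUCPP_GPUIMPL cuda
#elif defined __HIPCC__
#define MGONGPUCPP_GPUIMPL hip
#endif

// Empty declaration specifiers for C++-only builds
#ifndef MGONGPUCPP_GPUIMPL
#define __global__
#define __host__
#define __device__
#endif

__host__ __device__ inline double square( const double x )
{
  return x * x; // device-callable under nvcc/hipcc, an ordinary inline function otherwise
}

__global__ void squareAll( const double* in, double* out, const int n )
{
#ifdef MGONGPUCPP_GPUIMPL
  const int i = blockDim.x * blockIdx.x + threadIdx.x; // one element per GPU thread
  if( i < n ) out[i] = square( in[i] );
#else
  for( int i = 0; i < n; i++ ) out[i] = square( in[i] ); // same source, sequential on CPU
#endif
}
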
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } 
-#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttg.mad/src/rambo.h b/epochX/cudacpp/gg_ttg.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/rambo.h +++ b/epochX/cudacpp/gg_ttg.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index adda711aad..ee1a51555d 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005533933639526367  +DEBUG: model prefixing takes 0.0054416656494140625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.328 s +ALOHA: aloha creates 5 routines in 0.345 s VVV1 VVV1 FFV1 @@ -203,7 +203,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. quit -real 0m0.787s -user 0m0.730s -sys 0m0.049s -Code generation completed in 0 seconds +real 0m0.803s +user 0m0.731s +sys 0m0.066s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
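For context on the constructor logic above: nevt must be a multiple of s_gputhreadsmin, the thread count defaults to 256, and the block count is derived from nevt. Below is a minimal standalone sketch of this grid sizing (not part of the patch; the while-loop body is an assumption, since the hunk truncates it, and gputhreadsmin is a hypothetical stand-in for s_gputhreadsmin, whose value is not shown here).

// Hedged sketch only: assumes the Bridge constructor halves m_gputhreads
// until the event count divides evenly across the grid.
#include <stdexcept>
#include <string>

void sketchSetGpuGrid( const unsigned int nevt, int& gpublocks, int& gputhreads )
{
  constexpr unsigned int gputhreadsmin = 16; // hypothetical stand-in for s_gputhreadsmin
  gputhreads = 256;              // default number of gpu threads
  gpublocks = nevt / gputhreads; // this ensures nevt <= gpublocks*gputhreads
  if( ( nevt < gputhreadsmin ) || ( nevt % gputhreadsmin != 0 ) )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( gputhreadsmin ) );
  while( nevt != static_cast<unsigned int>( gpublocks * gputhreads ) )
  {
    gputhreads /= 2;               // assumed loop body: halve the thread count...
    gpublocks = nevt / gputhreads; // ...and recompute the block count
  }
}

With nevt = 96, for example, this yields gpublocks = 3 and gputhreads = 32, consistent with the constraint m_nevt == m_gpublocks * m_gputhreads enforced in the constructor above.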
@@ -262,7 +262,7 @@ namespace mg5amcCpu
     process.initProc( paramCard );
   }
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads )
   {
@@ -276,7 +276,7 @@ namespace mg5amcCpu
   }
 #endif
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta,
                                             const FORTRANFPTYPE* gs,
@@ -291,14 +291,14 @@ namespace mg5amcCpu
     constexpr int neppM = MemoryAccessMomenta::neppM;
     if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
     {
-      checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
     }
     else
     {
-      checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
       const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
       //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
-      dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+      gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
     }
     if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
     {
@@ -341,7 +341,7 @@ namespace mg5amcCpu
   }
 #endif
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta,
                                             const FORTRANFPTYPE* gs,
@@ -396,7 +396,7 @@ namespace mg5amcCpu
   // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
   //
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<typename Tin, typename Tout>
   __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
   {
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc
index d58066c9c1..eaf4037a24 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc
@@ -1,17 +1,18 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #include "BridgeKernels.h"
 
+#include "GpuAbstraction.h"
+
 #include "MemoryAccessMomenta.h"
 
 #include 
 
 //============================================================================
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -45,7 +46,7 @@ namespace mg5amcCpu
 
 //============================================================================
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 
 namespace mg5amcCpu
 {
@@ -96,7 +97,7 @@ namespace mg5amcCpu
 
 //============================================================================
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h
index 15eb4bff4d..3efef8ce97 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include 
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h
new file mode 100644
index 0000000000..93579ef08b
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include 
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != gpuSuccess )
+  {
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct GpuRuntime final
+  {
+    GpuRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
+    checkGpu( gpuPeekAtLastError() );
     // ... 0d2. Copy back good helicity mask to the host
     copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
     // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -226,19 +226,19 @@
 
   void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
   {
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
     constexpr unsigned int sharedMemSize = 0;
 #else
     constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
-    checkCuda( cudaDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() );
+    checkGpu( gpuDeviceSynchronize() );
   }
 
   //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h
index 23e84757a2..72bd8f195b 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer 
: public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
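For orientation: the MemoryBuffers.h hunks above and below replace the __CUDACC__ guards with the compiler-agnostic MGONGPUCPP_GPUIMPL macro, and raw checkCuda( cuda* ) calls with gpu*-prefixed wrappers, so that the same buffer classes can target either CUDA or HIP. A minimal sketch of what such a GpuAbstraction.h forwarding layer might look like, assuming plain macro forwarding (the header actually introduced by this series may differ; error checking is omitted for brevity):

    // Hypothetical sketch only, not the actual GpuAbstraction.h from this series.
    #ifndef GpuAbstraction_H
    #define GpuAbstraction_H 1
    #if defined __CUDACC__ // compiling with nvcc
    #define MGONGPUCPP_GPUIMPL 1
    #define gpuMallocHost( ptr, size ) cudaMallocHost( ptr, size ) // pinned host memory
    #define gpuFreeHost( ptr ) cudaFreeHost( ptr )
    #define gpuMalloc( ptr, size ) cudaMalloc( ptr, size ) // device memory
    #define gpuFree( ptr ) cudaFree( ptr )
    #define gpuMemcpy( dst, src, count, kind ) cudaMemcpy( dst, src, count, kind )
    #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
    #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
    #define gpuMemcpyToSymbol( sym, src, count ) cudaMemcpyToSymbol( sym, src, count )
    #elif defined __HIPCC__ // compiling with hipcc
    #include <hip/hip_runtime.h>
    #define MGONGPUCPP_GPUIMPL 1
    #define gpuMallocHost( ptr, size ) hipHostMalloc( ptr, size, hipHostMallocDefault ) // NB hipMallocHost is deprecated
    #define gpuFreeHost( ptr ) hipHostFree( ptr )
    #define gpuMalloc( ptr, size ) hipMalloc( ptr, size )
    #define gpuFree( ptr ) hipFree( ptr )
    #define gpuMemcpy( dst, src, count, kind ) hipMemcpy( dst, src, count, kind )
    #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
    #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
    #define gpuMemcpyToSymbol( sym, src, count ) hipMemcpyToSymbol( HIP_SYMBOL( sym ), src, count )
    #endif
    #endif // GpuAbstraction_H

With forwarding macros of this kind, a buffer constructor can call gpuMalloc( &( this->m_data ), this->bytes() ) and compile unchanged under nvcc and hipcc, which is what the PinnedHostBufferBase and DeviceBufferBase hunks rely on; in a production header each call would additionally be wrapped in an error check analogous to the old checkCuda.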
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index 2988a13b82..2e02593919 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( 
int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -499,7 +500,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -556,7 +557,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -615,7 +616,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -678,8 +679,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -720,9 +721,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -759,7 +760,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -824,12 +825,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -850,7 +851,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -976,9 +977,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1002,7 +1003,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1022,7 +1023,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1036,9 +1037,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1066,7 +1070,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1276,7 +1280,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index 37d6ebe981..11f562273e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc index 
3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
 else if( arg == "--curdev" )
 {
 #ifndef __CUDACC__
- throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+ throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
 #elif defined MGONGPU_HAS_NO_CURAND
 throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
 #else
@@ -198,7 +201,7 @@
 }
 else if( arg == "--rmbdev" )
 {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 rmbsmp = RamboSamplingMode::RamboDevice;
 #else
 throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@ main( int argc, char** argv )
 return usage( argv[0] );
 }
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
 ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
 // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
 // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@ main( int argc, char** argv )

 // === STEP 0 - INITIALISE

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL

- // --- 00. Initialise cuda
- // Instantiate a CudaRuntime at the beginnining of the application's main to
- // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
- const std::string cdinKey = "00 CudaInit";
+ // --- 00. Initialise GPU
+ // Instantiate a GpuRuntime at the beginning of the application's main.
+ // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+ const std::string cdinKey = "00 GpuInit";
 timermap.start( cdinKey );
- CudaRuntime cudaRuntime( debug );
+ GpuRuntime GpuRuntime( debug );
 #endif

 // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
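The cudacpp.mk logic above selects exactly one backend at build time: nvcc if $(CUDA_HOME)/bin/nvcc exists, else hipcc if $(HIP_HOME)/bin/hipcc exists, else a CPU-only build with GPUCC left empty. The sources then branch on the matching compiler-predefined macros (__CUDACC__, __HIPCC__, or neither), as in the CUD:/HIP:/CPP: workflow tags printed by check_sa.cc. A standalone probe (hypothetical, not part of the repository) illustrates the detection pattern:

    // Build this with nvcc, hipcc or a plain C++ compiler to see which branch is active.
    #include <iostream>
    int main()
    {
    #if defined __CUDACC__
      std::cout << "CUD: compiled by nvcc (CUDA backend)" << std::endl;
    #elif defined __HIPCC__
      std::cout << "HIP: compiled by hipcc (HIP backend)" << std::endl;
    #else
      std::cout << "CPP: plain C++ backend" << std::endl;
    #endif
      return 0;
    }

This also mirrors why RNDGEN defaults to hasNoCurand whenever GPUCC is empty or points at hipcc: curand is only available in the CUDA branch.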
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h index 361b488401..0dd0f3ebba 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
 
 #-------------------------------------------------------------------------------
 
@@ -45,13 +45,13 @@ endif
 
 #-------------------------------------------------------------------------------
 
-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)
 
-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))
 
 #-------------------------------------------------------------------------------
 
-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)
 
 # Enable ccache if USECCACHE=1
 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
 ###$(info OMPFLAGS=$(OMPFLAGS))
 CXXFLAGS += $(OMPFLAGS)
 
+# Set compiler-specific GPUFLAGS for CUDA vs HIP (nvcc needs -Xcompiler and -x cu, hipcc does not)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
 # Set the build flags appropriate to each AVX choice (example: "make AVX=none")
 # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
 # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
 
 # Generic target and build rules: objects from CUDA compilation
 $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@
 
 #-------------------------------------------------------------------------------
 
 cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
 endif
 
 # Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
 else
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h
index b247654dcf..da4ba36ad8 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit)
 #endif
 
@@ -145,7 +168,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;
 
 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -175,9 +198,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif
 
-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
-#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
+#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -189,8 +212,8 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */
 
-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
 #define __global__
 #define __host__
 #define __device__
diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h
index ca9a9f00c0..5532e22fa1 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif 
// #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttg.sa/src/rambo.h b/epochX/cudacpp/gg_ttg.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 2c2fae1608..3a2b1ad647 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0057299137115478516  +DEBUG: model prefixing takes 0.0053348541259765625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.163 s +1 processes with 123 diagrams generated in 0.156 s Total: 1 processes with 123 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -190,15 +190,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.433 s -Wrote files for 222 helas calls in 0.711 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.437 s +Wrote files for 222 helas calls in 0.735 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.336 s +ALOHA: aloha creates 5 routines in 0.441 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.327 s +ALOHA: aloha creates 10 routines in 0.309 s VVV1 VVV1 FFV1 @@ -255,10 +255,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.329s -user 0m3.091s -sys 0m0.226s -Code generation completed in 4 seconds +real 0m3.582s +user 0m3.061s +sys 0m0.243s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -284,7 +284,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -314,7 +314,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default text editor "vi". 
Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef 
MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h index 2f000e33d1..6a7d9c05c0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h @@ -1,24 +1,17 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
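The rewritten GpuAbstraction.h below drops the intermediate MGONGPUCPP_CUDACC/MGONGPUCPP_HIPCC macros (including the self-referential '#define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL') and keys the alias selection directly on __CUDACC__ or __HIPCC__, defining one gpu* name per runtime call. A minimal usage sketch, not part of the patch, of how code written once against these aliases compiles under both nvcc and hipcc (hstBuf and the buffer size are hypothetical; gpuMalloc, gpuMemcpy, gpuMemcpyHostToDevice and gpuFree are the aliases used elsewhere in this diff):

    #include "GpuAbstraction.h"
    // Under nvcc the aliases expand to cudaMalloc/cudaMemcpy/cudaFree,
    // under hipcc to hipMalloc/hipMemcpy/hipFree.
    double* devBuf = nullptr;
    gpuMalloc( &devBuf, 64 * sizeof( double ) );                               // allocate device memory
    gpuMemcpy( devBuf, hstBuf, 64 * sizeof( double ), gpuMemcpyHostToDevice ); // host-to-device copy
    gpuFree( devBuf );                                                         // release device memory
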
+
 #ifndef MG5AMC_GPUABSTRACTION_H
 #define MG5AMC_GPUABSTRACTION_H 1
 
 #include 
 
-#ifdef MGONGPUCPP_GPUIMPL
-#define MGONGPUCPP_CUDACC 1
-#endif
-
-#ifdef __HIPCC__
-#include "hip/hip_runtime.h"
-#define MGONGPUCPP_HIPCC 1
-#endif
-
-#ifdef MGONGPUCPP_CUDACC
-
-// Defines correct compiler
-#define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL
-
-//--------------------------------------------------------------------------
+#ifdef __CUDACC__
+
 #define gpuError_t cudaError_t
 #define gpuPeekAtLastError cudaPeekAtLastError
 #define gpuGetErrorString cudaGetErrorString
@@ -44,12 +37,9 @@
 //--------------------------------------------------------------------------
-#elif defined MGONGPUCPP_HIPCC
+#elif defined __HIPCC__
-// Defines correct compiler
-#define MGONGPUCPP_GPUIMPL __HCC__
-
-//--------------------------------------------------------------------------
+#include "hip/hip_runtime.h"
 #define gpuError_t hipError_t
 #define gpuPeekAtLastError hipPeekAtLastError
@@ -74,6 +64,8 @@
 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
 #define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//--------------------------------------------------------------------------
+
 #endif
-#endif // MG5AMC_GPUABSTRACTION_H
\ No newline at end of file
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h
new file mode 100644
index 0000000000..93579ef08b
--- /dev/null
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include 
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != gpuSuccess )
+  {
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct GpuRuntime final
+  {
+    GpuRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+    }
+
+    // Tear down CUDA application (call cudaDeviceReset)
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck
+    // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
+    static void tearDown( const bool debug = true )
+    {
+      if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl;
+      checkGpu( gpuDeviceReset() );
+    }
+  };
+}
+#endif
+
+//--------------------------------------------------------------------------
+
+#endif // MG5AMC_GPURUNTIME_H
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h
index ef40624c88..a64c05c26a 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MADGRAPHTEST_H_
 #define MADGRAPHTEST_H_ 1
@@ -22,7 +22,7 @@
 #include 
 #include 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using mg5amcGpu::CPPProcess;
 #else
 using mg5amcCpu::CPPProcess;
@@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam
 // Since we link both the CPU-only and GPU tests into the same executable, we prevent
 // a multiply defined symbol by only compiling this in the non-CUDA phase:
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 
 /// Compare momenta and matrix elements.
/// This uses an implementation of TestDriverBase to run a madgraph workflow,
@@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME )
   }
 }
 
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
 
 #endif /* MADGRAPHTEST_H_ */
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc
index 3a957ee2ca..81699dfea9 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #include "MatrixElementKernels.h"
 
@@ -14,7 +14,7 @@
 
 //============================================================================
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu
 {
 
@@ -150,7 +150,7 @@ namespace mg5amcCpu
 
 //============================================================================
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
 
@@ -211,11 +211,11 @@ namespace mg5amcGpu
     // ... 0d1. Compute good helicity mask on the device
     gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
+    checkGpu( gpuPeekAtLastError() );
     // ... 0d2. Copy back good helicity mask to the host
     copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
     // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -226,19 +226,19 @@ namespace mg5amcGpu
 
   void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
   {
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
     constexpr unsigned int sharedMemSize = 0;
 #else
     constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
-    checkCuda( cudaDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() );
+    checkGpu( gpuDeviceSynchronize() );
   }
 
 //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h
index 23e84757a2..72bd8f195b 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
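The MatrixElementKernels.cc hunks above are the heart of this change: every CUDA triple-chevron launch becomes a portable gpuLaunchKernel/gpuLaunchKernelSharedMem call, and checkCuda/cudaPeekAtLastError become checkGpu/gpuPeekAtLastError. A sketch of the equivalence under nvcc, following the macro definition in GpuAbstraction.h (myKernel, blocks, threads and devArg are hypothetical names):

    // Portable form, valid for both the CUDA and HIP backends:
    gpuLaunchKernel( myKernel, blocks, threads, devArg );
    checkGpu( gpuPeekAtLastError() ); // surface any launch error with file/line context
    // What the macro expands to when __CUDACC__ is defined:
    myKernel<<<blocks, threads>>>( devArg );
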
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
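Every MemoryAccess*.h header in this patch gets the same mechanical change: the namespace selector is keyed on MGONGPUCPP_GPUIMPL rather than __CUDACC__, so HIP builds now also compile into mg5amcGpu. For reference, the idiom these two-line hunks modify looks like this (a sketch; the real headers put their class definitions inside the braces):

    // NB: the same source is compiled twice, once per namespace (see #318 and #725)
    #ifdef MGONGPUCPP_GPUIMPL
    namespace mg5amcGpu // GPU build: CUDA or HIP
    #else
    namespace mg5amcCpu // CPU build
    #endif
    {
      // class definitions, identical for both builds
    }
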
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
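To make the coalescing remark in the MemoryAccessMomenta.h comment above concrete: the momenta buffer is an AOSOA with neppM events per page, so the event-within-page index varies fastest and consecutive GPU threads read consecutive fptype's from global memory. A hedged sketch of the index arithmetic (the helper name and signature are invented for illustration; the [npagM][npar][np4][neppM] layout and the npar=6, np4=4 values for this process follow the conventions described in these headers):

    // Flatten buffer[ipagM][ipar][ip4][ieppM] into a 1-d offset.
    inline int aosoaIndex( int ievt, int ipar, int ip4, int neppM, int npar = 6, int np4 = 4 )
    {
      const int ipagM = ievt / neppM; // page holding this event
      const int ieppM = ievt % neppM; // position of the event within its page
      return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
    }
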
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template 
class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
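The typedef blocks in MemoryBuffers.h, continued below, always come as host/pinned-host/device triples per physics quantity. The reason for the pinned variant is spelled out in the copy helpers at the end of this file: a gpuMemcpy from pageable host memory goes through an intermediate staging copy (see the PR #45 note below), whereas page-locked memory transfers directly. A sketch of how a triple is used, anticipating the check_sa.cc changes later in this patch (type and helper names are taken from this diff; nevt is the event count):

    #ifndef MGONGPUCPP_GPUIMPL
      HostBufferMomenta hstMomenta( nevt );         // CPU-only build: plain host memory
    #else
      PinnedHostBufferMomenta hstMomenta( nevt );   // gpuMallocHost: page-locked host memory
      DeviceBufferMomenta devMomenta( nevt );       // gpuMalloc: device global memory
      copyDeviceFromHost( devMomenta, hstMomenta ); // one direct gpuMemcpy, no staging copy
    #endif
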
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index 19bc1e7973..2f4b1f9d0e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity 
< nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -2417,7 +2418,7 @@ namespace mg5amcCpu { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -2474,7 +2475,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -2533,7 +2534,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -2628,8 +2629,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -2671,9 +2672,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -2711,7 +2712,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -2776,12 +2777,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -2802,7 +2803,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -2928,9 +2929,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -2954,7 +2955,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -2974,7 +2975,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -2988,9 +2989,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -3018,7 +3022,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -3228,7 +3232,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h index 04f7c62976..deb1358992 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc index 8fe4c22145..7cac5ab47b 100644 --- 
a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
  bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
     else if( arg == "--curdev" )
     {
 #ifndef __CUDACC__
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
 #elif defined MGONGPU_HAS_NO_CURAND
       throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
 #else
@@ -198,7 +201,7 @@
     }
     else if( arg == "--rmbdev" )
     {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       rmbsmp = RamboSamplingMode::RamboDevice;
 #else
       throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@
     return usage( argv[0] );
   }
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
   ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
   // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
   // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@
 
   // === STEP 0 - INITIALISE
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 
-  // --- 00. Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime gpuRuntime( debug );
 #endif
 
  // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,20 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; #elif defined __HIPCC__ wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -783,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -795,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -820,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -876,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -897,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -923,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -968,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1064,12 +1077,12 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif +#elif defined MGONGPU_CUCXTYPE_CXSMPL + << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... #endif @@ -1079,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

 # Dependency on src directory
 MG5AMC_COMMONLIB = mg5amc_common
@@ -121,24 +121,46 @@ endif

 #-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler
+#=== Configure the GPU compiler (CUDA or HIP)

-# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505)
-# This is because it is impossible to pass this to "CUFLAGS += -ccbin <host-compiler>" below
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME.
+# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc.
+# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths.
+# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505)
+# This is because it is impossible to pass this to "GPUFLAGS += -ccbin <host-compiler>" below
 ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <compiler>" from outside
-  $(warning CUDA builds are not supported for multi-word CXX "$(CXX)")
+  $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)")
   override CUDA_HOME=disabled
+  override HIP_HOME=disabled
 endif

-# If CUDA_HOME is not set, try to set it from the location of nvcc
+# If CUDA_HOME is not set, try to set it from the path to nvcc
 ifndef CUDA_HOME
   CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
   $(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
 endif

-# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists
+# If HIP_HOME is not set, try to set it from the path to hipcc
+ifndef HIP_HOME
+  HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null))
+  $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+endif
+
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP),
+# builds are performed for HIP only if CUDA is not found in the path.
+# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME.
+# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+#--- Option 1: CUDA exists -> use CUDA
+
+# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists
 ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
-  NVCC = $(CUDA_HOME)/bin/nvcc
+
+  GPUCC = $(CUDA_HOME)/bin/nvcc
   USE_NVTX ?=-DUSE_NVTX
   # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
   # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
@@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
   CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
   endif
   CUOPTFLAGS = -lineinfo
-  CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
-  ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
-  ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
-  CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h
+  ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
+  GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
+  ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+  ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
+  GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h
   # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
-  ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
-  ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
-  ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
-  ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+  ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+  ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+  ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+  ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+  CUBUILDRULEFLAGS = -Xcompiler -fPIC -c
+  CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu
+  CUDATESTFLAGS = -lcuda
+
+  # Set the host C++ compiler for GPUCC via "-ccbin <host-compiler>"
+  # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
+  GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+
+  # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+  ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+  GPUFLAGS += -allow-unsupported-compiler
+  endif
+
 else ifneq ($(origin REQUIRE_CUDA),undefined)
+
   # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443)
-  $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH))
+  $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH))
+
+#--- Option 2: CUDA does not exist, HIP exists -> use HIP
+
+# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists
+else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
+
+  GPUCC = $(HIP_HOME)/bin/hipcc
+  #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP?
+  HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+  HIPINC = -I$(HIP_HOME)/include/
+  # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP
+  # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+  GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+  ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+  GPUFLAGS += -std=c++17
+  ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?)
+  CUBUILDRULEFLAGS = -fPIC -c
+  CCBUILDRULEFLAGS = -fPIC -c
+
+else ifneq ($(origin REQUIRE_HIP),undefined)
+
+  # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443)
+  $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
+
+#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP
+
 else
-  # No cuda. Switch cuda compilation off and go to common random numbers in C++
+
+  # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++
   $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
+  $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
+  override GPUCC=
   override USE_NVTX=
   override CUINC=
   override CURANDLIBFLAGS=
-endif
-export NVCC
-export CUFLAGS
-
-# Set the host C++ compiler for nvcc via "-ccbin <host-compiler>"
-# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
-# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-CUFLAGS += -allow-unsupported-compiler
 endif

+# Export GPUCC (so that it can also be used in cudacpp_src.mk?)
+export GPUCC
+export GPUFLAGS
+
 #-------------------------------------------------------------------------------

-#=== Configure ccache for C++ and CUDA builds
+#=== Configure ccache for C++ and CUDA/HIP builds

 # Enable ccache if USECCACHE=1
 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -201,15 +260,15 @@ endif
 #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
 #  override AR:=ccache $(AR)
 #endif
-ifneq ($(NVCC),)
-  ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
-    override NVCC:=ccache $(NVCC)
+ifneq ($(GPUCC),)
+  ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
+    override GPUCC:=ccache $(GPUCC)
   endif
 endif

 #-------------------------------------------------------------------------------

-#=== Configure PowerPC-specific compiler flags for C++ and CUDA
+#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP

 # PowerPC-specific CXX compiler flags (being reviewed)
 ifeq ($(UNAME_P),ppc64le)
@@ -225,9 +284,9 @@ else
   ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto)
 endif

-# PowerPC-specific CUDA compiler flags (to be reviewed!)
+# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac)
@@ -461,7 +523,7 @@ override RUNTIME =

 cxx_main=$(BUILDDIR)/check.exe
 fcxx_main=$(BUILDDIR)/fcheck.exe

-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 cu_main=$(BUILDDIR)/gcheck.exe
 fcu_main=$(BUILDDIR)/fgcheck.exe
 else
@@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG):
	@touch $(BUILDDIR)/.build.$(TAG)

 # Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@

 $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
 endif
+# NB: for nvcc builds, the '-x cu' flag (compile .cc sources as CUDA) is now included via CCBUILDRULEFLAGS in the rule above

 # Generic target and build rules: objects from C++ compilation
 # (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@

 # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516)
+# Added edge case for HIP compilation
 ifeq ($(shell $(CXX) --version | grep ^nvc++),)
 $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS))
 $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
 endif
 endif

@@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand)
 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
 endif

-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592)
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592)
 ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),)
-ifneq ($(NVCC),)
-CUFLAGS += -Xcompiler -Wno-deprecated-builtins
+ifneq ($(GPUCC),)
+GPUFLAGS += -Wno-deprecated-builtins
 endif
 endif

@@ -541,8 +607,8 @@ endif
 # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
 $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
 else # link both runTest.o and runTest_cu.o
 $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
 endif

 # Use target gtestlibs to build only googletest
@@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1)
	ccache --version | head -1
 endif
	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
 endif
	@echo ""
	@echo CXX=$(CXX)
@@ -850,7 +916,7 @@ endif

 # Target: check (run the C++ test executable)
 # [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 check: runTest cmpFcheck cmpFGcheck
 else
 check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

 extern "C"
 {
@@ -22,7 +22,7 @@ extern "C"
   * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
   * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
   */
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
  using namespace mg5amcGpu;
 #else
  using namespace mg5amcCpu;
@@ -46,8 +46,8 @@ extern "C"
   */
  void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
  {
-#ifdef __CUDACC__
-    CudaRuntime::setUp();
+#ifdef MGONGPUCPP_GPUIMPL
+    GpuRuntime::setUp();
 #endif
    // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor)
    // FIXME: disable OMP in Bridge when called from Fortran
@@ -65,8 +65,8 @@ extern "C"
    Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
    if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" );
    delete pbridge;
-#ifdef __CUDACC__
-    CudaRuntime::tearDown();
+#ifdef MGONGPUCPP_GPUIMPL
+    GpuRuntime::tearDown();
 #endif
  }

@@ -96,7 +96,7 @@ extern "C"
  {
    Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
    if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
    // Use the device/GPU implementation in the CUDA library
    // (there is also a host implementation in this library)
    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol );
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc
index 2fb445372d..3743934f41 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #include "mgOnGpuConfig.h"

@@ -13,7 +13,7 @@

 //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -40,7 +40,7 @@ namespace mg5amcCpu
  private:
    const int m_nevt; // The number of events in each iteration
    int m_iiter;      // The iteration counter (for random number seeding)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
    HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
    HostBufferMomenta m_hstMomenta;      // Memory buffers for momenta
    HostBufferWeights m_hstWeights;      // Memory buffers for sampling weights
@@ -105,7 +105,7 @@ namespace mg5amcCpu

 extern "C"
 {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
  using namespace mg5amcGpu;
 #else
  using namespace mg5amcCpu;
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc
index 904cb78a72..de327f2321 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
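// A hedged sketch of the GpuRuntime::setUp()/tearDown() pair invoked from
// fbridgecreate_/fbridgedelete_ above (and wrapped by the GpuRuntime RAII
// object in check_sa.cc, which pairs cudaSetDevice(0) with cudaDeviceReset()).
// The details below are assumptions for illustration, not the plugin's actual
// GpuRuntime.h:
#include <stdexcept>
struct GpuRuntime
{
  GpuRuntime( const bool debug = false ) { setUp( debug ); } // acquire the device on construction
  ~GpuRuntime() { tearDown(); }                              // release it on destruction (RAII)
  static void setUp( const bool debug = false )
  {
#ifdef __CUDACC__
    if( cudaSetDevice( 0 ) != cudaSuccess ) throw std::runtime_error( "cudaSetDevice(0) failed" );
#endif
    if( debug ) { /* e.g. print which device was selected */ }
  }
  static void tearDown()
  {
#ifdef __CUDACC__
    cudaDeviceReset(); // flush profiler data and tear down the device context
#endif
  }
};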
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
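// The "#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu / #else namespace mg5amcCpu"
// pattern recurring in the hunks above lets the same translation unit be compiled
// twice, once for GPU and once for CPU, with all symbols landing in distinct
// namespaces so that both object files can be linked into one executable without
// clashes (see #318 and #725). In miniature (the function below is hypothetical):
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  inline int backendId() { return 0; } // same source, two distinct linker symbols
}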
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

 #-------------------------------------------------------------------------------
@@ -45,13 +45,13 @@ endif

 #-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)

-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))

 #-------------------------------------------------------------------------------

-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)

 # Enable ccache if USECCACHE=1
 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
 ###$(info OMPFLAGS=$(OMPFLAGS))
 CXXFLAGS += $(OMPFLAGS)

+# Add the correct build-rule flags (-fPIC -c, plus '-x cu' for nvcc) depending on whether GPUCC is nvcc or hipcc
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
 # Set the build flags appropriate to each AVX choice (example: "make AVX=none")
 # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
 # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)

 # Generic target and build rules: objects from CUDA compilation
 $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@

 #-------------------------------------------------------------------------------

 cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
 endif

 # Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
 else
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h
index 88173dcc94..55d03f1252 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
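// With the MGONGPUCPP_GPUIMPL macro introduced in mgOnGpuConfig.h below, client
// code can branch once on "is this any GPU build?" instead of testing __CUDACC__
// and __HIPCC__ separately, keeping compiler-specific details behind the
// compiler-specific macros. A minimal usage sketch (illustrative only):
#include "mgOnGpuConfig.h"
#ifdef MGONGPUCPP_GPUIMPL
// common CUDA/HIP path: pinned host buffers, device kernels, gpuLaunchKernel
#ifdef __CUDACC__
// CUDA-only details (e.g. curand, thrust::complex)
#elif defined __HIPCC__
// HIP-only details (e.g. no curand, cxsmpl complex type)
#endif
#else
// C++ path: plain host buffers, SIMD vectorization, std::complex or cxsmpl
#endif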
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,22 +68,25 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ @@ -88,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of 
MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL
+// SANITY CHECKS (CUDA complex number implementation)
+#ifdef __CUDACC__
+#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA
+#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA
+#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA
 #endif
 #endif

-// SANITY CHECKS (cuda complex number implementation)
-#ifdef __CUDACC__
-#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
+// SANITY CHECKS (C++ complex number implementation)
+#ifndef MGONGPUCPP_GPUIMPL
+#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++
 #endif
 #endif

@@ -136,7 +157,7 @@ namespace mgOnGpu

 // Alignment requirement for using reinterpret_cast with SIMD vectorized code
 // (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
 // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
 #endif

@@ -147,7 +168,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;

 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -177,9 +198,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
 #if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -191,8 +212,8 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */

-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
 #define __global__
 #define __host__
 #define __device__
diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h
index ca9a9f00c0..5532e22fa1 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN
and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif 
-#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h index e91f5927d6..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h @@ -9,8 +9,6 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuFptypes.h" -#include "GpuAbstraction.h" - #include //========================================================================== @@ -34,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -133,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -155,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace 
mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -807,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -855,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -881,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttgg.mad/src/rambo.h b/epochX/cudacpp/gg_ttgg.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/rambo.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 3c3686e228..1b6c420503 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005596637725830078  +DEBUG: model prefixing takes 0.005376100540161133  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.166 s +1 processes with 123 diagrams generated in 0.156 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.442 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.427 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.337 s +ALOHA: aloha creates 5 routines in 0.319 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. quit -real 0m1.506s -user 0m1.438s -sys 0m0.059s -Code generation completed in 2 seconds +real 0m1.461s +user 0m1.381s +sys 0m0.050s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
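The kernel-launch rewrite in the Bridge.h hunk above is the pattern applied throughout this patch: the CUDA-only triple-chevron syntax is replaced by the variadic gpuLaunchKernel macro from GpuAbstraction.h, which expands back to a <<<blocks, threads>>> launch under both nvcc and hipcc. As a before/after sketch (myKernel and its arguments are hypothetical):

  // before (nvcc only):
  //   myKernel<<<blocks, threads>>>( arg1, arg2 );
  // after (nvcc or hipcc, via GpuAbstraction.h):
  gpuLaunchKernel( myKernel, blocks, threads, arg1, arg2 );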
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
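Each gpu* macro in GpuAbstraction.h above maps one-to-one onto the corresponding CUDA or HIP runtime call, wrapped where appropriate in the checkGpu error check that GpuRuntime.h defines just below. A minimal allocate/copy/free sketch under that assumption (hstBuf, devBuf and n are hypothetical; fptype is the codebase's floating point type):

  fptype* hstBuf = nullptr;
  fptype* devBuf = nullptr;
  gpuMallocHost( &hstBuf, n * sizeof( fptype ) ); // pinned host memory (cudaMallocHost / hipHostMalloc)
  gpuMalloc( &devBuf, n * sizeof( fptype ) ); // device memory (cudaMalloc / hipMalloc)
  gpuMemcpy( devBuf, hstBuf, n * sizeof( fptype ), gpuMemcpyHostToDevice ); // host-to-device copy
  gpuFree( devBuf );
  gpuFreeHost( hstBuf );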
+
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != gpuSuccess )
+  {
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct GpuRuntime final
+  {
+    GpuRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
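The MatrixElementKernels.cc hunks above follow a consistent launch-then-check discipline, which in isolation looks like this (a sketch; myKernel and devArgs are hypothetical):

  gpuLaunchKernel( myKernel, m_gpublocks, m_gputhreads, devArgs );
  checkGpu( gpuPeekAtLastError() ); // catches launch-configuration errors
  checkGpu( gpuDeviceSynchronize() ); // catches asynchronous execution errors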
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
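The MemoryAccess*.h edits above and below are mechanical guard swaps from __CUDACC__ to MGONGPUCPP_GPUIMPL: the guarded device code itself, like the per-thread event index in KernelAccessHelper, already compiles unchanged under both toolchains, since HIP provides the same blockDim/blockIdx/threadIdx built-ins:

  #ifdef MGONGPUCPP_GPUIMPL
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // identical source under nvcc and hipcc
  #endif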
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
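For the neppM guidance in the MemoryAccessMomenta.h hunk above (a power of 2 times the number of fptype's in a 32-byte cacheline), the arithmetic is, as a sketch assuming the 32-byte figure quoted in the comment:

  constexpr int fpPerLineDouble = 32 / sizeof( double ); // 4 fptypes per cacheline
  constexpr int fpPerLineFloat = 32 / sizeof( float ); // 8 fptypes per cacheline
  // hence neppM in { 4, 8, 16, ... } for double and { 8, 16, 32, ... } for float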
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class 
HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
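The gpuMallocHost/gpuMalloc/gpuFree/gpuMemcpy calls that replace the checkCuda-wrapped cudaXxx calls in MemoryBuffers.h imply a thin mapping layer in the new GpuAbstraction.h (added as a symlink later in this diff, but not shown in full). The sketch below is a hypothetical reconstruction: the checkGpu helper and the exact HIP spellings are assumptions, not taken from this patch.

// Hypothetical sketch of the GpuAbstraction.h memory API (assumptions noted above).
#include <cassert>
#ifdef __CUDACC__
#include <cuda_runtime.h>
inline void checkGpu( cudaError_t code ) { assert( code == cudaSuccess ); } // assumed error helper
#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) // pinned host memory
#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )         // device memory
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuMemcpy( dst, src, n, kind ) checkGpu( cudaMemcpy( dst, src, n, kind ) )
#define gpuMemcpyToSymbol( sym, src, n ) checkGpu( cudaMemcpyToSymbol( sym, src, n ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
inline void checkGpu( hipError_t code ) { assert( code == hipSuccess ); } // assumed error helper
#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // pinned host memory
#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )         // device memory
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#define gpuMemcpy( dst, src, n, kind ) checkGpu( hipMemcpy( dst, src, n, kind ) )
#define gpuMemcpyToSymbol( sym, src, n ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( sym ), src, n ) )
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#endif

With a mapping of this kind, PinnedHostBufferBase, DeviceBufferBase and the copyDeviceFromHost/copyHostFromDevice helpers in the hunks above compile unchanged for either backend.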
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index f9016eaa88..d59cc349e3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu 
#endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -2474,7 +2475,7 @@ namespace mg5amcCpu { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -2531,7 +2532,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -2590,7 +2591,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -2685,8 +2686,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -2728,9 +2729,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -2768,7 +2769,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -2833,12 +2834,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -2859,7 +2860,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -2985,9 +2986,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -3011,7 +3012,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -3031,7 +3032,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -3045,9 +3046,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -3075,7 +3079,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -3285,7 +3289,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h index 04f7c62976..deb1358992 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc 
b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
 else if( arg == "--curdev" )
 {
#ifndef __CUDACC__
- throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+ throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
#elif defined MGONGPU_HAS_NO_CURAND
 throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
#else
@@ -198,7 +201,7 @@ }
 else if( arg == "--rmbdev" )
 {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 rmbsmp = RamboSamplingMode::RamboDevice;
#else
 throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@
 return usage( argv[0] );
 }

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
#ifdef _OPENMP
 ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
#endif
#endif

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
 // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
 // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@

 // === STEP 0 - INITIALISE

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL

- // --- 00. Initialise cuda
- // Instantiate a CudaRuntime at the beginnining of the application's main to
- // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
- const std::string cdinKey = "00 CudaInit";
+ // --- 00. Initialise GPU
+ // Instantiate a GpuRuntime at the beginning of the application's main.
+ // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+ const std::string cdinKey = "00 GpuInit";
 timermap.start( cdinKey );
- CudaRuntime cudaRuntime( debug );
+ GpuRuntime GpuRuntime( debug );
#endif

 // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

# Dependency on src directory
MG5AMC_COMMONLIB = mg5amc_common
@@ -121,24 +121,46 @@ endif

#-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler
+#=== Configure the GPU compiler (CUDA or HIP)

-# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505)
-# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME.
+# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc.
+# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths.
+# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505)
+# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below
ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
- $(warning CUDA builds are not supported for multi-word CXX "$(CXX)")
+ $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)")
 override CUDA_HOME=disabled
+ override HIP_HOME=disabled
endif

-# If CUDA_HOME is not set, try to set it from the location of nvcc
+# If CUDA_HOME is not set, try to set it from the path to nvcc
ifndef CUDA_HOME
 CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 $(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
endif

-# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists
+# If HIP_HOME is not set, try to set it from the path to hipcc
+ifndef HIP_HOME
+ HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH))
+ $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+endif
+
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP),
+# builds are performed for HIP only if CUDA is not found in the path.
+# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME.
+# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+#--- Option 1: CUDA exists -> use CUDA
+
+# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists
ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
- NVCC = $(CUDA_HOME)/bin/nvcc
+
+ GPUCC = $(CUDA_HOME)/bin/nvcc
 USE_NVTX ?=-DUSE_NVTX
 # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
 # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
@@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
 CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+ HIPINC = -I$(HIP_HOME)/include/
+ # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP
+ # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+ GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+ ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+ GPUFLAGS += -std=c++17
+ ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?)
+ CUBUILDRULEFLAGS = -fPIC -c
+ CCBUILDRULEFLAGS = -fPIC -c
+
+else ifneq ($(origin REQUIRE_HIP),undefined)
+
+ # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443)
+ $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
+
+#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP
+
else
- # No cuda. Switch cuda compilation off and go to common random numbers in C++
+
+ # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++
 $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
+ $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
+ override GPUCC=
 override USE_NVTX=
 override CUINC=
 override CURANDLIBFLAGS=
-endif
-export NVCC
-export CUFLAGS
-
-# Set the host C++ compiler for nvcc via "-ccbin "
-# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
-# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-CUFLAGS += -allow-unsupported-compiler
endif

+# Export GPUCC (so that it can also be used in cudacpp_src.mk?)
+export GPUCC
+export GPUFLAGS
+
#-------------------------------------------------------------------------------

-#=== Configure ccache for C++ and CUDA builds
+#=== Configure ccache for C++ and CUDA/HIP builds

# Enable ccache if USECCACHE=1
ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -201,15 +260,15 @@ endif
#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
# override AR:=ccache $(AR)
#endif
-ifneq ($(NVCC),)
- ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
- override NVCC:=ccache $(NVCC)
+ifneq ($(GPUCC),)
+ ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
+ override GPUCC:=ccache $(GPUCC)
 endif
endif

#-------------------------------------------------------------------------------

-#=== Configure PowerPC-specific compiler flags for C++ and CUDA
+#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP

# PowerPC-specific CXX compiler flags (being reviewed)
ifeq ($(UNAME_P),ppc64le)
@@ -225,9 +284,9 @@ else
 ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto)
endif

-# PowerPC-specific CUDA compiler flags (to be reviewed!)
+# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac)
@@ -461,7 +523,7 @@ override RUNTIME =

cxx_main=$(BUILDDIR)/check.exe
fcxx_main=$(BUILDDIR)/fcheck.exe

-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_main=$(BUILDDIR)/gcheck.exe
fcu_main=$(BUILDDIR)/fgcheck.exe
else
@@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG):
	@touch $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@

$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
endif
+# (NB: the '-x cu' flag formerly hardcoded in the rule above, which tells nvcc to compile .cc files as CUDA, is now provided via CCBUILDRULEFLAGS for CUDA builds)

# Generic target and build rules: objects from C++ compilation
# (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@

# Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516)
+# Added an edge case for HIP compilation (hipcc accepts -fno-fast-math directly, without -Xcompiler)
ifeq ($(shell $(CXX) --version | grep ^nvc++),)
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS))
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
endif
endif
@@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand)
$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
endif

-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592)
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592)
ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),)
-ifneq ($(NVCC),)
-CUFLAGS += -Xcompiler -Wno-deprecated-builtins
+ifneq ($(GPUCC),)
+GPUFLAGS += -Wno-deprecated-builtins
endif
endif

@@ -541,8 +607,8 @@ endif
# This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
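# For example (a sketch, assuming the conventional install locations for the two toolkits),
# the same executables can now be built with either GPU toolchain:
#   CUDA_HOME=/usr/local/cuda make   # GPUCC=nvcc, gcheck.exe built as CUDA, curand available
#   HIP_HOME=/opt/rocm make          # GPUCC=hipcc, gcheck.exe built as HIP, RNDGEN=hasNoCurand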
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
endif

# Use target gtestlibs to build only googletest
@@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1)
	ccache --version | head -1
endif
	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
endif
	@echo ""
	@echo CXX=$(CXX)
@@ -850,7 +916,7 @@ endif

# Target: check (run the C++ test executable)
# [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
check: runTest cmpFcheck cmpFGcheck
else
check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

#include "Bridge.h"
#include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

extern "C"
{
@@ -22,7 +22,7 @@ extern "C"
* Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
* The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
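*
* A typical call sequence from the Fortran side (a sketch based on the C bindings below,
* with the argument lists abbreviated) would be: FBRIDGECREATE( BRIDGE, NEVT, NPAR, NP4 )
* to instantiate the bridge, repeated calls to FBRIDGESEQUENCE( BRIDGE, MOMENTA, GS, ... )
* to compute matrix elements, and finally FBRIDGEDELETE( BRIDGE ) to release the bridge
* (which on GPU also tears down the CUDA/HIP runtime).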
*/
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
using namespace mg5amcGpu;
#else
using namespace mg5amcCpu;
@@ -46,8 +46,8 @@ extern "C"
*/
void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
{
-#ifdef __CUDACC__
- CudaRuntime::setUp();
+#ifdef MGONGPUCPP_GPUIMPL
+ GpuRuntime::setUp();
#endif
// (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor)
// FIXME: disable OMP in Bridge when called from Fortran
@@ -65,8 +65,8 @@ extern "C"
Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" );
delete pbridge;
-#ifdef __CUDACC__
- CudaRuntime::tearDown();
+#ifdef MGONGPUCPP_GPUIMPL
+ GpuRuntime::tearDown();
#endif
}
@@ -96,7 +96,7 @@ extern "C"
{
Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
// Use the device/GPU implementation in the CUDA library
// (there is also a host implementation in this library)
pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol );
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc
index 2fb445372d..3743934f41 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

#include "mgOnGpuConfig.h"

@@ -13,7 +13,7 @@

//--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -40,7 +40,7 @@ namespace mg5amcCpu
private:
const int m_nevt; // The number of events in each iteration
int m_iiter; // The iteration counter (for random number seeding)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
HostBufferMomenta m_hstMomenta; // Memory buffers for momenta
HostBufferWeights m_hstWeights; // Memory buffers for sampling weights
@@ -105,7 +105,7 @@ namespace mg5amcCpu

extern "C"
{
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
using namespace mg5amcGpu;
#else
using namespace mg5amcCpu;
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc
index d4a760a71b..de327f2321 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
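// (The checkGpu and gpuDeviceReset calls appearing below come from the new GpuRuntime/
// GpuAbstraction headers. A minimal sketch of the kind of mapping they provide - the
// actual macro bodies live in GpuAbstraction.h, which is not part of this hunk:
//
//   #ifdef __CUDACC__
//   #define gpuDeviceReset cudaDeviceReset
//   #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
//   #define gpuMemcpy( dst, src, bytes, dir ) checkGpu( cudaMemcpy( dst, src, bytes, dir ) )
//   #elif defined __HIPCC__
//   #define gpuDeviceReset hipDeviceReset
//   #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
//   #define gpuMemcpy( dst, src, bytes, dir ) checkGpu( hipMemcpy( dst, src, bytes, dir ) )
//   #endif
//
// so that files like Bridge.h and runTest.cc can use one spelling for both backends.)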
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. 
Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

#-------------------------------------------------------------------------------
@@ -45,13 +45,13 @@ endif

#-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA/HIP compiler (note: GPUCC is already exported including ccache)

-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))

#-------------------------------------------------------------------------------

-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)

# Enable ccache if USECCACHE=1
ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
###$(info OMPFLAGS=$(OMPFLAGS))
CXXFLAGS += $(OMPFLAGS)

+# Add the build rule flags appropriate to each GPU compiler (only nvcc needs '-x cu' to compile .cc files as CUDA)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
# Set the build flags appropriate to each AVX choice (example: "make AVX=none")
# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
$(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@

#-------------------------------------------------------------------------------

cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
endif

# Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
else
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
index b247654dcf..da4ba36ad8 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
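// (A quick reference for the dispatch defined below, assuming default compiler behaviour:
// nvcc predefines __CUDACC__, hipcc predefines __HIPCC__, and a plain C++ compiler defines
// neither. An illustrative guard in the same spirit as the SANITY CHECKS later in this file:
//
//   #if defined __CUDACC__ && defined __HIPCC__
//   #error A compiler should never define both __CUDACC__ and __HIPCC__
//   #endif
// )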
#ifndef MGONGPUCONFIG_H
#define MGONGPUCONFIG_H 1

@@ -10,12 +10,25 @@
// There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel)
#undef MGONGPU_SUPPORTS_MULTICHANNEL

+// Is this a GPU (CUDA, HIP) or CPU implementation?
+#ifdef __CUDACC__
+#define MGONGPUCPP_GPUIMPL cuda
+#elif defined __HIPCC__
+#define MGONGPUCPP_GPUIMPL hip
+#else
+#undef MGONGPUCPP_GPUIMPL
+#endif
+
// ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12"
// ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs)

// Choose if curand is supported for generating random numbers
+// For HIP, by default, do not use curand (common random numbers will be used instead)
// For both CUDA and C++, by default, use curand, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND
-// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784)
+// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785)
+#if defined __HIPCC__
+#define MGONGPU_HAS_NO_CURAND 1
+#else
//#ifdef __CUDACC__
//#undef MGONGPU_HAS_NO_CURAND // default
////#define MGONGPU_HAS_NO_CURAND 1
@@ -23,6 +36,7 @@
//#undef MGONGPU_HAS_NO_CURAND // default
////#define MGONGPU_HAS_NO_CURAND 1
//#endif
+#endif

// Choose floating point precision (for everything but color algebra #537)
// If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167)
@@ -54,23 +68,28 @@
//#undef MGONGPU_HARDCODE_PARAM // default
////#define MGONGPU_HARDCODE_PARAM 1

-// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE)
-#ifndef __CUDACC__
-//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float)
-#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
-#endif
-
-// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE)
+// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE)
#ifdef __CUDACC__
#define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float)
//#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float)
//#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
+
+// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE)
+#elif defined __HIPCC__
+#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
+
+// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE)
+#else
+//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float)
+#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
#endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
#ifdef __CUDACC__
-#undef MGONGPU_NSIGHT_DEBUG // default
+#undef MGONGPU_NSIGHT_DEBUG // default in CUDA
//#define MGONGPU_NSIGHT_DEBUG 1
+#else
+#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++
#endif

// SANITY CHECKS (floating point precision for everything but color algebra #537)
@@ -86,17 +105,21 @@
#error You cannot use double precision for color algebra and single precision elsewhere
#endif

-// SANITY CHECKS (c++ complex number implementation)
-#ifndef __CUDACC__
-#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit)
#endif

@@ -145,7 +168,7 @@ using mgOnGpu::fptype;
using mgOnGpu::fptype2;

// C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
#undef MGONGPU_CPPSIMD
#elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
#ifdef MGONGPU_FPTYPE_DOUBLE
@@ -175,9 +198,9 @@ using mgOnGpu::fptype2;
#undef MGONGPU_CPPSIMD
#endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
-#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
+#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -189,8 +212,8 @@ using mgOnGpu::fptype2;
#define mgDebugFinalise() { /*noop*/ }
#endif /* clang-format on */

-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
#define __global__
#define __host__
#define __device__
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
index ca9a9f00c0..5532e22fa1 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
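// (A usage sketch for the cxsmpl type defined in this header - illustrative only; per
// mgOnGpuConfig.h above, cxsmpl is the new default in C++ and the only choice in HIP:
//
//   mgOnGpu::cxsmpl<double> z( 1., 2. ); // z = 1 + 2i
//   std::cout << z << std::endl;         // printed via the operator<< defined below
// )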
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } 
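// (In practice: in CUDA/HIP builds every "scalar-or-vector" type collapses to a scalar,
// i.e. one event per GPU thread with neppV=1, while in SIMD C++ builds it widens to a
// vector of neppV values. An illustrative example for AVX512/double, where neppV=8:
//   fptype_sv me_sv; // holds the MEs of 8 events in C++/512z, of 1 event in CUDA/HIP
// )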
-#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttgg.sa/src/rambo.h b/epochX/cudacpp/gg_ttgg.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 2480a22f8d..f222e5a6b5 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005784511566162109  +DEBUG: model prefixing takes 0.005517005920410156  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.929 s +1 processes with 1240 diagrams generated in 1.861 s Total: 1 processes with 1240 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -192,15 +192,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.718 s -Wrote files for 2281 helas calls in 18.893 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.528 s +Wrote files for 2281 helas calls in 18.450 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.322 s +ALOHA: aloha creates 5 routines in 0.314 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -208,7 +208,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.319 s +ALOHA: aloha creates 10 routines in 0.309 s VVV1 VVV1 FFV1 @@ -257,9 +257,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m29.815s -user 0m29.332s -sys 0m0.380s +real 0m29.049s +user 0m28.554s +sys 0m0.393s Code generation completed in 30 seconds ************************************************************ * * @@ -286,7 +286,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -316,7 +316,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt
-No valid web browser found. Please set in ./input/mg5_configuration.txt
+Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
treatcards param
quit
INFO:
diff --git a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT
index a134b5fef9..84a883fbb0 100644
--- a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT
+++ b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT
@@ -15,6 +15,7 @@ The full development team currently includes the following authors :
 Stephan Hageboeck (CERN)
 Olivier Mattelaer (Universite Catholique de Louvain, original author)
 Stefan Roiser (CERN, original author)
+ Joergen Teig (CERN)
 Andrea Valassi (CERN, original author)
 Zenny Wettersten (CERN)
 See https://github.com/madgraph5/madgraph4gpu for more details. For the full
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h
index bf8b5e024d..89437b4c42 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

#ifndef BRIDGE_H
#define BRIDGE_H 1
@@ -23,7 +23,7 @@
#include
#include

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -83,7 +83,7 @@ namespace mg5amcCpu
Bridge& operator=( const Bridge& ) = delete;
Bridge& operator=( Bridge&& ) = delete;

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
/**
* Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads
* (this is needed for BridgeKernel tests rather than for actual production use in Fortran)
@@ -150,7 +150,7 @@ namespace mg5amcCpu
unsigned int m_nevt; // number of events
int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
int m_gputhreads; // number of gpu threads (default set from number of events, can be modified)
int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified)
DeviceBuffer m_devMomentaF;
@@ -187,12 +187,12 @@ namespace mg5amcCpu
// Forward declare transposition methods
//

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL

template<typename Tin, typename Tout>
__global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );

-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL

template<typename Tin, typename Tout>
void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );
@@ -209,7 +209,7 @@ namespace mg5amcCpu
Bridge<FORTRANFPTYPE>::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F )
: m_nevt( nevtF )
, m_nGoodHel( -1 )
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
, m_gputhreads( 256 ) // default number of gpu threads
, m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads
, m_devMomentaF( m_nevt )
@@ -233,7 +233,7 @@ namespace mg5amcCpu
{
if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" );
if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt %
s_gputhreadsmin != 0 ) )
throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) );
while( m_nevt != m_gpublocks * m_gputhreads )
@@ -249,7 +249,7 @@ namespace mg5amcCpu
#else
std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
// Create a process object, read param card and set parameters
// FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
// FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads?
@@ -262,7 +262,7 @@ namespace mg5amcCpu
process.initProc( paramCard );
}

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
template<typename FORTRANFPTYPE>
void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads )
{
@@ -276,7 +276,7 @@ namespace mg5amcCpu
}
#endif

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
template<typename FORTRANFPTYPE>
void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta,
const FORTRANFPTYPE* gs,
@@ -291,14 +291,14 @@ namespace mg5amcCpu
constexpr int neppM = MemoryAccessMomenta::neppM;
if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
{
- checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
+ gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
}
else
{
- checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
+ gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
//const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
- dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+ gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
}
if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
{
@@ -341,7 +341,7 @@ namespace mg5amcCpu
}
#endif

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
template<typename FORTRANFPTYPE>
void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta,
const FORTRANFPTYPE* gs,
@@ -396,7 +396,7 @@ namespace mg5amcCpu
// - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
//

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
template<typename Tin, typename Tout>
__global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
{
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc
index d58066c9c1..eaf4037a24 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc
@@ -1,17 +1,18 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h
new file mode 100644
index 0000000000..93579ef08b
--- /dev/null
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
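+// An illustrative sketch (not part of this header; hypothetical kernel and function names):
+// code written once against the GpuAbstraction macros above compiles for CUDA, where
+// gpuLaunchKernel( myKernel, blocks, threads, args ) expands to myKernel<<<blocks, threads>>>( args ),
+// and for HIP, where it expands to the equivalent hipcc triple-chevron launch.
+/*
+__global__ void myKernel( double* data, int n )
+{
+  const int i = blockDim.x * blockIdx.x + threadIdx.x;
+  if( i < n ) data[i] *= 2; // scale one element per thread
+}
+void scaleOnDevice( double* data, int n )
+{
+  const int gputhreads = 256;
+  const int gpublocks = ( n + gputhreads - 1 ) / gputhreads; // round up to cover all n elements
+  gpuLaunchKernel( myKernel, gpublocks, gputhreads, data, n );
+  checkGpu( gpuPeekAtLastError() );
+  checkGpu( gpuDeviceSynchronize() );
+}
+*/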
+
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != gpuSuccess )
+  {
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct GpuRuntime final
+  {
+    GpuRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device
- computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+ gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+ gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
#else
- sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+ gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
#endif
- checkCuda( cudaPeekAtLastError() );
+ checkGpu( gpuPeekAtLastError() );
// ... 0d2. Copy back good helicity mask to the host
copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
// ... 0d3. Copy back good helicity list to constant memory on the device
@@ -226,19 +226,19 @@ namespace mg5amcGpu
void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
{
- computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+ gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
#ifndef MGONGPU_NSIGHT_DEBUG
constexpr unsigned int sharedMemSize = 0;
#else
constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+ gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
#else
- sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+ gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
#endif
- checkCuda( cudaPeekAtLastError() );
- checkCuda( cudaDeviceSynchronize() );
+ checkGpu( gpuPeekAtLastError() );
+ checkGpu( gpuDeviceSynchronize() );
}
//--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h
index 23e84757a2..72bd8f195b 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
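// Illustrative sketch (not part of this patch; hypothetical kernel and buffer names): the
// gpuLaunchKernelSharedMem flavour used for sigmaKin above forwards the extra byte count to the
// dynamic-shared-memory slot of the launch configuration, i.e. kernel<<<blocks, threads, sharedMemSize>>>( args ) under CUDA.
/*
__global__ void blockSum( const float* in, float* out )
{
  extern __shared__ float buf[]; // dynamic shared memory, sized at launch time
  const int t = threadIdx.x;
  buf[t] = in[blockDim.x * blockIdx.x + t]; // stage one value per thread
  __syncthreads();
  if( t == 0 )
  {
    float sum = 0;
    for( int i = 0; i < blockDim.x; i++ ) sum += buf[i];
    out[blockIdx.x] = sum; // one partial sum per block
  }
}
const unsigned int sharedMemSize = gputhreads * sizeof( float );
gpuLaunchKernelSharedMem( blockSum, gpublocks, gputhreads, sharedMemSize, devIn, devOut );
checkGpu( gpuPeekAtLastError() );
*/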
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h 
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
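// Illustrative sketch (not part of this header; hypothetical helper): the AOSOA indexing implied
// by the layout momenta[npagM][npar][np4][neppM] with nevt = npagM * neppM noted above, which keeps
// neppM neighbouring events contiguous for coalesced GPU reads and SIMD CPU reads.
/*
inline size_t aosoaIndex( size_t ievt, size_t ipar, size_t ip4, size_t npar, size_t np4, size_t neppM )
{
  const size_t ipagM = ievt / neppM; // page containing this event
  const size_t ieppM = ievt % neppM; // position of this event within the page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}
*/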
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
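// An illustrative sketch (not part of this header) of the namespace-selection idiom repeated in all
// of these files: the same source compiles once into mg5amcGpu and once into mg5amcCpu, so GPU and
// CPU object files can be linked into a single executable without multiply-defined symbols (see #318 and #725).
/*
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  constexpr int example = 1; // mg5amcGpu::example in GPU builds, mg5amcCpu::example in CPU builds
}
*/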
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events 
template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
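// Illustrative sketch (not part of this header; hypothetical class): the RAII pattern used by
// PinnedHostBufferBase and DeviceBufferBase above, with allocation and deallocation routed through
// the gpu* macros so that the same constructor and destructor serve both CUDA and HIP builds.
/*
template<typename T>
struct SimpleDeviceBuffer
{
  SimpleDeviceBuffer( const size_t size ) : m_size( size ) { gpuMalloc( &m_data, m_size * sizeof( T ) ); } // allocate on the device
  ~SimpleDeviceBuffer() { gpuFree( m_data ); } // free automatically when the buffer goes out of scope
  T* m_data = nullptr;
  size_t m_size;
};
*/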
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index 19e6cd201c..a478ecb28e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g g WEIGHTED<=5 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 
0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -30018,7 +30019,7 @@ namespace mg5amcCpu { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -30075,7 +30076,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -30134,7 +30135,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -30293,8 +30294,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1, 1 }, { 1, 1, 1, -1, 1, 1, -1 }, { 1, 1, 1, -1, 1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -30337,9 +30338,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... 
}; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -30378,7 +30379,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -30443,12 +30444,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -30469,7 +30470,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -30595,9 +30596,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -30621,7 +30622,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -30641,7 +30642,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -30655,9 +30656,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -30685,7 +30689,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) 
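// Illustrative sketch (not part of this file; hypothetical symbol, GPU build assumed): the
// constant-memory idiom used above for cHel, cIPD and cGoodHel, where gpuMemcpyToSymbol expands
// to checkGpu( cudaMemcpyToSymbol( ... ) ) under CUDA and to the hipMemcpyToSymbol equivalent,
// while the C++ build emulates constant memory with file-scope static storage and a plain memcpy.
/*
__device__ __constant__ int cExample[4];
void setExample( const int* tExample )
{
  gpuMemcpyToSymbol( cExample, tExample, 4 * sizeof( int ) );
}
*/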
-#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -30895,7 +30899,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index 2565923dde..fff95b66e2 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ 
b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
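GpuRuntime.h above is likewise added only as a symlink, so the class body is not part of the diff. Judging from its usage in check_sa.cc just below (an RAII instance at the top of main) and in fbridge.cc later in this patch (static setUp and tearDown around the Bridge lifetime), a hedged sketch could look like this; gpuSetDevice, gpuDeviceReset and checkGpu are assumed gpu* aliases in the spirit of the abstraction sketched earlier.

// Sketch only: assumed shape of the GpuRuntime RAII helper
#include <cstdio>
#ifdef __CUDACC__
#define gpuSetDevice cudaSetDevice
#elif defined __HIPCC__
#define gpuSetDevice hipSetDevice
#endif
struct GpuRuntime final
{
  GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); }
  ~GpuRuntime() { tearDown( m_debug ); }
  GpuRuntime( const GpuRuntime& ) = delete; // a single owner for the device context
  // For CUDA: select device 0 once, at the start of the application
  static void setUp( const bool debug = true )
  {
    if( debug ) std::printf( "GpuRuntime: setting up device 0\n" );
    checkGpu( gpuSetDevice( 0 ) );
  }
  // For CUDA: reset the device once, on exit (needed e.g. by cuda-memcheck --leak-check full)
  static void tearDown( const bool debug = true )
  {
    if( debug ) std::printf( "GpuRuntime: resetting the device\n" );
    checkGpu( gpuDeviceReset() );
  }
  const bool m_debug;
};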
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" << ( neppR == 1 ? " == AOS" : "" ) << ", " << std::endl << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<<m_gpublocks, m_gputhreads>>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<<m_gpublocks, m_gputhreads>>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
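The RamboSamplingKernels.cc hunks above rewrite each kernel<<<m_gpublocks, m_gputhreads>>>( ... ) launch as gpuLaunchKernel( kernel, blocks, threads, args... ). The macro itself lives in the symlinked GpuAbstraction.h, so the shape below is an assumption rather than the verbatim header: a variadic macro can hide the syntactic difference between the CUDA chevron launch and HIP's hipLaunchKernelGGL.

// Sketch only: assumed shape of the launch wrapper (0 bytes of dynamic shared memory, default stream)
#ifdef __CUDACC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif

Under nvcc, gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ) then expands back to exactly the chevron launch that the old code spelled out.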
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
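For illustration, this is the Fortran-facing lifecycle implied by the fbridge.cc hunks above. The Fortran caller is not part of this diff, and the sizes below are hypothetical placeholders (np4=4 four-momentum components; npar=7 external legs in gg_ttxggg).

// Sketch only: the call sequence seen from the C++ side (signatures follow the extern "C" code above)
struct CppObjectInFortran; // opaque handle type, as in the fbridge sources
extern "C" void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F );
extern "C" void fbridgedelete_( CppObjectInFortran** ppbridge );
void exampleBridgeLifecycle()
{
  CppObjectInFortran* pbridge = nullptr;
  const int nevt = 16384, npar = 7, np4 = 4; // hypothetical sizes
  fbridgecreate_( &pbridge, &nevt, &npar, &np4 ); // on GPU builds: GpuRuntime::setUp(), then Bridge construction
  // ... per iteration: fbridgesequence_( &pbridge, momenta, gs, rndhel, rndcol, &channelId, mes, selhel, selcol ) ...
  fbridgedelete_( &pbridge );                     // on GPU builds: delete the Bridge, then GpuRuntime::tearDown()
}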
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. 
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
#-------------------------------------------------------------------------------
@@ -45,13 +45,13 @@ endif
#-------------------------------------------------------------------------------
-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)
-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))
#-------------------------------------------------------------------------------
-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)
# Enable ccache if USECCACHE=1
ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
###$(info OMPFLAGS=$(OMPFLAGS))
CXXFLAGS += $(OMPFLAGS)
+# Add the correct device compilation flags depending on whether GPUCC is nvcc (CUDA) or hipcc (HIP)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+ GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+ GPUFLAGS += -fPIC -c
+endif
+
# Set the build flags appropriate to each AVX choice (example: "make AVX=none")
# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
# Generic target and build rules: objects from CUDA compilation
$(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
- $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+ $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@
#-------------------------------------------------------------------------------
cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
endif
# Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
- $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+ $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
else
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h
index 80032e528b..55d03f1252 100644
--- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL
+// SANITY CHECKS (CUDA complex number implementation)
+#ifdef __CUDACC__
+#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA
+#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA
+#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA
#endif
#endif
-// SANITY CHECKS (cuda complex number implementation)
-#ifdef __CUDACC__
-#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
+// SANITY CHECKS (C++ complex number implementation)
+#ifndef MGONGPUCPP_GPUIMPL
+#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++
#endif
#endif
@@ -134,7 +157,7 @@ namespace mgOnGpu
// Alignment requirement for using reinterpret_cast with SIMD vectorized code
// (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
// Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e.
512-bit)
#endif
@@ -145,7 +168,7 @@ using mgOnGpu::fptype;
using mgOnGpu::fptype2;
// C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
#undef MGONGPU_CPPSIMD
#elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
#ifdef MGONGPU_FPTYPE_DOUBLE
@@ -175,9 +198,9 @@ using mgOnGpu::fptype2;
#undef MGONGPU_CPPSIMD
#endif
-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
-#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
+#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -189,8 +212,8 @@ using mgOnGpu::fptype2;
#define mgDebugFinalise() { /*noop*/ }
#endif /* clang-format on */
-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
#define __global__
#define __host__
#define __device__
diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h
index ca9a9f00c0..5532e22fa1 100644
--- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
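Before the mgOnGpuCxtypes.h changes continue below, a minimal sketch may help: a hypothetical translation unit (not part of the patch) showing the pattern that the MGONGPUCPP_GPUIMPL guard defined in mgOnGpuConfig.h above enables, and which the remaining hunks apply file by file. Only the guard and the two namespace names come from the patch; backendName() is an invented helper for illustration.

// Hedged sketch, assuming only what mgOnGpuConfig.h defines above:
// MGONGPUCPP_GPUIMPL is set for CUDA (__CUDACC__) and HIP (__HIPCC__)
// builds and is undefined for plain C++ builds.
#include "mgOnGpuConfig.h"
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU build (CUDA or HIP)
#else
namespace mg5amcCpu // CPU build (scalar or SIMD C++)
#endif
{
  // Hypothetical helper, for illustration only
  inline constexpr const char* backendName()
  {
#if defined __CUDACC__
    return "cuda";
#elif defined __HIPCC__
    return "hip";
#else
    return "cpp";
#endif
  }
}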
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu 
return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttggg.mad/src/rambo.h b/epochX/cudacpp/gg_ttggg.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/rambo.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 0970bf8b4c..2720870321 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005753755569458008  +DEBUG: model prefixing takes 0.005664825439453125  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.912 s +1 processes with 1240 diagrams generated in 1.872 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.716 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.609 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.352 s +ALOHA: aloha creates 5 routines in 0.345 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m13.290s -user 0m13.123s -sys 0m0.115s +real 0m12.978s +user 0m12.813s +sys 0m0.111s Code generation completed in 13 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
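The Bridge hunks just below replace raw cudaMemcpy calls and triple-chevron launches with the gpu* wrappers. As a self-contained illustration of that calling convention, here is a hedged sketch: all names (dummyKernel, exampleHostToDevice, hstBuf) are hypothetical, and only the wrappers come from GpuAbstraction.h and GpuRuntime.h.

// Hedged sketch of the new convention: gpuMalloc/gpuMemcpy/gpuLaunchKernel
// expand to the CUDA or HIP runtime equivalents depending on the compiler,
// so the same source builds with both nvcc and hipcc.
#include "mgOnGpuConfig.h" // defines MGONGPUCPP_GPUIMPL for nvcc/hipcc builds
#include "GpuRuntime.h"    // brings in GpuAbstraction.h plus checkGpu/assertGpu
__global__ void dummyKernel( double* buf, int n ) { /* illustration only */ }
void exampleHostToDevice( const double* hstBuf, size_t nbytes, int gpublocks, int gputhreads )
{
  double* devBuf = nullptr;
  gpuMalloc( (void**)&devBuf, nbytes );                       // checked cudaMalloc or hipMalloc
  gpuMemcpy( devBuf, hstBuf, nbytes, gpuMemcpyHostToDevice ); // checked cudaMemcpy or hipMemcpy
  gpuLaunchKernel( dummyKernel, gpublocks, gputhreads, devBuf, int( nbytes / sizeof( double ) ) );
  checkGpu( gpuPeekAtLastError() ); // direct mappings like this one must be wrapped by hand
  gpuFree( devBuf );                // checked cudaFree or hipFree
}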
@@ -262,7 +262,7 @@ namespace mg5amcCpu
process.initProc( paramCard );
}
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
template
void Bridge::set_gpugrid( const int gpublocks, const int gputhreads )
{
@@ -276,7 +276,7 @@ namespace mg5amcCpu
}
#endif
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
template
void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta,
const FORTRANFPTYPE* gs,
@@ -291,14 +291,14 @@ namespace mg5amcCpu
constexpr int neppM = MemoryAccessMomenta::neppM;
if constexpr( neppM == 1 && std::is_same_v )
{
- checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
+ gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
}
else
{
- checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
+ gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
//const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
- dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+ gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
}
if constexpr( std::is_same_v )
{
@@ -341,7 +341,7 @@ namespace mg5amcCpu
}
#endif
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
template
void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta,
const FORTRANFPTYPE* gs,
@@ -396,7 +396,7 @@ namespace mg5amcCpu
// - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
//
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
template
__global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
{
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc
index d58066c9c1..eaf4037a24 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc
@@ -1,17 +1,18 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#include "BridgeKernels.h"
+#include "GpuAbstraction.h"
#include "MemoryAccessMomenta.h"
#include
//============================================================================
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -45,7 +46,7 @@ namespace mg5amcCpu
//============================================================================
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
namespace mg5amcCpu
{
@@ -96,7 +97,7 @@ namespace mg5amcCpu
//============================================================================
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
{
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h
index 15eb4bff4d..3efef8ce97 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h
new file mode 100644
index 0000000000..93579ef08b
--- /dev/null
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != gpuSuccess )
+  {
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct GpuRuntime final
+  {
+    GpuRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device
- computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+ gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+ gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
#else
- sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+ gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
#endif
- checkCuda( cudaPeekAtLastError() );
+ checkGpu( gpuPeekAtLastError() );
// ... 0d2. Copy back good helicity mask to the host
copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
// ... 0d3. Copy back good helicity list to constant memory on the device
@@ -226,19 +226,19 @@
void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
{
- computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+ gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
#ifndef MGONGPU_NSIGHT_DEBUG
constexpr unsigned int sharedMemSize = 0;
#else
constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+ gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
#else
- sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+ gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
#endif
- checkCuda( cudaPeekAtLastError() );
- checkCuda( cudaDeviceSynchronize() );
+ checkGpu( gpuPeekAtLastError() );
+ checkGpu( gpuDeviceSynchronize() );
}
//--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
index 23e84757a2..72bd8f195b 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
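To make the setup and error-handling conventions above concrete, here is a hedged sketch of a hypothetical application main() combining the pieces added in this patch; the surrounding main() is invented, but GpuRuntime, checkGpu and the gpu* calls are exactly those from GpuRuntime.h and GpuAbstraction.h above.

#include "mgOnGpuConfig.h" // defines MGONGPUCPP_GPUIMPL for nvcc/hipcc builds
#include "GpuRuntime.h"
int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  using namespace mg5amcGpu;
  // RAII: the constructor calls gpuSetDevice(0), the destructor books gpuDeviceReset()
  GpuRuntime gpuRuntime( /*debug=*/true );
  // A kernel launch then follows the pattern used in MatrixElementKernels.cc:
  //   gpuLaunchKernel( someKernel, gpublocks, gputhreads, args... ); // hypothetical kernel
  checkGpu( gpuPeekAtLastError() );   // wrappers like gpuMemcpy are already checked internally,
  checkGpu( gpuDeviceSynchronize() ); // but direct mappings like these must be wrapped by hand
#endif
  return 0;
}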
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template 
class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index c2f8607428..fa23301c50 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g g WEIGHTED<=5 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace 
mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -31908,7 +31909,7 @@ namespace mg5amcCpu { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -31965,7 +31966,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -32024,7 +32025,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -32183,8 +32184,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1, 1 }, { 1, 1, 1, -1, 1, 1, -1 }, { 1, 1, 1, -1, 1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -32227,9 +32228,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... 
}; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -32268,7 +32269,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -32333,12 +32334,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -32359,7 +32360,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -32485,9 +32486,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -32511,7 +32512,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -32531,7 +32532,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -32545,9 +32546,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -32575,7 +32579,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) 
-#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -32785,7 +32789,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h index 2565923dde..fff95b66e2 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h new file mode 120000 index 
0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac)
@@ -461,7 +523,7 @@ override RUNTIME =
 
 cxx_main=$(BUILDDIR)/check.exe
 fcxx_main=$(BUILDDIR)/fcheck.exe
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 cu_main=$(BUILDDIR)/gcheck.exe
 fcu_main=$(BUILDDIR)/fgcheck.exe
 else
@@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG):
 	@touch $(BUILDDIR)/.build.$(TAG)
 
 # Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@
 
 $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
 endif
+# (NB: for nvcc, CCBUILDRULEFLAGS in the rule above adds '-x cu' to compile .cc files as CUDA)
 
 # Generic target and build rules: objects from C++ compilation
 # (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@
 
 # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516)
+# Added edge case for HIP compilation (hipcc takes -fno-fast-math directly, without -Xcompiler)
 ifeq ($(shell $(CXX) --version | grep ^nvc++),)
 $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS))
 $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
 endif
 endif
 
@@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand)
 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
 endif
 
-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592)
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592)
 ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),)
-ifneq ($(NVCC),)
-CUFLAGS += -Xcompiler -Wno-deprecated-builtins
+ifneq ($(GPUCC),)
+GPUFLAGS += -Wno-deprecated-builtins
 endif
 endif
 
@@ -541,8 +607,8 @@ endif
 # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
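# NOTE (editor): a usage sketch of how the GPUCC selection above is meant to be
# driven from the command line; the installation paths are examples only, not
# defaults assumed by this makefile:
#   make                                      # Option 1: use nvcc from CUDA_HOME or from PATH
#   make CUDA_HOME=none HIP_HOME=/opt/rocm    # Option 2: invalidate CUDA so that hipcc is picked up
#   make REQUIRE_CUDA=1                       # fail if no CUDA is found (e.g. CI tests on GPU #443)
#   make REQUIRE_HIP=1                        # fail if no HIP is found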
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
 $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
 	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
 else # link both runTest.o and runTest_cu.o
 $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
 endif
 
 # Use target gtestlibs to build only googletest
@@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1)
 	ccache --version | head -1
 endif
 	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
 endif
 	@echo ""
 	@echo CXX=$(CXX)
@@ -850,7 +916,7 @@ endif
 
 # Target: check (run the C++ test executable)
 # [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 check: runTest cmpFcheck cmpFGcheck
 else
 check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
 
 extern "C"
 {
@@ -22,7 +22,7 @@ extern "C"
  * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
  * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
 
 #-------------------------------------------------------------------------------
 
@@ -45,13 +45,13 @@ endif
 
 #-------------------------------------------------------------------------------
 
-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)
 
-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))
 
 #-------------------------------------------------------------------------------
 
-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)
 
 # Enable ccache if USECCACHE=1
 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
 ###$(info OMPFLAGS=$(OMPFLAGS))
 CXXFLAGS += $(OMPFLAGS)
 
+# Add the correct build-rule flags for the GPU compiler (for nvcc, '-x cu' compiles .cc files as CUDA; hipcc needs only -fPIC -c)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
 # Set the build flags appropriate to each AVX choice (example: "make AVX=none")
 # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
 # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
 
 # Generic target and build rules: objects from CUDA compilation
 $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@
 
 #-------------------------------------------------------------------------------
 
 cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
 endif
 
 # Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
 	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
 else
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
 	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h
index b247654dcf..da4ba36ad8 100644
--- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
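// NOTE (editor): the hunks below replace most uses of the nvcc-specific macro
// __CUDACC__ with a new MGONGPUCPP_GPUIMPL macro, defined when either nvcc
// (__CUDACC__) or hipcc (__HIPCC__) is compiling the code. A self-contained
// probe illustrating the three build modes (illustrative only, not in the patch):
//   #include <cstdio>
//   int main()
//   {
//   #if defined __CUDACC__
//     std::printf( "compiled as CUDA\n" );      // e.g. "nvcc -x cu probe.cc"
//   #elif defined __HIPCC__
//     std::printf( "compiled as HIP\n" );       // e.g. "hipcc probe.cc"
//   #else
//     std::printf( "compiled as plain C++\n" ); // e.g. "g++ probe.cc"
//   #endif
//     return 0;
//   }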
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit)
 #endif
 
@@ -145,7 +168,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;
 
 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -175,9 +198,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif
 
-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
 #if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -189,8 +212,8 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */
 
-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
 #define __global__
 #define __host__
 #define __device__
diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h
index ca9a9f00c0..5532e22fa1 100644
--- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
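// NOTE (editor): as configured in mgOnGpuConfig.h above, HIP builds can use
// neither thrust::complex (CUDA-only) nor std::complex (host-oriented), so the
// header-only cxsmpl class in this file is the only complex type available on
// HIP. A minimal sketch of why a POD-style complex compiles on any backend
// (toy_cx is a hypothetical name; the real implementation is cxsmpl below):
//   template<typename FP>
//   struct toy_cx
//   {
//     FP r, i; // plain data members, no runtime library dependencies
//   };
//   template<typename FP>
//   constexpr toy_cx<FP> operator*( const toy_cx<FP>& a, const toy_cx<FP>& b )
//   {
//     return { a.r * b.r - a.i * b.i, a.r * b.i + a.i * b.r }; // plain arithmetic: valid in CUDA, HIP and C++
//   }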
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return 
mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttggg.sa/src/rambo.h b/epochX/cudacpp/gg_ttggg.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 2c0e77fafd..bb803498ee 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005677223205566406  +DEBUG: model prefixing takes 0.005455732345581055  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. 
INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.080 s +8 processes with 40 diagrams generated in 0.077 s Total: 8 processes with 40 diagrams output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -198,7 +198,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -215,7 +215,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -230,17 +230,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -Generated helas calls for 2 subprocesses (10 diagrams) in 0.032 s -Wrote files for 32 helas calls in 0.231 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s +Wrote files for 32 helas calls in 0.216 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.364 s +ALOHA: aloha creates 2 routines in 0.143 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.137 s +ALOHA: aloha creates 4 routines in 0.130 s FFV1 FFV1 FFV1 @@ -294,10 +294,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.934s -user 0m1.748s -sys 0m0.220s -Code generation completed in 3 seconds +real 0m1.916s +user 0m1.672s +sys 0m0.240s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -323,7 +323,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -353,7 +353,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
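// NOTE (editor): the hunks below switch Bridge.h to the convention, used across
// all these sources, of opening namespace mg5amcGpu in GPU builds and mg5amcCpu
// in C++ builds: the same header then yields distinct, non-clashing symbols in
// the _cuda and _cpp shared libraries (see #318 and #725). A minimal sketch
// (ExampleBuffer is a hypothetical type, not from the patch):
//   #ifdef MGONGPUCPP_GPUIMPL
//   namespace mg5amcGpu
//   #else
//   namespace mg5amcCpu
//   #endif
//   {
//     struct ExampleBuffer { double* data; }; // defined once per backend, no ODR clash
//   }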
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename Tin, typename Tout> __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template<typename Tin, typename Tout> void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge<FORTRANFPTYPE>::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads?
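Note on the recurring pattern in this and the following files: every CUDA-specific #ifdef __CUDACC__ guard becomes the backend-neutral #ifdef MGONGPUCPP_GPUIMPL, so that a single guard selects the GPU namespace for CUDA and HIP builds alike. A minimal sketch of such a guard, assuming it is set whenever either GPU compiler is active (the diff itself does not show where MGONGPUCPP_GPUIMPL is actually defined):

    #if defined( __CUDACC__ ) || defined( __HIPCC__ )
    #define MGONGPUCPP_GPUIMPL 1 // sketch only: enable the GPU code paths for either backend
    #endif

    #ifdef MGONGPUCPP_GPUIMPL
    namespace mg5amcGpu { /* device implementation, CUDA or HIP */ }
    #else
    namespace mg5amcCpu { /* host implementation */ }
    #endif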
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { @@ -341,7 +341,7 @@ } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename Tin, typename Tout> __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
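The Bridge.h hunks above show the mechanical call-site translation applied throughout this patch: explicit checkCuda( cuda... ) calls become gpu... macros with the error check folded in, and triple-chevron kernel launches become gpuLaunchKernel calls. A before/after sketch, assuming the GpuAbstraction.h macros are in scope (myKernel, devBuf, hstBuf and the size/grid parameters are placeholders, not names from this diff):

    // Illustrative only: the same copy + launch, in the old and new spelling.
    __global__ void myKernel( fptype* buf, int nevt ) { /* ... */ }
    void copyAndLaunch( fptype* devBuf, const fptype* hstBuf, size_t nBytes, int nBlocks, int nThreads, int nEvt )
    {
      // before: checkCuda( cudaMemcpy( devBuf, hstBuf, nBytes, cudaMemcpyHostToDevice ) );
      gpuMemcpy( devBuf, hstBuf, nBytes, gpuMemcpyHostToDevice ); // expands to the checked cuda or hip memcpy
      // before: myKernel<<<nBlocks, nThreads>>>( devBuf, nEvt );
      gpuLaunchKernel( myKernel, nBlocks, nThreads, devBuf, nEvt ); // expands to the triple-chevron launch
    }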
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
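Taken together, the GpuAbstraction.h macros above cover the whole allocate/copy/launch/synchronize/free cycle, with error checking built into the allocation, copy and free macros via checkGpu (defined in GpuRuntime.h, next). A usage sketch under assumed names (runOnce, myKernel, hstBuf and the grid parameters are placeholders, not symbols from this patch):

    __global__ void myKernel( fptype* buf, int nevt ); // assumed kernel, defined elsewhere
    void runOnce( const fptype* hstBuf, int nBlocks, int nThreads )
    {
      const int nEvt = nBlocks * nThreads;           // one event per GPU thread
      const size_t nBytes = nEvt * sizeof( fptype );
      fptype* devBuf = nullptr;
      gpuMalloc( (void**)&devBuf, nBytes );                         // checked cudaMalloc or hipMalloc
      gpuMemcpy( devBuf, hstBuf, nBytes, gpuMemcpyHostToDevice );   // checked host-to-device copy
      gpuLaunchKernel( myKernel, nBlocks, nThreads, devBuf, nEvt ); // myKernel<<<nBlocks, nThreads>>>( devBuf, nEvt )
      checkGpu( gpuDeviceSynchronize() ); // not pre-wrapped in checkGpu, unlike gpuMalloc/gpuMemcpy/gpuFree
      gpuFree( devBuf );                  // checked cudaFree or hipFree
    }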
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! 
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device -    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); +    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); +    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else -    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); +    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif -    checkCuda( cudaPeekAtLastError() ); +    checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { -    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); +    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); +    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else -    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); +    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif -    checkCuda( cudaPeekAtLastError() ); -    checkCuda( cudaDeviceSynchronize() ); +    checkGpu( gpuPeekAtLastError() ); +    checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
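The two checks at the end of computeMatrixElements above follow the usual two-step idiom for asynchronous launches: gpuPeekAtLastError catches launch-configuration errors that are reported synchronously at launch time, while gpuDeviceSynchronize is needed to surface errors raised during kernel execution. Schematically (someKernel and its arguments are placeholders):

    gpuLaunchKernel( someKernel, nBlocks, nThreads, devArgs ); // asynchronous launch
    checkGpu( gpuPeekAtLastError() );   // reports launch errors (invalid grid/block size, ...)
    checkGpu( gpuDeviceSynchronize() ); // reports errors raised while the kernel was running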
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
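The ievt computation shown in KernelAccessHelper above encodes the one-event-per-thread convention used by all GPU code paths; the blockDim/blockIdx/threadIdx built-ins are provided by both CUDA and HIP, so the expression needs no abstraction macro. A self-contained sketch (the kernel and buffers are illustrative; fptype is the project's floating-point typedef):

    __global__ void copyMEs( const fptype* in, fptype* out, const int nevt )
    {
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
      if( ievt < nevt ) out[ievt] = in[ievt];                 // one event handled per GPU thread
    }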
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class 
HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
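The buffer classes above tie allocation and deallocation to object lifetime (RAII), so the checked gpu* macros replace the old checkCuda( cuda* ) pairs in exactly one constructor/destructor per buffer kind. A reduced sketch of the pattern, with an illustrative name and template parameter (the real classes carry additional size and alignment parameters):

    template<typename T>
    class DeviceBufferSketch
    {
    public:
      explicit DeviceBufferSketch( const size_t size ) : m_size( size )
      {
        gpuMalloc( (void**)&m_data, m_size * sizeof( T ) ); // checked cudaMalloc or hipMalloc
      }
      ~DeviceBufferSketch() { gpuFree( m_data ); } // checked deallocation, never leaks on early return
      T* data() { return m_data; }
    private:
      T* m_data = nullptr;
      size_t m_size;
    };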
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 6242b019fa..a376b0c455 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ d WEIGHTED<=3 @1 // Process: g s > t t~ s WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START 
LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, -1, 1, 1, 1 }, { 1, -1, 1, -1, -1 }, { 1, -1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +819,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +845,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +865,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +879,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index bf037c6c28..ce22572055 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc +++ 
b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
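Note on the abstraction layer used in the hunks above: calls like checkCuda( cudaMemcpy( ... ) ) and checkCuda( cudaMemcpyToSymbol( ... ) ) become bare gpuMemcpy( ... ) and gpuMemcpyToSymbol( ... ), so the new GpuAbstraction.h (added above only as a symlink; its contents are not shown in this patch) must both rename the runtime entry points and absorb the error check. A minimal illustrative sketch of such wrappers, assuming a checkGpu helper whose name is not confirmed by this patch:

  #include <cstdio>
  #include <cstdlib>
  #ifdef __CUDACC__
  #include <cuda_runtime.h>
  // Abort on any runtime error, as the old checkCuda() wrapper did.
  #define checkGpu( code ) \
    do { cudaError_t err = ( code ); \
         if( err != cudaSuccess ) { fprintf( stderr, "GPU error: %s\n", cudaGetErrorString( err ) ); exit( EXIT_FAILURE ); } } while( 0 )
  // Backend-neutral names used throughout this patch, mapped onto CUDA here;
  // a HIP build would map the same names onto hipMemcpy and friends.
  #define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
  #define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( cudaMemcpyToSymbol( symbol, src, bytes ) )
  #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
  #endif

With this shape the call sites carry neither backend names nor explicit error handling, which is exactly how the rewritten lines above read.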
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 90788b2c75..41f17b9fb0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ d~ WEIGHTED<=3 @1 // Process: g s~ > t t~ s~ WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ 
-203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, -1 }, { 1, 1, 1, -1, 1 }, { 1, 1, 1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +819,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +845,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +865,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +879,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index 0f49f5247b..46c4347506 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
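For orientation, the '00 CudaInit' to '00 GpuInit' hunks (above for P1_gu_ttxu, and again below for P1_gux_ttxux) swap the CudaRuntime RAII helper for a backend-neutral GpuRuntime, which this patch adds only as a symlink to ../GpuRuntime.h. The following is a minimal RAII sketch of what such a helper typically does; the gpuSetDevice/gpuDeviceReset aliases are assumptions for illustration, not names confirmed by this patch:

  #include <cstdio>
  #ifdef __CUDACC__
  #define gpuSetDevice cudaSetDevice
  #define gpuDeviceReset cudaDeviceReset
  #elif defined __HIPCC__
  #include "hip/hip_runtime.h"
  #define gpuSetDevice hipSetDevice
  #define gpuDeviceReset hipDeviceReset
  #endif
  struct GpuRuntime
  {
    GpuRuntime( bool debug = false ) : m_debug( debug )
    {
      gpuSetDevice( 0 ); // take the first device when main() begins
      if( m_debug ) printf( "GpuRuntime: device 0 initialised\n" );
    }
    ~GpuRuntime()
    {
      gpuDeviceReset(); // release the device context when main() returns
      if( m_debug ) printf( "GpuRuntime: device reset\n" );
    }
    GpuRuntime( const GpuRuntime& ) = delete; // single owner of the context
    GpuRuntime& operator=( const GpuRuntime& ) = delete;
    bool m_debug;
  };

This mirrors the usage in the hunks: one stack instance at the top of main() ('GpuRuntime GpuRuntime( debug )'), so device setup and teardown are tied to the application lifetime.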
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
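
Throughout this series, every C++ header and source file gains the same two-sided guard: the new MGONGPUCPP_GPUIMPL macro (defined for both nvcc and hipcc builds in the mgOnGpuConfig.h hunks further below) takes over the role of the CUDA-only __CUDACC__. A condensed sketch of the recurring pattern, assembled from the hunks in this patch rather than copied verbatim from any single file:

// Recurring guard pattern: one source tree, three compilation modes.
// MGONGPUCPP_GPUIMPL is set for CUDA (__CUDACC__) and HIP (__HIPCC__) builds,
// and undefined for plain C++ builds (see the mgOnGpuConfig.h hunks below).
#include "mgOnGpuConfig.h"
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // device implementation (CUDA or HIP)
#else
namespace mg5amcCpu // host implementation (scalar or SIMD C++)
#endif
{
  // ... identical physics code, compiled for the selected backend ...
}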
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
endif

# Use target gtestlibs to build only googletest
@@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1)
	ccache --version | head -1
endif
	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
endif
	@echo ""
	@echo CXX=$(CXX)
@@ -850,7 +916,7 @@ endif

# Target: check (run the C++ test executable)
# [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
check: runTest cmpFcheck cmpFGcheck
else
check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

#include "Bridge.h"
#include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

extern "C"
{
@@ -22,7 +22,7 @@ extern "C"
 * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
 * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
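
In the fbridge.cc hunks above, the renamed GpuRuntime replaces CudaRuntime around the bridge lifetime. The GpuRuntime.h header itself is not part of this patch; a minimal sketch of the interface these calls rely on, under the assumption that it simply wraps the backend runtime, would be:

// Assumed shape of GpuRuntime.h (not shown in this patch): setUp() warms up
// the CUDA/HIP context before the first kernel launch, tearDown() releases it.
struct GpuRuntime final
{
  static void setUp() { /* e.g. a first dummy runtime call to create the context */ }
  static void tearDown() { checkGpu( gpuDeviceReset() ); } // cf. the DeviceReset helper in runTest.cc
};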
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h index cd4e6de668..45000c7246 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc index c06dcbb252..8b92ea0bd6 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. 
Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h index a6eb185434..a3615ec77a 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

#-------------------------------------------------------------------------------

@@ -45,13 +45,13 @@ endif

#-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)

-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))

#-------------------------------------------------------------------------------

-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)

# Enable ccache if USECCACHE=1
ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
###$(info OMPFLAGS=$(OMPFLAGS))
CXXFLAGS += $(OMPFLAGS)

+# Add the build-rule flags appropriate to the GPU compiler in use (nvcc needs '-x cu' to compile .cc files as CUDA, hipcc does not)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
# Set the build flags appropriate to each AVX choice (example: "make AVX=none")
# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
$(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@

#-------------------------------------------------------------------------------

cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
endif

# Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
else
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h
index 80032e528b..55d03f1252 100644
--- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
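
The first functional change inside mgOnGpuConfig.h (next hunks) concerns random numbers: HIP has no curand, so the build must fall back to the common host-side generator. In condensed form, with the call-site comments being illustrative assumptions rather than verbatim code:

// Condensed sketch of the curand switch added below: hipcc defines __HIPCC__,
// and there is no curand for AMD GPUs, so MGONGPU_HAS_NO_CURAND is forced on.
#if defined __HIPCC__
#define MGONGPU_HAS_NO_CURAND 1 // forced for HIP builds
#endif
#ifndef MGONGPU_HAS_NO_CURAND
// -> CurandRandomNumberKernel (CUDA only; cf. gCurandRandomNumberKernel.o above)
#else
// -> CommonRandomNumberKernel (portable C++ generator, also the HIP path,
//    and the path for curand-less CUDA installations, see PR #784 and #785)
#endif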
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
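
Taken together, the mgOnGpuConfig.h hunks above and the mgOnGpuCxtypes.h typedefs below reduce to a single three-way choice; the sketch below consolidates them and restores the complex-number template arguments that this copy of the patch has lost (fptype is double or float according to MGONGPU_FPTYPE):

// Net effect of the configuration hunks (consolidated sketch):
#if defined __CUDACC__
#define MGONGPUCPP_GPUIMPL cuda
typedef thrust::complex<fptype> cxtype; // CUDA default (MGONGPU_CUCXTYPE_THRUST)
#elif defined __HIPCC__
#define MGONGPUCPP_GPUIMPL hip
typedef mgOnGpu::cxsmpl<fptype> cxtype; // only option on HIP (MGONGPU_CUCXTYPE_CXSMPL)
#else
#undef MGONGPUCPP_GPUIMPL
typedef mgOnGpu::cxsmpl<fptype> cxtype; // new C++ default (MGONGPU_CPPCXTYPE_CXSMPL)
#endif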
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } 
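
The next hunks in mgOnGpuVectors.h re-guard the scalar-or-vector ("_sv") types with the same macro. A reconstructed sketch of that pattern (the vector_size syntax follows the upstream header; on a GPU each thread handles one event, so no SIMD is needed):

// Scalar-or-vector sketch: scalar on GPU, gcc/clang vector extension on CPU.
#ifdef MGONGPUCPP_GPUIMPL
typedef fptype fptype_sv; // one event per GPU thread
#elif defined MGONGPU_CPPSIMD
typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) );
typedef fptype_v fptype_sv; // neppV events per SIMD vector
#else
typedef fptype fptype_sv; // scalar C++ fallback (neppV == 1)
#endif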
-#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gq_ttq.mad/src/rambo.h b/epochX/cudacpp/gq_ttq.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/rambo.h +++ b/epochX/cudacpp/gq_ttq.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index f659f6bb8d..5a07808142 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0054836273193359375  +DEBUG: model prefixing takes 0.005926370620727539  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. 
INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.080 s +8 processes with 40 diagrams generated in 0.082 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT @@ -211,7 +211,7 @@ Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.146 s +ALOHA: aloha creates 2 routines in 0.179 s FFV1 FFV1 FFV1 @@ -227,7 +227,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m0.709s -user 0m0.586s -sys 0m0.064s -Code generation completed in 0 seconds +real 0m1.076s +user 0m0.601s +sys 0m0.061s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
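//--------------------------------------------------------------------------
// (Illustrative sketch, mirroring the Bridge constructor logic above) The
// Bridge requires nevt == gpublocks * gputhreads; starting from the default
// of 256 threads per block, the thread count is halved until the product
// matches. A hypothetical stand-alone helper with the same arithmetic
// (the real code additionally enforces the s_gputhreadsmin lower bound):
//
//   inline void chooseGrid( unsigned int nevt, int& blocks, int& threads )
//   {
//     threads = 256;           // default number of gpu threads
//     blocks = nevt / threads; // this ensures nevt <= blocks * threads
//     while( nevt != (unsigned int)( blocks * threads ) )
//     {
//       threads /= 2;          // halve until blocks * threads == nevt
//       blocks = nevt / threads;
//     }
//   }
//--------------------------------------------------------------------------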
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
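//--------------------------------------------------------------------------
// (Illustrative sketch) The kernel wrapper classes above come in host/device
// pairs, each visible only to the matching build, e.g. in BridgeKernels.h:
//
//   #ifndef MGONGPUCPP_GPUIMPL
//   class BridgeKernelHost final : public BridgeKernelBase { /* CPU path */ };
//   #endif
//   #ifdef MGONGPUCPP_GPUIMPL
//   class BridgeKernelDevice : public BridgeKernelBase { /* GPU path */ };
//   #endif
//--------------------------------------------------------------------------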
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
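+
+// (Illustrative note) This new header defines a single backend-neutral
+// vocabulary (gpuError_t, gpuMemcpy, gpuLaunchKernel, ...) and maps it onto
+// the CUDA runtime API under __CUDACC__ or onto the HIP runtime API under
+// __HIPCC__, so that the SubProcesses code can be written once against the
+// gpu* names and compiled for either vendor.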
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include <cassert>
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h
new file mode 100644
index 0000000000..93579ef08b
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
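+
+// (Illustrative sketch) With the GpuAbstraction.h macros, a CUDA-only call
+// sequence such as
+//   checkCuda( cudaMemcpy( dst, src, bytes, cudaMemcpyHostToDevice ) );
+//   kernel<<<blocks, threads>>>( args );
+// is written backend-neutrally as
+//   gpuMemcpy( dst, src, bytes, gpuMemcpyHostToDevice );
+//   gpuLaunchKernel( kernel, blocks, threads, args );
+// and expands to the matching CUDA or HIP runtime calls, error-checked via
+// the checkGpu wrapper defined below.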
+
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include <iostream>
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != gpuSuccess )
+  {
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct GpuRuntime final
+  {
+    GpuRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
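//--------------------------------------------------------------------------
// (Illustrative sketch, assuming a standalone driver) GpuRuntime above is an
// RAII guard meant to be instantiated once at the top of main(): the
// constructor calls gpuSetDevice(0) so that one-off initialisation cost is
// booked early, and the destructor calls gpuDeviceReset() so that leak
// checkers see a clean shutdown:
//
//   #include "GpuRuntime.h"
//   int main( int argc, char** argv )
//   {
//   #ifdef MGONGPUCPP_GPUIMPL
//     mg5amcGpu::GpuRuntime gpuRuntime; // setUp now, tearDown at scope exit
//   #endif
//     // ... allocate buffers, launch kernels, read back results ...
//     return 0;
//   }
//--------------------------------------------------------------------------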
Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
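//--------------------------------------------------------------------------
// (Illustrative sketch) In the kernel access helpers above, one GPU thread
// processes one event, so the event index is recovered from the grid
// coordinates, while the CPU build iterates over events explicitly:
//
//   #ifdef MGONGPUCPP_GPUIMPL
//   const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // 1 thread = 1 event
//   #else
//   for( int ievt = 0; ievt < nevt; ++ievt ) { /* ... access event ievt ... */ }
//   #endif
//--------------------------------------------------------------------------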
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
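//--------------------------------------------------------------------------
// (Illustrative sketch) The AOSOA layout momenta[npagM][npar][np4][neppM]
// referenced above keeps the neppM events of one page contiguous, so that
// consecutive GPU threads read consecutive fptype values (coalesced access).
// Index arithmetic for one momentum component, with nevt = npagM * neppM:
//
//   const int ipagM = ievt / neppM; // page index
//   const int ieppM = ievt % neppM; // event index within the page
//   const fptype& p = buffer[ipagM * npar * np4 * neppM
//                            + ipar * np4 * neppM + ip4 * neppM + ieppM];
//--------------------------------------------------------------------------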
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer 
: public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
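//--------------------------------------------------------------------------
// (Illustrative sketch) The buffer classes above tie GPU allocations to C++
// object lifetime, so device memory cannot leak on an early return or an
// exception. The essential RAII pattern, stripped of the class hierarchy
// (a hypothetical stand-alone analogue, not part of the patch):
//
//   struct ScratchDeviceBuffer
//   {
//     fptype* data = nullptr;
//     ScratchDeviceBuffer( size_t n ) { gpuMalloc( &data, n * sizeof( fptype ) ); }
//     ~ScratchDeviceBuffer() { gpuFree( data ); }
//   };
//--------------------------------------------------------------------------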
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc index 90e90b3aa9..c1543791ca 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ d WEIGHTED<=3 @1 // Process: g s > t t~ s WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < 
nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -336,7 +337,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -393,7 +394,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -452,7 +453,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -515,8 +516,8 @@ namespace mg5amcCpu { 1, -1, 1, 1, 1 }, { 1, -1, 1, -1, -1 }, { 1, -1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -557,9 +558,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -596,7 +597,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -661,12 +662,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -687,7 +688,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -813,9 +814,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -839,7 +840,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -859,7 +860,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -873,9 +874,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -903,7 +907,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1113,7 +1117,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h index bf037c6c28..ce22572055 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc index 
3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
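// Note on the new guard: MGONGPUCPP_GPUIMPL is not defined in any of the hunks
// shown in this patch. A minimal sketch of the assumed definition (presumably in
// mgOnGpuConfig.h): set one common flag whenever either GPU compiler is active,
// so that a single ifdef covers both the CUDA and the HIP backends.
#if defined __CUDACC__ or defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1 // building for a GPU backend (CUDA or HIP)
#endif
// CUDA-only features (curand, NVTX, CUDA-specific complex types) keep testing
// __CUDACC__ directly, while backend-neutral GPU code now tests MGONGPUCPP_GPUIMPL.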
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index 76c9403933..a9294d1fea 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ d~ WEIGHTED<=3 @1 // Process: g s~ > t t~ s~ WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both 
dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -336,7 +337,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -393,7 +394,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -452,7 +453,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -515,8 +516,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, -1 }, { 1, 1, 1, -1, 1 }, { 1, 1, 1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -557,9 +558,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -596,7 +597,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -661,12 +662,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -687,7 +688,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -813,9 +814,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -839,7 +840,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -859,7 +860,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -873,9 +874,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -903,7 +907,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1113,7 +1117,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h index 0f49f5247b..46c4347506 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc 
b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+ HIPINC = -I$(HIP_HOME)/include/
+ # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP
+ # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+ GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+ ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+ GPUFLAGS += -std=c++17
+ ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?)
+ CUBUILDRULEFLAGS = -fPIC -c
+ CCBUILDRULEFLAGS = -fPIC -c
+
+else ifneq ($(origin REQUIRE_HIP),undefined)
+
+ # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443)
+ $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
+
+#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP
+
else
- # No cuda. Switch cuda compilation off and go to common random numbers in C++
+
+ # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++
  $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
- override NVCC=
+ $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
+ override GPUCC=
  override USE_NVTX=
  override CUINC=
  override CURANDLIBFLAGS=
-endif
-export NVCC
-export CUFLAGS
-
-# Set the host C++ compiler for nvcc via "-ccbin "
-# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
-# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-CUFLAGS += -allow-unsupported-compiler
endif

+# Export GPUCC (so that it can also be used in cudacpp_src.mk?)
+export GPUCC
+export GPUFLAGS
+
#-------------------------------------------------------------------------------

-#=== Configure ccache for C++ and CUDA builds
+#=== Configure ccache for C++ and CUDA/HIP builds

# Enable ccache if USECCACHE=1
ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -201,15 +260,15 @@ endif
#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
# override AR:=ccache $(AR)
#endif
-ifneq ($(NVCC),)
- ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
- override NVCC:=ccache $(NVCC)
+ifneq ($(GPUCC),)
+ ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
+ override GPUCC:=ccache $(GPUCC)
  endif
endif

#-------------------------------------------------------------------------------

-#=== Configure PowerPC-specific compiler flags for C++ and CUDA
+#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP

# PowerPC-specific CXX compiler flags (being reviewed)
ifeq ($(UNAME_P),ppc64le)
@@ -225,9 +284,9 @@ else
######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto)
endif

-# PowerPC-specific CUDA compiler flags (to be reviewed!)
+# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le)
- CUFLAGS+= -Xcompiler -mno-float128
+ GPUFLAGS+= -Xcompiler -mno-float128
endif

#-------------------------------------------------------------------------------
@@ -237,7 +296,7 @@ endif
# Set the default OMPFLAGS choice
ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
override OMPFLAGS = -fopenmp
-###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578)
+###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578)
else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),)
override OMPFLAGS = -fopenmp
###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578)
@@ -293,7 +352,10 @@ endif
# Set the default RNDGEN (random number generator) choice
ifeq ($(RNDGEN),)
- ifeq ($(NVCC),)
+ ifeq ($(GPUCC),)
+ override RNDGEN = hasNoCurand
+ # Edge case for HIP compilation
+ else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
  override RNDGEN = hasNoCurand
  else ifeq ($(RNDGEN),)
  override RNDGEN = hasCurand
@@ -310,7 +372,7 @@ export OMPFLAGS

#-------------------------------------------------------------------------------

-#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN
+#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN

# Set the build flags appropriate to OMPFLAGS
$(info OMPFLAGS=$(OMPFLAGS))
@@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS)
$(info FPTYPE=$(FPTYPE))
ifeq ($(FPTYPE),d)
  CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
- CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
+ GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
else ifeq ($(FPTYPE),f)
  CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
- CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
+ GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
else ifeq ($(FPTYPE),m)
  CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
- CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
+ GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
else
  $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported)
endif
@@ -383,7 +445,7 @@ endif
$(info HELINL=$(HELINL))
ifeq ($(HELINL),1)
  CXXFLAGS += -DMGONGPU_INLINE_HELAMPS
- CUFLAGS += -DMGONGPU_INLINE_HELAMPS
+ GPUFLAGS += -DMGONGPU_INLINE_HELAMPS
else ifneq ($(HELINL),0)
  $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported)
endif
@@ -392,7 +454,7 @@ endif
$(info HRDCOD=$(HRDCOD))
ifeq ($(HRDCOD),1)
  CXXFLAGS += -DMGONGPU_HARDCODE_PARAM
- CUFLAGS += -DMGONGPU_HARDCODE_PARAM
+ GPUFLAGS += -DMGONGPU_HARDCODE_PARAM
else ifneq ($(HRDCOD),0)
  $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported)
endif
@@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin)
  override CULIBFLAGSRPATH2 =
else
  # RPATH to cuda/cpp libs when linking executables
- override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH)
- override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH)
+ override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH)
+ override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH)
  # RPATH to common lib when linking cuda/cpp libs
- override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN'
- override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN'
+ override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN'
+ override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN'
endif

# Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary
(neither on Linux nor on Mac)
@@ -461,7 +523,7 @@ override RUNTIME =

cxx_main=$(BUILDDIR)/check.exe
fcxx_main=$(BUILDDIR)/fcheck.exe

-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_main=$(BUILDDIR)/gcheck.exe
fcu_main=$(BUILDDIR)/fgcheck.exe
else
@@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG):
	@touch $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@

$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
endif
+# (the -x cu flag, which compiles .cc files as CUDA, is included in CCBUILDRULEFLAGS in the rule above)

# Generic target and build rules: objects from C++ compilation
# (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@

# Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516)
+# Added an edge case for HIP compilation
ifeq ($(shell $(CXX) --version | grep ^nvc++),)
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS))
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+ $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+ $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
endif
endif
@@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand)
$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
endif

-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592)
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592)
ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),)
-ifneq ($(NVCC),)
-CUFLAGS += -Xcompiler -Wno-deprecated-builtins
+ifneq ($(GPUCC),)
+GPUFLAGS += -Wno-deprecated-builtins
endif
endif
@@ -541,8 +607,8 @@ endif
# This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
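// NB: a minimal sketch (not part of the generated code) of how the single GPUCC
// makefile variable above maps onto source-level backend selection: nvcc
// predefines __CUDACC__, hipcc predefines __HIPCC__, and a plain C++ compiler
// predefines neither. The mgOnGpuConfig.h hunks later in this patch implement
// exactly this pattern via the MGONGPUCPP_GPUIMPL macro.
#if defined __CUDACC__
#define MGONGPUCPP_GPUIMPL cuda // CUDA build: GPUCC=nvcc
#elif defined __HIPCC__
#define MGONGPUCPP_GPUIMPL hip // HIP build: GPUCC=hipcc
#else
#undef MGONGPUCPP_GPUIMPL // CPU-only build: GPUCC is empty
#endif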
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
endif

# Use target gtestlibs to build only googletest
@@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1)
	ccache --version | head -1
endif
	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
endif
	@echo ""
	@echo CXX=$(CXX)
@@ -850,7 +916,7 @@ endif

# Target: check (run the C++ test executable)
# [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
check: runTest cmpFcheck cmpFGcheck
else
check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

#include "Bridge.h"
#include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

extern "C"
{
@@ -22,7 +22,7 @@ extern "C"
* Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
* The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h index cd4e6de668..45000c7246 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc index c06dcbb252..8b92ea0bd6 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h index a6eb185434..a3615ec77a 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

#-------------------------------------------------------------------------------
@@ -45,13 +45,13 @@ endif

#-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)

-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))

#-------------------------------------------------------------------------------

-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)

# Enable ccache if USECCACHE=1
ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
###$(info OMPFLAGS=$(OMPFLAGS))
CXXFLAGS += $(OMPFLAGS)

+# Add the appropriate GPU compile flags when building with nvcc (CUDA) or hipcc (HIP)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+ GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+ GPUFLAGS += -fPIC -c
+endif
+
# Set the build flags appropriate to each AVX choice (example: "make AVX=none")
# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
$(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@

#-------------------------------------------------------------------------------

cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
endif

# Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
else
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
index b247654dcf..da4ba36ad8 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
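// NB: a minimal sketch (not the plugin's actual API) of the effect of the
// mgOnGpuConfig.h hunk just below: HIP builds force-define
// MGONGPU_HAS_NO_CURAND, because curand is a CUDA-only library, and the code
// then falls back to host-generated "common" random numbers. Client code only
// tests the macro, as in RandomNumberKernels.h earlier in this diff:
#ifndef MGONGPU_HAS_NO_CURAND
struct curandGenerator_st; // forward declaration from curand.h (avoids including curand.h)
#endif

// Hypothetical helper (not in the repo) that makes the compile-time choice visible
inline const char* rndgenBackend()
{
#ifdef MGONGPU_HAS_NO_CURAND
  return "CommonRandomNumberKernel (no curand: HIP, or CUDA/C++ without curand)";
#else
  return "CurandRandomNumberKernel (curand is available)";
#endif
}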
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
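// NB: a minimal sketch (typedef name as in this patch; the fptype template
// parameter is assumed from the rest of the plugin) of the complex-number
// backend selected per implementation by the mgOnGpuConfig.h choices above and
// the mgOnGpuCxtypes.h changes below: thrust or cuComplex in CUDA, cxsmpl in
// HIP (where thrust and cuComplex are unavailable), std::complex or cxsmpl in C++.
#include "mgOnGpuFptypes.h" // assumed: defines fptype (double or float)
#include "mgOnGpuCxtypes.h" // assumed: defines mgOnGpu::cxsmpl
#ifdef __CUDACC__
#include <thrust/complex.h>
#endif
#if defined __CUDACC__ // CUDA: MGONGPU_CUCXTYPE_THRUST is the default
typedef thrust::complex<fptype> cxtype;
#elif defined __HIPCC__ // HIP: MGONGPU_CUCXTYPE_CXSMPL is the only option
typedef mgOnGpu::cxsmpl<fptype> cxtype;
#else // C++: MGONGPU_CPPCXTYPE_CXSMPL is the new default
typedef mgOnGpu::cxsmpl<fptype> cxtype;
#endif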
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif 
// #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gq_ttq.sa/src/rambo.h b/epochX/cudacpp/gq_ttq.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/rambo.h +++ b/epochX/cudacpp/gq_ttq.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 800492306f..9bac4b3aae 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -153,7 +153,7 @@ Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines -ALOHA: aloha creates 1 routines in 0.062 s +ALOHA: aloha creates 1 routines in 0.060 s VVS3 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. @@ -165,7 +165,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. quit -real 0m0.471s -user 0m0.367s -sys 0m0.052s -Code generation completed in 0 seconds +real 0m0.414s +user 0m0.350s +sys 0m0.059s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT b/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT +++ b/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
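The Bridge constructor above derives its GPU grid from the number of events: it defaults to 256 threads per block, throws unless nevt is a multiple of s_gputhreadsmin, and then (the loop body is elided by the diff context) keeps adjusting until nevt == m_gpublocks * m_gputhreads holds exactly. A standalone sketch of that sizing rule, assuming the loop halves the thread count on each pass (chooseGpuGrid and threadsMin are illustrative names, not patch code):

    #include <cassert>
    #include <utility>

    // Pick (blocks, threads) so that blocks * threads == nevt exactly, mirroring
    // the Bridge constructor; threadsMin stands in for s_gputhreadsmin.
    inline std::pair<int, int> chooseGpuGrid( int nevt, int threadsMin )
    {
      int threads = 256; // default number of gpu threads
      int blocks = nevt / threads;
      while( nevt != blocks * threads )
      {
        threads /= 2;                    // assumed loop body: halve and retry
        assert( threads >= threadsMin ); // guaranteed by the nevt % threadsMin check
        blocks = nevt / threads;
      }
      return { blocks, threads };
    }

For example nevt = 96 settles on 3 blocks of 32 threads, since none of 256, 128 or 64 divides 96 evenly.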
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
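Every one of these headers repeats the same idiom: the translation unit is compiled twice, and MGONGPUCPP_GPUIMPL selects namespace mg5amcGpu for the GPU pass and mg5amcCpu for the CPU pass, so identically named classes from the two passes become distinct symbols and can be linked into one executable (the MadgraphTest.h hunk further below makes the same point for the test drivers). Reduced to a minimal sketch (Example is an illustrative class, not patch code):

    // The macro picks the namespace; everything inside is otherwise identical.
    #ifdef MGONGPUCPP_GPUIMPL
    namespace mg5amcGpu
    #else
    namespace mg5amcCpu
    #endif
    {
      class Example
      {
      public:
        int answer() const { return 42; } // mg5amcGpu::Example and mg5amcCpu::Example coexist
      };
    }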
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
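GpuAbstraction.h maps one gpu* vocabulary onto the CUDA runtime under nvcc (__CUDACC__) and onto HIP under hipcc (__HIPCC__); the checkGpu wrapper that several of these macros expand into is defined in GpuRuntime.h, whose listing follows. A usage sketch, not taken from the patch (scaleKernel and scaleOnDevice are illustrative, and this would be compiled as GPU code so that the macros are in effect):

    #include "GpuAbstraction.h"
    #include "GpuRuntime.h" // supplies checkGpu/assertGpu used by the macros

    __global__ void scaleKernel( double* d, int n ) // trivial illustrative kernel
    {
      const int i = blockDim.x * blockIdx.x + threadIdx.x;
      if( i < n ) d[i] *= 2.;
    }

    void scaleOnDevice( double* hst, int n, int blocks, int threads )
    {
      double* dev = nullptr;
      gpuMalloc( &dev, n * sizeof( double ) ); // cudaMalloc or hipMalloc
      gpuMemcpy( dev, hst, n * sizeof( double ), gpuMemcpyHostToDevice );
      gpuLaunchKernel( scaleKernel, blocks, threads, dev, n ); // kernel<<<blocks, threads>>>( dev, n )
      checkGpu( gpuPeekAtLastError() );
      gpuMemcpy( hst, dev, n * sizeof( double ), gpuMemcpyDeviceToHost );
      gpuFree( dev );
    }

The same six calls therefore build unchanged for both vendors; only the macro expansion differs.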
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
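Note the launch idiom in the MatrixElementKernelDevice hunks above: every gpuLaunchKernel is followed by checkGpu( gpuPeekAtLastError() ), which reports invalid launch configurations immediately, and computeMatrixElements additionally calls checkGpu( gpuDeviceSynchronize() ) to surface errors raised asynchronously while the kernel was running. The idiom in isolation (dummyKernel and launchAndCheck are illustrative, not patch code):

    #include "GpuAbstraction.h"
    #include "GpuRuntime.h"

    __global__ void dummyKernel( int* out ) { *out = 1; } // stand-in for sigmaKin etc.

    void launchAndCheck( int* devOut )
    {
      gpuLaunchKernel( dummyKernel, 1, 1, devOut );
      checkGpu( gpuPeekAtLastError() );   // catches launch-configuration errors
      checkGpu( gpuDeviceSynchronize() ); // catches errors from the kernel body itself
    }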
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h index d65c9d6e04..85c3c9ed1c 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h 
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
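KernelAccessHelper above encodes the device-side indexing convention: one GPU thread owns exactly one event, so the event number is recovered directly from the grid coordinates. As a standalone device helper (eventIndex is hypothetical; the real code inlines the expression):

    #include "GpuAbstraction.h"

    #ifdef MGONGPUCPP_GPUIMPL
    // Mirrors 'blockDim.x * blockIdx.x + threadIdx.x' from kernelAccessRecord above.
    __device__ inline int eventIndex()
    {
      return blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
    }
    #endif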
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
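The neppM constant discussed above fixes the page size of the momenta AOSOA buffer, whose layout this patch documents elsewhere as momenta[npagM][npar][np4][neppM] with nevt = npagM * neppM; sizing one page of a single (ipar, ip4) component to a 32-byte cacheline is what lets neighbouring GPU threads (neighbouring events) load neighbouring addresses, i.e. coalesced access. A sketch of the resulting flat index (aosoaIndex is an illustrative helper, not the project's accessor classes):

    #include <cstddef>

    // Flat offset of event ievt, particle ipar, momentum component ip4 in a buffer
    // laid out as momenta[npagM][npar][np4][neppM]: events are contiguous in the
    // last dimension, so threads ievt and ievt+1 touch adjacent fptype's.
    inline std::size_t aosoaIndex( std::size_t ievt, std::size_t ipar, std::size_t ip4,
                                   std::size_t npar, std::size_t np4, std::size_t neppM )
    {
      const std::size_t ipagM = ievt / neppM; // page holding this event
      const std::size_t ieppM = ievt % neppM; // event's slot within the page
      return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
    }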
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h index 8109470148..78004e66cc 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_heft.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events 
template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
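MemoryBuffers.h wraps every allocation in RAII: PinnedHostBufferBase and DeviceBufferBase acquire with gpuMallocHost/gpuMalloc in their constructors and release with gpuFreeHost/gpuFree in their destructors, and the per-quantity typedefs then select plain HostBuffer or the pinned/device pair according to MGONGPUCPP_GPUIMPL. The device-side class reduced to its essentials (DeviceBufferSketch is a simplified stand-in, without the BufferBase/NumberOfEvents hierarchy):

    #include <cstddef>
    #include "GpuAbstraction.h"
    #include "GpuRuntime.h"

    template<typename T>
    class DeviceBufferSketch
    {
    public:
      DeviceBufferSketch( std::size_t size ) : m_size( size ), m_data( nullptr )
      {
        gpuMalloc( &m_data, bytes() ); // acquired on construction...
      }
      ~DeviceBufferSketch() { gpuFree( m_data ); } // ...released on destruction
      DeviceBufferSketch( const DeviceBufferSketch& ) = delete; // forbid double free
      DeviceBufferSketch& operator=( const DeviceBufferSketch& ) = delete;
      T* data() { return m_data; }
      std::size_t bytes() const { return m_size * sizeof( T ); }
    private:
      const std::size_t m_size;
      T* m_data;
    };

Buffers then cannot leak on early returns or exceptions, which is why copyDeviceFromHost/copyHostFromDevice below can take them by reference and simply compare sizes before the gpuMemcpy.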
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc index 526bd7d296..3b6085c784 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_heft.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu //__device__ const fptype* cIPD = nullptr; // unused as nparam=0 __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //__device__ __constant__ fptype* cIPD = nullptr; // unused as nparam=0 __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace 
mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -268,7 +269,7 @@ namespace mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 2 } }; // 2-D array[1][1] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -325,7 +326,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -384,7 +385,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -419,8 +420,8 @@ namespace mg5amcCpu { -1, 1, 0 }, { 1, -1, 0 }, { 1, 1, 0 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -459,9 +460,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory //const fptype tIPD[0] = { ... }; // nparam=0 //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - //checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 0 * sizeof( fptype ) ) ); // nparam=0 - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + //gpuMemcpyToSymbol( cIPD, tIPD, 0 * sizeof( fptype ) ); // nparam=0 + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else //memcpy( cIPD, tIPD, 0 * sizeof( fptype ) ); // nparam=0 //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -495,7 +496,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -560,12 +561,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -586,7 +587,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -712,9 +713,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -738,7 +739,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -758,7 +759,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -772,9 +773,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -802,7 +806,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1012,7 +1016,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h index dbc5aa0e4e..e1caef360b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuAbstraction.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc 
b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
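The recurring pattern in the check_sa.cc hunks: MGONGPUCPP_GPUIMPL guards code that is generic to any GPU build, while __CUDACC__ and __HIPCC__ are now tested only where the backend genuinely differs (curand availability, NVTX, printout tags). The definition of MGONGPUCPP_GPUIMPL is not in the hunks shown; plausibly it amounts to something like this in mgOnGpuConfig.h:

// Sketch only (assumed, not shown in this patch): one macro meaning
// "this is a GPU build", whichever GPU compiler drives the compilation.
#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1
#endif
// Usage convention seen throughout this diff:
//   #ifdef MGONGPUCPP_GPUIMPL -> common CUDA/HIP path (device buffers, kernels)
//   #ifdef __CUDACC__         -> NVidia-only details (curand, NVTX, thrust)
//   #ifdef __HIPCC__          -> AMD-only details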
 bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
     else if( arg == "--curdev" )
     {
 #ifndef __CUDACC__
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
 #elif defined MGONGPU_HAS_NO_CURAND
       throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
 #else
@@ -198,7 +201,7 @@ main( int argc, char** argv )
     }
     else if( arg == "--rmbdev" )
     {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       rmbsmp = RamboSamplingMode::RamboDevice;
 #else
       throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@ main( int argc, char** argv )
       return usage( argv[0] );
   }

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
   ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
   // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
   // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@ main( int argc, char** argv )

   // === STEP 0 - INITIALISE

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL

-  // --- 00. Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime gpuRuntime( debug );
 #endif

   // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
 #endif
   }

@@ -450,7 +453,7 @@
   }
   else
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) );
 #else
     throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
@@ -461,7 +464,7 @@
   std::unique_ptr<MatrixElementKernelBase> pmek;
   if( !bridge )
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) );
 #else
     pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -469,7 +472,7 @@
   }
   else
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) );
 #else
     pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -511,7 +514,7 @@
     prnk->generateRnarray();
     //std::cout << "Got random numbers" << std::endl;

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
     {
       // --- 1c. Copy rndmom from host to device
@@ -543,7 +546,7 @@
     prsk->getMomentaFinal();
     //std::cout << "Got final momenta" << std::endl;

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( rmbsmp == RamboSamplingMode::RamboDevice )
     {
       // --- 2c. CopyDToH Weights
@@ -588,7 +591,7 @@
       dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F();
     }

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     // --- 2d. CopyHToD Momenta
     const std::string gKey = "0.. CpHTDg";
     rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER!
@@ -617,7 +620,7 @@
     wv3atime += timermap.stop(); // calc only
     wavetime += wv3atime; // calc plus copy

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( !bridge )
     {
       // --- 3b. CopyDToH MEs
@@ -760,18 +763,22 @@
     rndgentxt = "CURAND DEVICE";
 #ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
+#elif defined __HIPCC__
  rndgentxt += " (HIP code)";
 #else
   rndgentxt += " (C++ code)";
 #endif

   // Workflow description summary
   std::string wrkflwtxt;
-  // -- CUDA or C++?
+  // -- CUDA or HIP or C++?
 #ifdef __CUDACC__
   wrkflwtxt += "CUD:";
+#elif defined __HIPCC__
+  wrkflwtxt += "HIP:";
 #else
   wrkflwtxt += "CPP:";
-#endif
+#endif /* clang-format off */
   // -- DOUBLE or FLOAT?
 #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
   wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537)
@@ -781,7 +788,7 @@
   wrkflwtxt += "FLT+";
 #else
   wrkflwtxt += "???+"; // no path to this statement
-#endif
+#endif /* clang-format on */
   // -- CUCOMPLEX or THRUST or STD complex numbers?
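The cascade below tags the complex-number implementation in the workflow string (CUC/THR/CXS/STX). The underlying type selection lives in the mgOnGpu headers, which this patch does not show; a rough sketch of the assumed mapping, with the HIP build falling back to an in-house simple complex type:

// Sketch only (assumed): how the MGONGPU_CUCXTYPE_* / MGONGPU_CPPCXTYPE_*
// tags plausibly map to the cxtype used in the matrix element code.
#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST
#include <thrust/complex.h>
typedef thrust::complex<fptype> cxtype; // "THR:"
#elif defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX
#include <cuComplex.h>
typedef cuDoubleComplex cxtype; // "CUC:" (double precision variant)
#elif defined __HIPCC__ and defined MGONGPU_CUCXTYPE_CXSMPL
typedef mgOnGpu::cxsmpl<fptype> cxtype; // "CXS:", usable on AMD GPUs (no thrust/cuComplex)
#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
#include <complex>
typedef std::complex<fptype> cxtype; // "STX:" on the C++ side
#endif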
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

 # Dependency on src directory
 MG5AMC_COMMONLIB = mg5amc_common
@@ -121,24 +121,46 @@ endif

 #-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler
+#=== Configure the GPU compiler (CUDA or HIP)

-# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505)
-# This is because it is impossible to pass this to "CUFLAGS += -ccbin <host-compiler>" below
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME.
+# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc.
+# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths.
+# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505)
+# This is because it is impossible to pass this to "GPUFLAGS += -ccbin <host-compiler>" below
 ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <compiler>" from outside
-  $(warning CUDA builds are not supported for multi-word CXX "$(CXX)")
+  $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)")
   override CUDA_HOME=disabled
+  override HIP_HOME=disabled
 endif

-# If CUDA_HOME is not set, try to set it from the location of nvcc
+# If CUDA_HOME is not set, try to set it from the path to nvcc
 ifndef CUDA_HOME
   CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
   $(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
 endif

-# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists
+# If HIP_HOME is not set, try to set it from the path to hipcc
+ifndef HIP_HOME
+  HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH))
+  $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+endif
+
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP),
+# builds are performed for HIP only if CUDA is not found in the path.
+# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME.
+# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+#--- Option 1: CUDA exists -> use CUDA
+
+# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists
 ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
-  NVCC = $(CUDA_HOME)/bin/nvcc
+
+  GPUCC = $(CUDA_HOME)/bin/nvcc
   USE_NVTX ?=-DUSE_NVTX
   # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
   # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
@@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
   CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
 endif
   CUOPTFLAGS = -lineinfo
-  CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
-  ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
-  ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
-  CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h
+  ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
+  GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
+  ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+  ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
+  GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h
   # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
-  ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
-  ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
-  ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
-  ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+  ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
+  ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
+  ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12)
+  ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12)
+  CUBUILDRULEFLAGS = -Xcompiler -fPIC -c
+  CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu
+  CUDATESTFLAGS = -lcuda
+
+  # Set the host C++ compiler for GPUCC via "-ccbin <host-compiler>"
+  # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
+  GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
+
+  # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
+  ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
+    GPUFLAGS += -allow-unsupported-compiler
+  endif
+
 else ifneq ($(origin REQUIRE_CUDA),undefined)
+
   # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443)
-  $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH))
+  $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH))
+
+#--- Option 2: CUDA does not exist, HIP exists -> use HIP
+
+# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists
+else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
+
+  GPUCC = $(HIP_HOME)/bin/hipcc
+  #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP?
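On the open question in the comment above (an NVTX equivalent for HIP): ROCm's roctracer ships roctx markers with a very similar push/pop API, so a HIP-side USE_ROCTX could plausibly mirror USE_NVTX. Illustrative only; none of this is in the patch:

// Illustrative only (not part of this patch): roctx ranges are the closest
// HIP/ROCm analogue to NVTX ranges; link against -lroctx64 to use them.
#include <roctracer/roctx.h>
void profiledSection()
{
  roctxRangePush( "sigmaKin" ); // start a named range, as nvtxRangePush does
  // ... kernel launches to be profiled ...
  roctxRangePop(); // end the range
}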
+  HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+  HIPINC = -I$(HIP_HOME)/include/
+  # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP
+  # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+  GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+  ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+  GPUFLAGS += -std=c++17
+  ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?)
+  CUBUILDRULEFLAGS = -fPIC -c
+  CCBUILDRULEFLAGS = -fPIC -c
+
+else ifneq ($(origin REQUIRE_HIP),undefined)
+
+  # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443)
+  $(error No hip installation found (set HIP_HOME or make hipcc visible in PATH))
+
+#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP
+
 else
-  # No cuda. Switch cuda compilation off and go to common random numbers in C++
+
+  # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++
   $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
+  $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
   override GPUCC=
   override USE_NVTX=
   override CUINC=
   override CURANDLIBFLAGS=
-endif
-export NVCC
-export CUFLAGS
-
-# Set the host C++ compiler for nvcc via "-ccbin <host-compiler>"
-# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
-# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-CUFLAGS += -allow-unsupported-compiler
 endif

+# Export GPUCC (so that it can also be used in cudacpp_src.mk?)
 export GPUCC
 export GPUFLAGS

 #-------------------------------------------------------------------------------

-#=== Configure ccache for C++ and CUDA builds
+#=== Configure ccache for C++ and CUDA/HIP builds

 # Enable ccache if USECCACHE=1
 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -201,15 +260,15 @@ endif
 #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
 #  override AR:=ccache $(AR)
 #endif
-ifneq ($(NVCC),)
-  ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
-    override NVCC:=ccache $(NVCC)
+ifneq ($(GPUCC),)
+  ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
+    override GPUCC:=ccache $(GPUCC)
   endif
 endif

 #-------------------------------------------------------------------------------

-#=== Configure PowerPC-specific compiler flags for C++ and CUDA
+#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP

 # PowerPC-specific CXX compiler flags (being reviewed)
 ifeq ($(UNAME_P),ppc64le)
@@ -225,9 +284,9 @@ else
 ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto)
 endif

-# PowerPC-specific CUDA compiler flags (to be reviewed!)
+# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
 ifeq ($(UNAME_P),ppc64le)
-  CUFLAGS+= -Xcompiler -mno-float128
+  GPUFLAGS+= -Xcompiler -mno-float128
 endif

 #-------------------------------------------------------------------------------
@@ -237,7 +296,7 @@ endif
 # Set the default OMPFLAGS choice
 ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
 override OMPFLAGS = -fopenmp
-###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578)
+###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578)
 else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),)
 override OMPFLAGS = -fopenmp
 ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578)
@@ -293,7 +352,10 @@ endif

 # Set the default RNDGEN (random number generator) choice
 ifeq ($(RNDGEN),)
-  ifeq ($(NVCC),)
+  ifeq ($(GPUCC),)
+    override RNDGEN = hasNoCurand
+  # Edge case for HIP compilation
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
     override RNDGEN = hasNoCurand
   else ifeq ($(RNDGEN),)
     override RNDGEN = hasCurand
@@ -310,7 +372,7 @@ export OMPFLAGS

 #-------------------------------------------------------------------------------

-#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN
+#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN

 # Set the build flags appropriate to OMPFLAGS
 $(info OMPFLAGS=$(OMPFLAGS))
@@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS)
 $(info FPTYPE=$(FPTYPE))
 ifeq ($(FPTYPE),d)
   CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
-  CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
+  GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
 else ifeq ($(FPTYPE),f)
   CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
-  CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
+  GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
 else ifeq ($(FPTYPE),m)
   CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
-  CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
+  GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
 else
   $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported)
 endif
@@ -383,7 +445,7 @@ endif
 $(info HELINL=$(HELINL))
 ifeq ($(HELINL),1)
   CXXFLAGS += -DMGONGPU_INLINE_HELAMPS
-  CUFLAGS += -DMGONGPU_INLINE_HELAMPS
+  GPUFLAGS += -DMGONGPU_INLINE_HELAMPS
 else ifneq ($(HELINL),0)
   $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported)
 endif
@@ -392,7 +454,7 @@ endif
 $(info HRDCOD=$(HRDCOD))
 ifeq ($(HRDCOD),1)
   CXXFLAGS += -DMGONGPU_HARDCODE_PARAM
-  CUFLAGS += -DMGONGPU_HARDCODE_PARAM
+  GPUFLAGS += -DMGONGPU_HARDCODE_PARAM
 else ifneq ($(HRDCOD),0)
   $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported)
 endif
@@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin)
   override CULIBFLAGSRPATH2 =
 else
   # RPATH to cuda/cpp libs when linking executables
-  override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH)
-  override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH)
+  override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH)
+  override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH)
   # RPATH to common lib when linking cuda/cpp libs
-  override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN'
-  override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN'
+  override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN'
+  override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN'
 endif

 # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary
(neither on Linux nor on Mac)
@@ -461,7 +523,7 @@ override RUNTIME =
 cxx_main=$(BUILDDIR)/check.exe
 fcxx_main=$(BUILDDIR)/fcheck.exe

-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 cu_main=$(BUILDDIR)/gcheck.exe
 fcu_main=$(BUILDDIR)/fgcheck.exe
 else
@@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG):
 	@touch $(BUILDDIR)/.build.$(TAG)

 # Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@

 $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
 endif
+# NB: for CUDA builds, '-x cu' is applied via CCBUILDRULEFLAGS in the rule above

 # Generic target and build rules: objects from C++ compilation
 # (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@

 # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516)
+# Added edge case for HIP compilation
 ifeq ($(shell $(CXX) --version | grep ^nvc++),)
 $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS))
 $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
 endif
 endif
@@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand)
 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
 endif

-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592)
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592)
 ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),)
-ifneq ($(NVCC),)
-CUFLAGS += -Xcompiler -Wno-deprecated-builtins
+ifneq ($(GPUCC),)
+GPUFLAGS += -Wno-deprecated-builtins
 endif
 endif
@@ -541,8 +607,8 @@ endif
 # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
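For reference, the FPTYPE=d/f/m switch earlier in this makefile only sets -DMGONGPU_FPTYPE*_DOUBLE/FLOAT defines; the actual floating-point types are chosen in C++. A sketch of the kind of typedefs those defines control (assumed from the mgOnGpu configuration header, which is not part of this diff):

// Sketch only (assumed): how -DMGONGPU_FPTYPE_* / -DMGONGPU_FPTYPE2_* map to types.
#if defined MGONGPU_FPTYPE_DOUBLE
typedef double fptype; // FPTYPE=d or FPTYPE=m
#elif defined MGONGPU_FPTYPE_FLOAT
typedef float fptype; // FPTYPE=f
#endif
#if defined MGONGPU_FPTYPE2_DOUBLE
typedef double fptype2; // FPTYPE=d
#elif defined MGONGPU_FPTYPE2_FLOAT
typedef float fptype2; // FPTYPE=f or FPTYPE=m (single precision color algebra, #537)
#endif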
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
 */
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using namespace mg5amcGpu;
 #else
 using namespace mg5amcCpu;
@@ -46,8 +46,8 @@
   */
  void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
  {
-#ifdef __CUDACC__
-    CudaRuntime::setUp();
+#ifdef MGONGPUCPP_GPUIMPL
+    GpuRuntime::setUp();
 #endif
     // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor)
     // FIXME: disable OMP in Bridge when called from Fortran
@@ -65,8 +65,8 @@
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" );
     delete pbridge;
-#ifdef __CUDACC__
-    CudaRuntime::tearDown();
+#ifdef MGONGPUCPP_GPUIMPL
+    GpuRuntime::tearDown();
 #endif
   }

@@ -96,7 +96,7 @@
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
     pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol );
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc
index 2fb445372d..3743934f41 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #include "mgOnGpuConfig.h"

@@ -13,7 +13,7 @@

 //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -40,7 +40,7 @@ namespace mg5amcCpu
   private:
     const int m_nevt; // The number of events in each iteration
     int m_iiter;      // The iteration counter (for random number seeding)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
     HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
     HostBufferMomenta m_hstMomenta;      // Memory buffers for momenta
     HostBufferWeights m_hstWeights;      // Memory buffers for sampling weights
@@ -105,7 +105,7 @@ namespace mg5amcCpu

 extern "C"
 {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   using namespace mg5amcGpu;
 #else
   using namespace mg5amcCpu;
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc
index d4a760a71b..de327f2321 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
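In the runTest.cc hunks below, the DeviceReset helper now calls checkGpu( gpuDeviceReset() ). Like the other gpu* names, neither is defined in the hunks shown; a minimal sketch, assuming they keep the assert-style pattern of the old checkCuda from CudaRuntime.h:

// Sketch only (assumed): error checking and device reset behind the
// GpuRuntime/GpuAbstraction names used in this patch.
#include <cstdio>
#include <cstdlib>
#if defined __CUDACC__
#include <cuda_runtime.h>
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuGetErrorString cudaGetErrorString
#define gpuDeviceReset cudaDeviceReset
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuGetErrorString hipGetErrorString
#define gpuDeviceReset hipDeviceReset
#endif
#define checkGpu( code ) assertGpu( code, __FILE__, __LINE__ )
inline void assertGpu( gpuError_t code, const char* file, int line )
{
  if( code != gpuSuccess )
  {
    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
    exit( code ); // abort, as the old assertCuda-style helper did
  }
}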
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc index a1c3cdc238..688cb8167b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h index eae9ff5242..dbff117235 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc index e5442756b1..d3d6058b46 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. 
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h index 790485fee0..c2be5bba97 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h @@ -28,7 +28,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -94,7 +94,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -230,7 +230,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -247,7 +247,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -298,7 +298,7 @@ namespace mg5amcCpu // End non-SM (e.g. EFT) implementation - special handling of vectors of floats (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk b/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk index 0bd815c9b3..998d3c84fa 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
#-------------------------------------------------------------------------------
@@ -45,13 +45,13 @@ endif
#-------------------------------------------------------------------------------
-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)
-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))
#-------------------------------------------------------------------------------
-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)
# Enable ccache if USECCACHE=1
ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
###$(info OMPFLAGS=$(OMPFLAGS))
CXXFLAGS += $(OMPFLAGS)
+# Add the compiler-specific GPUFLAGS needed when compiling with nvcc (CUDA) or hipcc (HIP)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
# Set the build flags appropriate to each AVX choice (example: "make AVX=none")
# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
# Generic target and build rules: objects from CUDA compilation
$(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@
#-------------------------------------------------------------------------------
cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_heft.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_objects=$(addprefix $(BUILDDIR)/, Parameters_heft_cu.o)
endif
# Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
else
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h
index b247654dcf..da4ba36ad8 100644
--- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
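The new GPUFLAGS block selects flags from the name of $(GPUCC): nvcc needs '-x cu' so that the .cc sources are compiled as CUDA (and __CUDACC__ gets defined), whereas hipcc defines __HIPCC__ on its own. A hypothetical probe file (not part of the patch) makes the dispatch these flags enable visible:

```cpp
// probe.cc - hypothetical, for illustration only; build it three ways:
//   g++ probe.cc        -> prints "plain C++"
//   nvcc -x cu probe.cc -> prints "CUDA" (without -x cu, nvcc treats .cc as host C++)
//   hipcc probe.cc      -> prints "HIP"
#include <iostream>

int main()
{
#if defined __CUDACC__
  std::cout << "CUDA" << std::endl;
#elif defined __HIPCC__
  std::cout << "HIP" << std::endl;
#else
  std::cout << "plain C++" << std::endl;
#endif
  return 0;
}
```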
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL
+// SANITY CHECKS (CUDA complex number implementation)
+#ifdef __CUDACC__
+#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA
+#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA
+#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA
+#endif
 #endif
-// SANITY CHECKS (cuda complex number implementation)
-#ifdef __CUDACC__
-#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
+// SANITY CHECKS (C++ complex number implementation)
+#ifndef MGONGPUCPP_GPUIMPL
+#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++
 #endif
 #endif
@@ -134,7 +157,7 @@ namespace mgOnGpu
 // Alignment requirement for using reinterpret_cast with SIMD vectorized code
 // (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
 // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
 #endif
@@ -145,7 +168,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;
 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -175,9 +198,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif
-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
 #if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -189,8 +212,8 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */
-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
 #define __global__
 #define __host__
 #define __device__
diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h
index ca9a9f00c0..5532e22fa1 100644
--- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
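The reorganised sanity checks in mgOnGpuConfig.h above test each illegal pair of complex-type macros explicitly. Since 'defined X' evaluates to 0 or 1 inside an #if expression, the same constraint can also be expressed as a single count; a hypothetical compact equivalent for the CUDA case (the patch keeps the explicit pairwise form):

```cpp
// Hypothetical one-line equivalent of the pairwise CUDA checks above:
// fail the build if more than one complex-type backend is selected.
#ifdef __CUDACC__
#if defined MGONGPU_CUCXTYPE_THRUST + defined MGONGPU_CUCXTYPE_CUCOMPLEX + defined MGONGPU_CUCXTYPE_CXSMPL > 1
#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST, MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA
#endif
#endif
```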
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu 
return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/heft_gg_h.sa/src/rambo.h b/epochX/cudacpp/heft_gg_h.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/rambo.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index ff161c336f..adfd21027c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005424976348876953  +DEBUG: model prefixing takes 0.0053827762603759766  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -172,7 +172,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.030 s +5 processes with 7 diagrams generated in 0.029 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. @@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.139 s +13 processes with 76 diagrams generated in 0.136 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -378,7 +378,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.876 s +65 processes with 1119 diagrams generated in 1.869 s Total: 83 processes with 1202 diagrams output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -497,7 +497,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -514,7 +514,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -531,7 +531,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -548,7 +548,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -565,7 +565,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -582,7 +582,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -599,7 +599,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -616,7 +616,7 @@ INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -633,7 +633,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -650,7 +650,7 @@ INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -667,7 +667,7 @@ INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -684,7 +684,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -701,7 +701,7 @@ INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -718,7 +718,7 @@ INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -735,7 +735,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -752,7 +752,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -769,7 +769,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -786,7 +786,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -801,15 +801,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.312 s -Wrote files for 810 helas calls in 3.308 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.297 s +Wrote files for 810 helas calls in 3.533 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.342 s +ALOHA: aloha creates 5 routines in 0.333 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -817,7 +817,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.321 s +ALOHA: aloha creates 10 routines in 0.310 s VVV1 VVV1 FFV1 @@ -1028,9 +1028,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m9.073s -user 0m8.514s -sys 0m0.464s +real 0m9.184s +user 0m8.370s +sys 0m0.508s Code generation completed in 9 seconds ************************************************************ * * @@ -1057,7 +1057,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -1087,7 +1087,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT b/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT +++ b/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != 
m_gpublocks * m_gputhreads )
@@ -249,7 +249,7 @@ namespace mg5amcCpu
 #else
 std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
 m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
 // Create a process object, read param card and set parameters
 // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
 // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads?
@@ -262,7 +262,7 @@ namespace mg5amcCpu
 process.initProc( paramCard );
 }
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 template
 void Bridge::set_gpugrid( const int gpublocks, const int gputhreads )
 {
@@ -276,7 +276,7 @@ namespace mg5amcCpu
 }
 #endif
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 template
 void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta,
 const FORTRANFPTYPE* gs,
@@ -291,14 +291,14 @@ namespace mg5amcCpu
 constexpr int neppM = MemoryAccessMomenta::neppM;
 if constexpr( neppM == 1 && std::is_same_v )
 {
-  checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
+  gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
 }
 else
 {
-  checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
+  gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
 const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
 //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
-  dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+  gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
 }
 if constexpr( std::is_same_v )
 {
@@ -341,7 +341,7 @@ namespace mg5amcCpu
 }
 #endif
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 template
 void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta,
 const FORTRANFPTYPE* gs,
@@ -396,7 +396,7 @@ namespace mg5amcCpu
 // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
 //
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 template
 __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
 {
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc
index d58066c9c1..eaf4037a24 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc
@@ -1,17 +1,18 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
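In the gpu_sequence hunk above, the CUDA-specific checkCuda(cudaMemcpy(...)) calls and the <<<...>>> launch syntax are replaced by the vendor-neutral gpuMemcpy and gpuLaunchKernel macros (defined in the new GpuAbstraction.h later in this patch). A minimal usage sketch, assuming those macro definitions; the 'scale' kernel, its buffers and the grid shape are hypothetical:

```cpp
// Sketch only: the gpu* macros are those of GpuAbstraction.h; everything else
// here is made up for illustration.
#include "GpuAbstraction.h"

__global__ void scale( double* d, int n ) // double every element
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) d[i] *= 2.;
}

void scaleOnDevice( const double* hstIn, double* hstOut, int n, int blocks, int threads )
{
  double* dev = nullptr;
  gpuMalloc( (void**)&dev, n * sizeof( double ) );                      // cudaMalloc or hipMalloc, wrapped in checkGpu
  gpuMemcpy( dev, hstIn, n * sizeof( double ), gpuMemcpyHostToDevice ); // no explicit checkCuda needed any more
  gpuLaunchKernel( scale, blocks, threads, dev, n );                    // expands to scale<<<blocks, threads>>>( dev, n )
  gpuMemcpy( hstOut, dev, n * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuFree( dev );
}
```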
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #include "CrossSectionKernels.h"
+#include "GpuAbstraction.h"
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessWeights.h"
 #include "MemoryBuffers.h"
@@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL )
 
 //============================================================================
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -185,7 +186,7 @@ namespace mg5amcCpu
 
 //============================================================================
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h
index 7933ca4bbf..4d9659e04e 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef CROSSSECTIONKERNELS_H
 #define CROSSSECTIONKERNELS_H 1
@@ -13,7 +13,7 @@
 
 //============================================================================
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -96,7 +96,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
   /*
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating the calculation of event statistics on a GPU device
   class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents
   {
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CudaRuntime.h
deleted file mode 100644
index 64ce52f4b3..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CudaRuntime.h
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
-
-#ifndef MG5AMC_CUDARUNTIME_H
-#define MG5AMC_CUDARUNTIME_H 1
-
-// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
-// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
-
-#include <cassert>
-#include <iostream>
-
-//--------------------------------------------------------------------------
-
-// See https://stackoverflow.com/a/14038590
-#ifdef __CUDACC__ /* clang-format off */
-#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
-inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )
-{
-  if( code != cudaSuccess )
-  {
-    printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
-    if( abort ) assert( code == cudaSuccess );
-  }
-}
-#endif /* clang-format on */
-
-//--------------------------------------------------------------------------
-
-#ifdef __CUDACC__
-namespace mg5amcGpu
-{
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
-  struct CudaRuntime final
-  {
-    CudaRuntime( const bool debug = true )
-      : m_debug( debug ) { setUp( m_debug ); }
-    ~CudaRuntime() { tearDown( m_debug ); }
-    CudaRuntime( const CudaRuntime& ) = delete;
-    CudaRuntime( CudaRuntime&& ) = delete;
-    CudaRuntime& operator=( const CudaRuntime& ) = delete;
-    CudaRuntime& operator=( CudaRuntime&& ) = delete;
-    bool m_debug;
-
-    // Set up CUDA application
-    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
-    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
-    static void setUp( const bool debug = true )
-    {
-      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
-      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
-      /*
-      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
-      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
-      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
-      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
-      checkCuda( cudaFree( 0 ) ); // SLOW!
-      */
-      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
-      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
-      if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl;
-      checkCuda( cudaSetDevice( 0 ) ); // SLOW!
-    }
-
-    // Tear down CUDA application (call cudaDeviceReset)
-    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
-    // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck
-    // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
-    static void tearDown( const bool debug = true )
-    {
-      if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl;
-      checkCuda( cudaDeviceReset() );
-    }
-  };
-
-}
-#endif
-
-//--------------------------------------------------------------------------
-
-#endif // MG5AMC_CUDARUNTIME_H
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc
index eb56333b03..08a16f6f2c 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc
@@ -1,9 +1,9 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
 #include "MemoryBuffers.h"
 #include "RandomNumberKernels.h"
@@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool
 }
 #endif /* clang-format on */
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -36,7 +36,7 @@ namespace mg5amcCpu
   {
     if( m_isOnDevice )
     {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       if( !m_rnarray.isOnDevice() )
         throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" );
 #else
@@ -114,7 +114,7 @@ namespace mg5amcCpu
     /*
     printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() );
     fptype* data = m_rnarray.data();
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( m_rnarray.isOnDevice() )
     {
       data = new fptype[m_rnarray.size()]();
@@ -123,7 +123,7 @@ namespace mg5amcCpu
 #endif
     for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ )
       printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( m_rnarray.isOnDevice() ) delete[] data;
 #endif
     */
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h
index 48b51e0a49..b425a5bade 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef EventStatistics_H
 #define EventStatistics_H 1
@@ -16,7 +16,7 @@
 #include
 #include
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h
new file mode 100644
index 0000000000..6a7d9c05c0
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h
@@ -0,0 +1,71 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
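+//
+// This new header maps a vendor-neutral gpu* vocabulary (error types, memory management,
+// kernel launches) onto the CUDA runtime API under __CUDACC__ and onto the HIP runtime API
+// under __HIPCC__, so that the same source code can serve both GPU back ends.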
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include <cassert>
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h
new file mode 100644
index 0000000000..93579ef08b
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
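+//
+// Usage sketch (illustrative only, not part of the generated code; 'myKernel' and its two
+// device buffers are hypothetical names): a launch written once as
+//   gpuLaunchKernel( myKernel, gpublocks, gputhreads, devBufIn, devBufOut );
+//   checkGpu( gpuPeekAtLastError() );
+//   checkGpu( gpuDeviceSynchronize() );
+// expands to myKernel<<<gpublocks, gputhreads>>>( devBufIn, devBufOut ) plus cuda* error
+// checks under nvcc, and to the corresponding hip* calls under hipcc.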
+
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include <iostream>
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != gpuSuccess )
+  {
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct GpuRuntime final
+  {
+    GpuRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
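+      // NB: through GpuAbstraction.h, gpuSetDevice is cudaSetDevice under nvcc and hipSetDevice under hipcc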
+    }
+
+    // Tear down CUDA application (call cudaDeviceReset)
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck
+    // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
+    static void tearDown( const bool debug = true )
+    {
+      if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl;
+      checkGpu( gpuDeviceReset() );
+    }
+  };
+}
+#endif
+
+//--------------------------------------------------------------------------
+
+#endif // MG5AMC_GPURUNTIME_H
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h
index ef40624c88..a64c05c26a 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MADGRAPHTEST_H_
 #define MADGRAPHTEST_H_ 1
@@ -22,7 +22,7 @@
 #include
 #include
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using mg5amcGpu::CPPProcess;
 #else
 using mg5amcCpu::CPPProcess;
@@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam
 
 // Since we link both the CPU-only and GPU tests into the same executable, we prevent
 // a multiply defined symbol by only compiling this in the non-CUDA phase:
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 
 /// Compare momenta and matrix elements.
 /// This uses an implementation of TestDriverBase to run a madgraph workflow,
@@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME )
   }
 }
 
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
 
 #endif /* MADGRAPHTEST_H_ */
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc
index 74b5239ebf..81699dfea9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc
@@ -1,12 +1,12 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #include "MatrixElementKernels.h"
 
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation
 #include "MemoryAccessMomenta.h"
 #include "MemoryBuffers.h"
 
@@ -14,7 +14,7 @@
 
 //============================================================================
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu
 {
 
@@ -150,7 +150,7 @@ namespace mg5amcCpu
 
 //============================================================================
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
 
@@ -209,13 +209,13 @@
     PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
     DeviceBufferHelicityMask devIsGoodHel( ncomb );
     // ... 0d1. Compute good helicity mask on the device
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
+    checkGpu( gpuPeekAtLastError() );
     // ... 0d2. Copy back good helicity mask to the host
     copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
     // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -226,19 +226,19 @@
 
   void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
   {
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
     constexpr unsigned int sharedMemSize = 0;
 #else
     constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
-    checkCuda( cudaDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() );
+    checkGpu( gpuDeviceSynchronize() );
   }
 
   //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
index 23e84757a2..72bd8f195b 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
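+//
+// NB: the kernel classes declared below are now guarded by MGONGPUCPP_GPUIMPL rather than
+// __CUDACC__, so the device-side MatrixElementKernelDevice is built for HIP as well as CUDA.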
 
 #ifndef MATRIXELEMENTKERNELS_H
 #define MATRIXELEMENTKERNELS_H 1
@@ -10,7 +10,7 @@
 
 #include "MemoryBuffers.h"
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -81,7 +81,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a CPU host
   class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents
   {
@@ -130,7 +130,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
   {
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h
index 573b3bbbc9..ffb76e93de 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h
@@ -15,7 +15,7 @@
 #define MGONGPU_TRIVIAL_AMPLITUDES 1
 
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h
index 35a3af42e0..3afdf3e554 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h
@@ -15,7 +15,7 @@
 #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned
 
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h
index dc0d93afff..ffcdf4dbef 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h
@@ -14,7 +14,7 @@
 //#include "MemoryAccessHelpers.h"
 
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h
index 3bce635718..66f2d32a6b 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h
@@ -10,7 +10,7 @@
 #include "MemoryAccessGs.h"
 
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h
index 31311aa375..4c726b30f3 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h
@@ -13,7 +13,7 @@
 #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned
 
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h
index c82a6c7635..db73e4e064 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryAccessHelpers_H
 #define MemoryAccessHelpers_H 1
@@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper
     }
     else
     {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
       //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x );
       return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h
index f32e6fea5b..3741011971 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h
@@ -13,7 +13,7 @@
 #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned
 
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h
index 29266de32c..3be229d392 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryAccessMomenta_H
 #define MemoryAccessMomenta_H 1
@@ -13,7 +13,7 @@
 #include "MemoryAccessVectors.h"
 
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -30,7 +30,7 @@ namespace mg5amcCpu
 
   // Number of Events Per Page in the momenta AOSOA memory buffer layout
   // (these are all best kept as a compile-time constants: see issue #23)
-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
   // -----------------------------------------------------------------------------------------------
   // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline
   // --- This is relevant to ensure coalesced access to momenta in global memory
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h
index b152183b28..18991f4fa6 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h
@@ -10,7 +10,7 @@
 #include "MemoryAccessGs.h"
 
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h
index e2988d39f3..40cb089135 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryAccessRandomNumbers_H
 #define MemoryAccessRandomNumbers_H 1
@@ -11,7 +11,7 @@
 #include "CPPProcess.h"
 #include "MemoryAccessHelpers.h"
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using mg5amcGpu::CPPProcess;
 #else
 using mg5amcCpu::CPPProcess;
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h
index e9b197368e..08faccff0f 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryAccessVectors_H
 #define MemoryAccessVectors_H 1
@@ -10,7 +10,7 @@
 
 #include "mgOnGpuVectors.h"
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu // this is only needed for CPU SIMD vectorization
 {
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h
index 5428aaf933..33bef4559e 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h
@@ -15,7 +15,7 @@
 #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1
 
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h
index 3093e6ed18..7756a71621 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryBuffers_H
 #define MemoryBuffers_H 1
@@ -11,12 +11,12 @@
 #include "mgOnGpuCxtypes.h"
 
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
 #include "Parameters_sm.h"
 
 #include <sstream>
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -87,7 +87,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   constexpr bool HostBufferALIGNED = false;   // ismisaligned=false
   constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true
@@ -119,7 +119,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating a CUDA pinned host buffer
   template<typename T>
   class PinnedHostBufferBase : public BufferBase<T>
   {
   public:
     PinnedHostBufferBase( const size_t size )
       : BufferBase<T>( size, false )
     {
-      checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) );
+      gpuMallocHost( &( this->m_data ), this->bytes() );
     }
     virtual ~PinnedHostBufferBase()
     {
-      checkCuda( cudaFreeHost( this->m_data ) );
+      gpuFreeHost( this->m_data );
     }
   };
 #endif
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating a CUDA device buffer
   template<typename T>
   class DeviceBufferBase : public BufferBase<T>
   {
   public:
     DeviceBufferBase( const size_t size )
       : BufferBase<T>( size, true )
     {
-      checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) );
+      gpuMalloc( &( this->m_data ), this->bytes() );
     }
     virtual ~DeviceBufferBase()
     {
-      checkCuda( cudaFree( this->m_data ) );
+      gpuFree( this->m_data );
     }
   };
 #endif
 
   //--------------------------------------------------------------------------
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for a given number of events
   template<typename T, size_t sizePerEvent, bool ismisaligned>
   class HostBuffer : public HostBufferBase<T, ismisaligned>, virtual private NumberOfEvents
@@ -175,7 +175,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating a CUDA pinned host buffer for a given number of events
   template<typename T, size_t sizePerEvent>
   class PinnedHostBuffer : public PinnedHostBufferBase<T>, virtual private NumberOfEvents
@@ -191,7 +191,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating a CUDA device buffer for a given number of events
   template<typename T, size_t sizePerEvent>
   class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents
@@ -213,7 +213,7 @@ namespace mg5amcCpu
 
   // The size (number of elements) per event in a memory buffer for momenta random numbers
   constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for momenta random numbers
   typedef HostBuffer<fptype, sizePerEventRndNumMomenta, HostBufferALIGNED> HostBufferRndNumMomenta;
 #else
@@ -232,7 +232,7 @@ namespace mg5amcCpu
 
   // The size (number of elements) per event in a memory buffer with ONE fptype per event
   constexpr size_t sizePerEventOneFp = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer with ONE fptype per event
   typedef HostBuffer<fptype, sizePerEventOneFp, HostBufferALIGNED> HostBufferOneFp;
 #else
@@ -257,7 +257,7 @@ namespace mg5amcCpu
 
   // The size (number of elements) per event in a memory buffer for Gs
   constexpr size_t sizePerEventGs = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for gs
   typedef HostBuffer<fptype, sizePerEventGs, HostBufferALIGNED> HostBufferGs;
 #else
@@ -276,7 +276,7 @@ namespace mg5amcCpu
 
   // The size (number of elements) per event in a memory buffer for numerators
   constexpr size_t sizePerEventNumerators = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for gs
   typedef HostBuffer<fptype, sizePerEventNumerators, HostBufferALIGNED> HostBufferNumerators;
 #else
@@ -296,7 +296,7 @@ namespace mg5amcCpu
 
   // The size (number of elements) per event in a memory buffer for denominators
   constexpr size_t sizePerEventDenominators = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for gs
   typedef HostBuffer<fptype, sizePerEventDenominators, HostBufferALIGNED> HostBufferDenominators;
 #else
@@ -315,7 +315,7 @@ namespace mg5amcCpu
 
   // The size (number of elements) per event in a memory buffer for random numbers
   constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for gs
   typedef HostBuffer<fptype, sizePerEventCouplings, HostBufferALIGNED> HostBufferCouplings;
 #else
@@ -333,7 +333,7 @@ namespace mg5amcCpu
 
   // The size (number of elements) per event in a memory buffer for momenta
   constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for momenta
   typedef HostBuffer<fptype, sizePerEventMomenta, HostBufferALIGNED> HostBufferMomenta;
   //typedef HostBuffer<fptype, sizePerEventMomenta, HostBufferMISALIGNED> HostBufferMomenta; // TEST MISALIGNMENT!
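A minimal usage sketch of the buffer typedefs above, under stated assumptions: hstMomenta and
devMomenta follow the naming used in check_sa.cc further down; DeviceBufferMomenta is assumed to
be the device-side typedef following the same naming pattern; nevt is the number of events.

  #ifndef MGONGPUCPP_GPUIMPL
    HostBufferMomenta hstMomenta( nevt );         // plain C++ host buffer
  #else
    PinnedHostBufferMomenta hstMomenta( nevt );   // page-locked host buffer via gpuMallocHost
    DeviceBufferMomenta devMomenta( nevt );       // device buffer via gpuMalloc
    copyDeviceFromHost( devMomenta, hstMomenta ); // wraps gpuMemcpy( ..., gpuMemcpyHostToDevice )
  #endif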
@@ -352,7 +352,7 @@ namespace mg5amcCpu
 
   // The size (number of elements) per event in a memory buffer for sampling weights
   constexpr size_t sizePerEventWeights = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for sampling weights
   typedef HostBuffer<fptype, sizePerEventWeights, HostBufferALIGNED> HostBufferWeights;
 #else
@@ -370,7 +370,7 @@ namespace mg5amcCpu
 
   // The size (number of elements) per event in a memory buffer for matrix elements
   constexpr size_t sizePerEventMatrixElements = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for matrix elements
   typedef HostBuffer<fptype, sizePerEventMatrixElements, HostBufferALIGNED> HostBufferMatrixElements;
 #else
@@ -385,7 +385,7 @@ namespace mg5amcCpu
 
   // A base class encapsulating a memory buffer for the helicity mask
   typedef BufferBase<bool> BufferHelicityMask;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for the helicity mask
   typedef HostBufferBase<bool> HostBufferHelicityMask;
 #else
@@ -403,7 +403,7 @@ namespace mg5amcCpu
 
   // The size (number of elements) per event in a memory buffer for wavefunctions
   constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for wavefunctions
   typedef HostBuffer<fptype, sizePerEventWavefunctions, HostBufferALIGNED> HostBufferWavefunctions;
 #else
@@ -421,7 +421,7 @@ namespace mg5amcCpu
 
   // The size (number of elements) per event in a memory buffer for helicity random numbers
   constexpr size_t sizePerEventRndNumHelicity = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for helicity random numbers
   typedef HostBuffer<fptype, sizePerEventRndNumHelicity, HostBufferALIGNED> HostBufferRndNumHelicity;
 #else
@@ -439,7 +439,7 @@ namespace mg5amcCpu
 
   // The size (number of elements) per event in a memory buffer for color random numbers
   constexpr size_t sizePerEventRndNumColor = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for color random numbers
   typedef HostBuffer<fptype, sizePerEventRndNumColor, HostBufferALIGNED> HostBufferRndNumColor;
 #else
@@ -457,7 +457,7 @@ namespace mg5amcCpu
 
   // The size (number of elements) per event in a memory buffer for helicity selection
   constexpr size_t sizePerEventSelectedHelicity = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for helicity selection
   typedef HostBuffer<int, sizePerEventSelectedHelicity, HostBufferALIGNED> HostBufferSelectedHelicity;
 #else
@@ -475,7 +475,7 @@ namespace mg5amcCpu
 
   // The size (number of elements) per event in a memory buffer for color selection
   constexpr size_t sizePerEventSelectedColor = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for color selection
   typedef HostBuffer<int, sizePerEventSelectedColor, HostBufferALIGNED> HostBufferSelectedColor;
 #else
@@ -487,7 +487,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<class Tdst, class Tsrc>
   void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy
   {
@@ -504,13 +504,13 @@
       throw std::runtime_error( sstr.str() );
     }
     // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array
-    checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) );
+    gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice );
   }
 #endif
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<class Tdst, class Tsrc>
   void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy
   {
@@ -527,7 +527,7 @@
       throw std::runtime_error( sstr.str() );
     }
     // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array
-    checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) );
+    gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost );
   }
 #endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc
index 7f14b5e299..40d8bdea5f 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc
@@ -4,7 +4,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
 // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
@@ -16,7 +16,6 @@
 
 #include "mgOnGpuConfig.h"
 
-#include "CudaRuntime.h"
 #include "HelAmps_sm.h"
 #include "MemoryAccessAmplitudes.h"
 #include "MemoryAccessCouplings.h"
@@ -46,7 +45,7 @@
 // Class member functions for calculating the matrix elements for
 // Process: g g > t t~ WEIGHTED<=2
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -80,7 +79,7 @@ namespace mg5amcCpu
   __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT };
   __device__ const fptype* cIPC = nullptr; // unused as nicoup=0
 #else
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   __device__ __constant__ fptype cIPD[2];
   __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0
 #else
@@ -90,7 +89,7 @@ namespace mg5amcCpu
 #endif
 
   // Helicity combinations (and filtering of "good" helicity combinations)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   __device__ __constant__ short cHel[ncomb][npar];
   __device__ __constant__ int cNGoodHel;
   __device__ __constant__ int cGoodHel[ncomb];
@@ -118,13 +117,13 @@ namespace mg5amcCpu
     fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
 #endif
     fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
     , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
 #endif
     ) //ALWAYS_INLINE // attributes are not permitted in a function definition
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     using namespace mg5amcGpu;
     using M_ACCESS = DeviceAccessMomenta;        // non-trivial access: buffer includes all events
     using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events
@@ -151,7 +150,7 @@
 #endif /* clang-format on */
     mgDebug( 0, __FUNCTION__ );
     //printf( "calculate_wavefunctions: ihel=%2d\n", ihel );
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
     //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 );
 #endif
 
@@ -187,7 +186,7 @@
 #endif
     for( int iParity = 0; iParity < nParity; ++iParity )
     { // START LOOP ON IPARITY
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
       const int ievt0 = ievt00 + iParity * neppV;
 #endif
       constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings
@@ -200,8 +199,10 @@
         allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event
       for( size_t iicoup = 0; iicoup < nicoup; iicoup++ )
         allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events
+#ifdef MGONGPUCPP_GPUIMPL
 #ifdef __CUDACC__
 #pragma nv_diagnostic pop
+#endif
       // CUDA kernels take input/output buffers with momenta/MEs for all events
       const fptype* momenta = allmomenta;
       const fptype* COUPs[nxcoup];
@@ -302,7 +303,7 @@
         { 16, -2 },
         { -2, 16 } }; // 2-D array[2][2]
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
       // Pre-compute a constexpr triangular color matrix properly normalized #475
       struct TriangularNormalizedColorMatrix
       {
@@ -359,7 +360,7 @@
 #endif
       for( int icol = 0; icol < ncolor; icol++ )
       {
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
         // === C++ START ===
        // Diagonal terms
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
@@ -418,7 +419,7 @@
       MEs_sv_previous += deltaMEs_previous;
 #endif
       /*
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv );
 #else
 #ifdef MGONGPU_CPPSIMD
@@ -465,8 +466,8 @@
       { 1, 1, -1, -1 },
       { 1, 1, 1, 1 },
       { 1, 1, 1, -1 } };
-#ifdef __CUDACC__
-    checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) );
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) );
 #else
     memcpy( cHel, tHel, ncomb * npar * sizeof( short ) );
 #endif
@@ -506,9 +507,9 @@
     // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory
     const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT };
     //const cxtype tIPC[0] = { ... }; // nicoup=0
-#ifdef __CUDACC__
-    checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) );
-    //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) );
+    //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0
 #else
     memcpy( cIPD, tIPD, 2 * sizeof( fptype ) );
     //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0
@@ -544,7 +545,7 @@
   {
     std::stringstream out;
     // CUDA version (NVCC)
-    // [Use __NVCC__ instead of __CUDACC__ here!]
+    // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!]
    // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file]
    // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712]
 #ifdef __NVCC__
@@ -609,12 +610,12 @@
  __global__ void /* clang-format off */
  computeDependentCouplings( const fptype* allgs, // input: Gs[nevt]
                             fptype* allcouplings // output: couplings[nevt*ndcoup*2]
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
                             , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
                             ) /* clang-format on */
  {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
    using namespace mg5amcGpu;
    using G_ACCESS = DeviceAccessGs;
    using C_ACCESS = DeviceAccessCouplings;
@@ -635,7 +636,7 @@
 
  //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
  __global__ void
  sigmaKin_getGoodHel( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
@@ -761,9 +762,9 @@
        nGoodHel++;
      }
    }
-#ifdef __CUDACC__
-    checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) );
-    checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) );
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) );
+    gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) );
 #else
    cNGoodHel = nGoodHel;
    for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel];
@@ -787,7 +788,7 @@
 #endif
            int* allselhel, // output: helicity selection[nevt]
            int* allselcol  // output: helicity selection[nevt]
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
            , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
            ) /* clang-format on */
@@ -807,7 +808,7 @@
    // Denominators: spins, colors and identical particles
    constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343)
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
    // Remember: in CUDA this is a kernel for one event, in c++ this processes n events
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
 #else
@@ -821,9 +822,12 @@
 #endif
 
    // Start sigmaKin_lines
+
+#include "GpuAbstraction.h"
+
    // === PART 0 - INITIALISATION (before calculate_wavefunctions) ===
    // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
    allMEs[ievt] = 0;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    allNumerators[ievt] = 0;
@@ -851,7 +855,7 @@
 
    // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS ===
    // (in both CUDA and C++, using precomputed good helicities)
-#ifdef __CUDACC__ // CUDA OR C++
+#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
 
    // *** START OF PART 1a - CUDA (one event per CPU thread) ***
    // Running sum of partial amplitudes squared for event by event color selection (#402)
@@ -1061,7 +1065,7 @@
    // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
    // [NB 'sum over final spins, average over initial spins', eg see
    // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
    allMEs[ievt] /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt];
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h
index 448175be9d..f8a20b77fc 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h
@@ -4,7 +4,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
 // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
@@ -25,7 +25,7 @@
 
 //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -107,7 +107,7 @@ namespace mg5amcCpu
 
  //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
  __global__ void
  computeDependentCouplings( const fptype* allgs,    // input: Gs[nevt]
                             fptype* allcouplings ); // output: couplings[nevt*ndcoup*2]
@@ -120,7 +120,7 @@ namespace mg5amcCpu
 
  //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
  __global__ void
  sigmaKin_getGoodHel( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
@@ -150,7 +150,7 @@ namespace mg5amcCpu
 
  //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
  __global__ void
  sigmaKin( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
            const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CudaRuntime.h
deleted file mode 120000
index ce9e1a487a..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CudaRuntime.h
+++ /dev/null
@@ -1 +0,0 @@
--../CudaRuntime.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuAbstraction.h
new file mode 120000
index 0000000000..72054e19ba
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuAbstraction.h
@@ -0,0 +1 @@
+../GpuAbstraction.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuRuntime.h
new file mode 120000
index 0000000000..3920e83be4
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuRuntime.h
@@ -0,0 +1 @@
+../GpuRuntime.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc
index 3fbf0ffbee..7cac5ab47b 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc
@@ -4,7 +4,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 
 #include "mgOnGpuConfig.h"
@@ -12,6 +12,7 @@
 #include "BridgeKernels.h"
 #include "CPPProcess.h"
 #include "CrossSectionKernels.h"
+#include "GpuRuntime.h"
 #include "MatrixElementKernels.h"
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
@@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 )
  std::cout << std::endl;
  std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl;
  std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl;
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
  std::cout << std::endl;
  std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl;
@@ -96,7 +97,7 @@
 int main( int argc, char** argv )
 {
  // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
  using namespace mg5amcGpu;
 #else
  using namespace mg5amcCpu;
@@ -134,9 +135,11 @@
    CurandDevice = 2
  };
 #ifdef MGONGPU_HAS_NO_CURAND
-  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784)
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785)
+#elif defined __HIPCC__
+#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random
 #elif defined __CUDACC__
-  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand
+  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand
 #else
  RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
 #endif
@@ -146,10 +149,10 @@
    RamboHost = 1,
    RamboDevice = 2
  };
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
 #else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;  // default on CPU
 #endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
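 // (with bridge on, the BridgeKernelDevice/BridgeKernelHost variants further down are used
 // instead of the standalone MatrixElementKernel classes)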
  bool bridge = false;
@@ -177,7 +180,7 @@
    else if( arg == "--curdev" )
    {
 #ifndef __CUDACC__
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
 #elif defined MGONGPU_HAS_NO_CURAND
      throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
 #else
@@ -198,7 +201,7 @@
    }
    else if( arg == "--rmbdev" )
    {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
      rmbsmp = RamboSamplingMode::RamboDevice;
 #else
      throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@
      return usage( argv[0] );
  }
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
  ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
  // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
  // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
  // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@
 
  // === STEP 0 - INITIALISE
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 
-  // --- 00. Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
  timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime GpuRuntime( debug );
 #endif
 
  // --- 0a. Initialise physics process
@@ -325,7 +328,7 @@
  timermap.start( alloKey );
 
  // Memory buffers for random numbers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
  HostBufferRndNumMomenta hstRndmom( nevt );
 #else
  PinnedHostBufferRndNumMomenta hstRndmom( nevt );
@@ -333,7 +336,7 @@
 #endif
 
  // Memory buffers for sampling weights
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
  HostBufferWeights hstWeights( nevt );
 #else
  PinnedHostBufferWeights hstWeights( nevt );
@@ -341,7 +344,7 @@
 #endif
 
  // Memory buffers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
  HostBufferMomenta hstMomenta( nevt );
 #else
  PinnedHostBufferMomenta hstMomenta( nevt );
@@ -349,7 +352,7 @@
 #endif
 
  // Memory buffers for Gs
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
  HostBufferGs hstGs( nevt );
 #else
  PinnedHostBufferGs hstGs( nevt );
@@ -366,7 +369,7 @@
  }
 
  // Memory buffers for matrix elements
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
  HostBufferMatrixElements hstMatrixElements( nevt );
 #else
  PinnedHostBufferMatrixElements hstMatrixElements( nevt );
@@ -375,7 +378,7 @@
 
  // Memory buffers for random numbers for helicity selection
  // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
  HostBufferRndNumHelicity hstRndHel( nevt );
 #else
  PinnedHostBufferRndNumHelicity hstRndHel( nevt );
@@ -384,7 +387,7 @@
 
  // Memory buffers for random numbers for color selection
  // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
  HostBufferRndNumColor hstRndCol( nevt );
 #else
  PinnedHostBufferRndNumColor hstRndCol( nevt );
@@ -392,7 +395,7 @@
 #endif
 
  // Memory buffers for helicity selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
  HostBufferSelectedHelicity hstSelHel( nevt );
 #else
  PinnedHostBufferSelectedHelicity hstSelHel( nevt );
@@ -400,7 +403,7 @@
 #endif
 
  // Memory buffers for color selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
  HostBufferSelectedColor hstSelCol( nevt );
 #else
  PinnedHostBufferSelectedColor hstSelCol( nevt );
@@ -438,7 +441,7 @@
      const bool onDevice = true;
      prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) );
 #else
-      throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
+      throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
 #endif
  }
@@ -450,7 +453,7 @@
  }
  else
  {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
    prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) );
 #else
    throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
@@ -461,7 +464,7 @@
  std::unique_ptr<MatrixElementKernelBase> pmek;
  if( !bridge )
  {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
    pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) );
 #else
    pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -469,7 +472,7 @@
  }
  else
  {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
    pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) );
 #else
    pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -511,7 +514,7 @@
    prnk->generateRnarray();
    //std::cout << "Got random numbers" << std::endl;
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
    if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
    {
      // --- 1c. Copy rndmom from host to device
@@ -543,7 +546,7 @@
    prsk->getMomentaFinal();
    //std::cout << "Got final momenta" << std::endl;
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
    if( rmbsmp == RamboSamplingMode::RamboDevice )
    {
      // --- 2c. CopyDToH Weights
@@ -588,7 +591,7 @@
      dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F();
  }
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
  // --- 2d. CopyHToD Momenta
  const std::string gKey = "0.. CpHTDg";
  rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER!
@@ -617,7 +620,7 @@
  wv3atime += timermap.stop(); // calc only
  wavetime += wv3atime;        // calc plus copy
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
  if( !bridge )
  {
    // --- 3b. CopyDToH MEs
@@ -760,18 +763,22 @@
    rndgentxt = "CURAND DEVICE";
 #ifdef __CUDACC__
  rndgentxt += " (CUDA code)";
+#elif defined __HIPCC__
+  rndgentxt += " (HIP code)";
 #else
  rndgentxt += " (C++ code)";
 #endif
 
  // Workflow description summary
  std::string wrkflwtxt;
-  // -- CUDA or C++?
+  // -- CUDA or HIP or C++?
 #ifdef __CUDACC__
  wrkflwtxt += "CUD:";
+#elif defined __HIPCC__
+  wrkflwtxt += "HIP:";
 #else
  wrkflwtxt += "CPP:";
-#endif
+#endif /* clang-format off */
  // -- DOUBLE or FLOAT?
 #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
  wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537)
@@ -781,7 +788,7 @@
  wrkflwtxt += "FLT+";
 #else
  wrkflwtxt += "???+"; // no path to this statement
-#endif
+#endif /* clang-format on */
  // -- CUCOMPLEX or THRUST or STD complex numbers?
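 // (HIP builds take the new MGONGPU_CUCXTYPE_CXSMPL branch added below, since the cucomplex
 // and thrust options are CUDA-only)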
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc index 20496eaa70..5f57cf55f3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ WEIGHTED<=2 // Process: s s~ > t t~ WEIGHTED<=2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 
+202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -279,7 +280,7 @@ namespace mg5amcCpu { 9, 3 }, { 3, 9 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -336,7 +337,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -395,7 +396,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -442,8 +443,8 @@ namespace mg5amcCpu { -1, 1, -1, -1 }, { -1, 1, 1, 1 }, { -1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -483,9 +484,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -521,7 +522,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
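// NB: the gpuMemcpyToSymbol calls above, which replace the old
// checkCuda( cudaMemcpyToSymbol( ... ) ), rely on the new GpuAbstraction.h,
// whose body is not shown in this patch. A hedged sketch of the mapping it is
// assumed to provide, with checkGpu standing in for the patch's
// error-checking helper:
#if defined __CUDACC__
#define gpuMemcpyToSymbol( symbol, src, count ) checkGpu( cudaMemcpyToSymbol( symbol, src, count ) )
#elif defined __HIPCC__
#define gpuMemcpyToSymbol( symbol, src, count ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, count ) )
#endif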
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -586,12 +587,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -612,7 +613,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -738,9 +739,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -764,7 +765,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -784,7 +785,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -798,9 +799,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -828,7 +832,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1038,7 +1042,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h index e166fa1652..6498b91441 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
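// NB: a few hunks below, "00 CudaInit"/CudaRuntime becomes "00 GpuInit"/GpuRuntime.
// GpuRuntime.h is only added as a symlink in this patch, so its body is not
// visible here; from the comments in that hunk it is assumed to be an RAII
// helper of roughly this shape (gpuSetDevice/gpuDeviceReset being the
// abstracted runtime calls, mapped to their cuda*/hip* equivalents):
struct GpuRuntime final
{
  GpuRuntime( const bool debug = false ) : m_debug( debug ) { checkGpu( gpuSetDevice( 0 ) ); } // select GPU #0 at the start of main
  ~GpuRuntime() { checkGpu( gpuDeviceReset() ); } // release the device context when main returns
  const bool m_debug;
};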
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index afeebde3c6..0e4d5d1157 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -505,7 +506,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -562,7 +563,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -621,7 +622,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -684,8 +685,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -726,9 +727,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -765,7 +766,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -830,12 +831,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -856,7 +857,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -982,9 +983,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1008,7 +1009,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1028,7 +1029,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1042,9 +1043,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1072,7 +1076,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1282,7 +1286,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 37d6ebe981..11f562273e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
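// NB: a self-contained restatement of the default random-number ladder in the
// hunk above, for reference only. MGONGPU_HAS_NO_CURAND is set by the build
// system, and the #error branch enforces that HIP builds always set it,
// because curand is a CUDA-only library:
#ifdef MGONGPU_HAS_NO_CURAND
constexpr const char* rndDefault = "CommonRandom"; // the only mode without curand
#elif defined __HIPCC__
#error HIP builds must set MGONGPU_HAS_NO_CURAND (curand is CUDA-only)
#elif defined __CUDACC__
constexpr const char* rndDefault = "CurandDevice"; // curand running on the NVidia device
#else
constexpr const char* rndDefault = "CurandHost"; // curand running on the host CPU
#endif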
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
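// NB: in the buffer hunks above, GPU builds (MGONGPUCPP_GPUIMPL) select the
// PinnedHostBuffer* types while C++ builds select plain HostBuffer* ones. The
// Pinned variants are assumed to allocate page-locked host memory, so the
// CpHTDg/CpDTH copies timed above can run at full bus bandwidth; a minimal
// sketch, with gpuMallocHost/gpuFreeHost standing in for
// cudaMallocHost+cudaFreeHost or hipHostMalloc+hipHostFree:
template<typename T>
struct PinnedHostBufferSketch
{
  explicit PinnedHostBufferSketch( const int n ) { checkGpu( gpuMallocHost( (void**)&m_data, n * sizeof( T ) ) ); } // page-locked allocation
  ~PinnedHostBufferSketch() { checkGpu( gpuFreeHost( m_data ) ); } // page-locked free
  T* m_data = nullptr;
};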
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index b7e3475679..e098c03e3a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ d WEIGHTED<=3 @1 // Process: g s > t t~ s WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ 
-203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, -1, 1, 1, 1 }, { 1, -1, 1, -1, -1 }, { 1, -1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
     // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file]
     // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712]
 #ifdef __NVCC__
@@ -666,12 +667,12 @@ namespace mg5amcCpu
   __global__ void /* clang-format off */
   computeDependentCouplings( const fptype* allgs, // input: Gs[nevt]
                              fptype* allcouplings // output: couplings[nevt*ndcoup*2]
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
                              , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
                              ) /* clang-format on */
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     using namespace mg5amcGpu;
     using G_ACCESS = DeviceAccessGs;
     using C_ACCESS = DeviceAccessCouplings;
@@ -692,7 +693,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
   __global__ void
   sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
@@ -818,9 +819,9 @@ namespace mg5amcCpu
         nGoodHel++;
       }
     }
-#ifdef __CUDACC__
-    checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) );
-    checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) );
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) );
+    gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) );
 #else
     cNGoodHel = nGoodHel;
     for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel];
@@ -844,7 +845,7 @@ namespace mg5amcCpu
 #endif
             int* allselhel, // output: helicity selection[nevt]
             int* allselcol // output: helicity selection[nevt]
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
             , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
             ) /* clang-format on */
@@ -864,7 +865,7 @@ namespace mg5amcCpu
 
     // Denominators: spins, colors and identical particles
     constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     // Remember: in CUDA this is a kernel for one event, in c++ this processes n events
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
 #else
@@ -878,9 +879,12 @@ namespace mg5amcCpu
 #endif
 
     // Start sigmaKin_lines
+
+#include "GpuAbstraction.h"
+
     // === PART 0 - INITIALISATION (before calculate_wavefunctions) ===
     // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     allMEs[ievt] = 0;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     allNumerators[ievt] = 0;
@@ -908,7 +912,7 @@ namespace mg5amcCpu
     // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS ===
     // (in both CUDA and C++, using precomputed good helicities)
-#ifdef __CUDACC__ // CUDA OR C++
+#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
 
     // *** START OF PART 1a - CUDA (one event per CPU thread) ***
 
     // Running sum of partial amplitudes squared for event by event color selection (#402)
@@ -1118,7 +1122,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
    // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     allMEs[ievt] /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt];
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h
index bf037c6c28..ce22572055 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h
@@ -4,7 +4,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
 // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
@@ -25,7 +25,7 @@
 
 //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -110,7 +110,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   __global__ void
   computeDependentCouplings( const fptype* allgs, // input: Gs[nevt]
                              fptype* allcouplings ); // output: couplings[nevt*ndcoup*2]
@@ -123,7 +123,7 @@ namespace mg5amcCpu
 
  //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
   __global__ void
   sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
@@ -153,7 +153,7 @@ namespace mg5amcCpu
 
  //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
   __global__ void
   sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4]
             const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h
deleted file mode 120000
index ce9e1a487a..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h
+++ /dev/null
@@ -1 +0,0 @@
-../CudaRuntime.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h
new file mode 120000
index 0000000000..72054e19ba
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h
@@ -0,0 +1 @@
+../GpuAbstraction.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h
new file mode 120000
index 0000000000..3920e83be4
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h
@@ -0,0 +1 @@
+../GpuRuntime.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc
index 3fbf0ffbee..7cac5ab47b 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc
@@ -4,7 +4,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 
 #include "mgOnGpuConfig.h"
@@ -12,6 +12,7 @@
 #include "BridgeKernels.h"
 #include "CPPProcess.h"
 #include "CrossSectionKernels.h"
+#include "GpuRuntime.h"
 #include "MatrixElementKernels.h"
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
@@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 )
   std::cout << std::endl;
   std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl;
   std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl;
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
   std::cout << std::endl;
   std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl;
@@ -96,7 +97,7 @@ int
 main( int argc, char** argv )
 {
   // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   using namespace mg5amcGpu;
 #else
   using namespace mg5amcCpu;
@@ -134,9 +135,11 @@ main( int argc, char** argv )
     CurandDevice = 2
   };
 #ifdef MGONGPU_HAS_NO_CURAND
-  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784)
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785)
+#elif defined __HIPCC__
+#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random
 #elif defined __CUDACC__
-  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand
+  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand
 #else
   RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
 #endif
@@ -146,10 +149,10 @@ main( int argc, char** argv )
     RamboHost = 1,
     RamboDevice = 2
   };
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
 #else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
 #endif
 
   // Bridge emulation mode (NB Bridge implies RamboHost!)
   bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
     else if( arg == "--curdev" )
     {
 #ifndef __CUDACC__
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
 #elif defined MGONGPU_HAS_NO_CURAND
       throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
 #else
@@ -198,7 +201,7 @@ main( int argc, char** argv )
     }
     else if( arg == "--rmbdev" )
     {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       rmbsmp = RamboSamplingMode::RamboDevice;
 #else
       throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@ main( int argc, char** argv )
       return usage( argv[0] );
   }
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
   ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
   // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
   // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@ main( int argc, char** argv )
 
   // === STEP 0 - INITIALISE
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 
-  // --- 00. Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginnining of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime GpuRuntime( debug );
 #endif
 
   // --- 0a. Initialise physics process
@@ -325,7 +328,7 @@ main( int argc, char** argv )
   timermap.start( alloKey );
 
   // Memory buffers for random numbers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferRndNumMomenta hstRndmom( nevt );
 #else
   PinnedHostBufferRndNumMomenta hstRndmom( nevt );
@@ -333,7 +336,7 @@ main( int argc, char** argv )
 #endif
 
   // Memory buffers for sampling weights
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferWeights hstWeights( nevt );
 #else
   PinnedHostBufferWeights hstWeights( nevt );
@@ -341,7 +344,7 @@ main( int argc, char** argv )
 #endif
 
   // Memory buffers for momenta
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferMomenta hstMomenta( nevt );
 #else
   PinnedHostBufferMomenta hstMomenta( nevt );
@@ -349,7 +352,7 @@ main( int argc, char** argv )
 #endif
 
   // Memory buffers for Gs
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferGs hstGs( nevt );
 #else
   PinnedHostBufferGs hstGs( nevt );
@@ -366,7 +369,7 @@ main( int argc, char** argv )
   }
 
   // Memory buffers for matrix elements
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferMatrixElements hstMatrixElements( nevt );
 #else
   PinnedHostBufferMatrixElements hstMatrixElements( nevt );
@@ -375,7 +378,7 @@ main( int argc, char** argv )
 
   // Memory buffers for random numbers for helicity selection
   // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferRndNumHelicity hstRndHel( nevt );
 #else
   PinnedHostBufferRndNumHelicity hstRndHel( nevt );
@@ -384,7 +387,7 @@ main( int argc, char** argv )
 
   // Memory buffers for random numbers for color selection
   // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) ***
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferRndNumColor hstRndCol( nevt );
 #else
   PinnedHostBufferRndNumColor hstRndCol( nevt );
@@ -392,7 +395,7 @@ main( int argc, char** argv )
 #endif
 
  // Memory buffers for helicity selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferSelectedHelicity hstSelHel( nevt );
 #else
   PinnedHostBufferSelectedHelicity hstSelHel( nevt );
@@ -400,7 +403,7 @@ main( int argc, char** argv )
 #endif
 
  // Memory buffers for color selection
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   HostBufferSelectedColor hstSelCol( nevt );
 #else
   PinnedHostBufferSelectedColor hstSelCol( nevt );
@@ -438,7 +441,7 @@ main( int argc, char** argv )
     const bool onDevice = true;
     prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) );
 #else
-    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
+    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
 #endif
   }
@@ -450,7 +453,7 @@ main( int argc, char** argv )
   }
   else
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) );
 #else
     throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
@@ -461,7 +464,7 @@ main( int argc, char** argv )
   std::unique_ptr<MatrixElementKernelBase> pmek;
   if( !bridge )
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) );
 #else
     pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -469,7 +472,7 @@ main( int argc, char** argv )
   }
   else
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) );
 #else
     pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -511,7 +514,7 @@ main( int argc, char** argv )
     prnk->generateRnarray();
     //std::cout << "Got random numbers" << std::endl;
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
     {
       // --- 1c. Copy rndmom from host to device
@@ -543,7 +546,7 @@ main( int argc, char** argv )
     prsk->getMomentaFinal();
     //std::cout << "Got final momenta" << std::endl;
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( rmbsmp == RamboSamplingMode::RamboDevice )
     {
       // --- 2c. CopyDToH Weights
@@ -588,7 +591,7 @@ main( int argc, char** argv )
       dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F();
     }
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     // --- 2d. CopyHToD Momenta
     const std::string gKey = "0.. CpHTDg";
     rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER!
@@ -617,7 +620,7 @@ main( int argc, char** argv )
     wv3atime += timermap.stop(); // calc only
     wavetime += wv3atime; // calc plus copy
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( !bridge )
     {
       // --- 3b. CopyDToH MEs
@@ -760,18 +763,22 @@ main( int argc, char** argv )
     rndgentxt = "CURAND DEVICE";
 #ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
+#elif defined __HIPCC__
+  rndgentxt += " (HIP code)";
 #else
   rndgentxt += " (C++ code)";
 #endif
 
   // Workflow description summary
   std::string wrkflwtxt;
-  // -- CUDA or C++?
+  // -- CUDA or HIP or C++?
 #ifdef __CUDACC__
   wrkflwtxt += "CUD:";
+#elif defined __HIPCC__
+  wrkflwtxt += "HIP:";
 #else
   wrkflwtxt += "CPP:";
-#endif
+#endif /* clang-format off */
   // -- DOUBLE or FLOAT?
 #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
   wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537)
@@ -781,7 +788,7 @@ main( int argc, char** argv )
   wrkflwtxt += "FLT+";
 #else
   wrkflwtxt += "???+"; // no path to this statement
-#endif
+#endif /* clang-format on */
   // -- CUCOMPLEX or THRUST or STD complex numbers?
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 0f999663da..7308f8a2c7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ d~ WEIGHTED<=3 @1 // Process: g s~ > t t~ s~ WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, -1 }, { 1, 1, 1, -1, 1 }, { 1, 1, 1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +819,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +845,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +865,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +879,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index 0f49f5247b..46c4347506 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index 87830582d7..b37df5d33f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ g WEIGHTED<=3 @1 // Process: s s~ > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { -1, 1, 1, 1, 1 }, { -1, 1, 1, -1, -1 }, { -1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +819,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +845,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +865,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +879,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h index f8bdb38aee..fc7c0d8196 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
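A note on the random-number hunk above, before the bridge flag is declared: the new #error line encodes an invariant rather than a reachable code path. Curand is CUDA-only, so a HIP build is expected to arrive here with MGONGPU_HAS_NO_CURAND already set, and AMD GPUs therefore always fall back to the common (host-side) generator. A hypothetical restatement of the resulting defaults, for illustration only (the enum mirrors the one declared in main above):
// Hypothetical restatement of the default RandomNumberMode per build flavour (sketch, not project code).
enum class RandomNumberModeSketch{ CommonRandom = 0, CurandHost = 1, CurandDevice = 2 };
constexpr RandomNumberModeSketch defaultRndgen()
{
#ifdef MGONGPU_HAS_NO_CURAND
  return RandomNumberModeSketch::CommonRandom; // the only option without curand - includes every HIP/AMD build
#elif defined __CUDACC__
  return RandomNumberModeSketch::CurandDevice; // NVidia GPU: generate random numbers on the device
#else
  return RandomNumberModeSketch::CurandHost; // C++ build with curand: generate on the host
#endif
}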
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
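For the HIP branch added to the cascade below: the workflow tag there only recognises MGONGPU_CUCXTYPE_CXSMPL ("CXS:"), the plugin's own "simple complex" class, since the HIP branch, unlike the CUDA one, offers no cuComplex or thrust alternative. That class is defined elsewhere and not shown in these hunks; the following is only a rough sketch of the shape such a host/device-friendly complex type can take, assumed for illustration:
// Rough sketch of a minimal host/device "simple complex" type (assumed shape, not project code).
#ifndef MGONGPUCPP_GPUIMPL
#define __host__ // sketch shortcut: let the same source compile with a plain C++ compiler
#define __device__
#endif
template<typename FP>
struct cxsmpl_sketch
{
  FP re, im;
  __host__ __device__ constexpr cxsmpl_sketch operator+( const cxsmpl_sketch& b ) const { return { re + b.re, im + b.im }; }
  __host__ __device__ constexpr cxsmpl_sketch operator*( const cxsmpl_sketch& b ) const { return { re * b.re - im * b.im, re * b.im + im * b.re }; }
};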
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc index 9051b3108d..b4df38fb35 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both 
dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -2417,7 +2418,7 @@ namespace mg5amcCpu { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -2474,7 +2475,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -2533,7 +2534,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -2628,8 +2629,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -2671,9 +2672,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -2711,7 +2712,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -2776,12 +2777,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -2802,7 +2803,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -2928,9 +2929,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -2954,7 +2955,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -2974,7 +2975,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -2988,9 +2989,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -3018,7 +3022,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -3228,7 +3232,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h index 9f43559181..511b053c2a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
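The GpuRuntime.h included above (in place of the deleted CudaRuntime.h) is only referenced, never shown, in these hunks; the "00 GpuInit" comment further down describes its contract: cudaSetDevice(0) on construction, cudaDeviceReset() booked for destruction. A minimal RAII sketch under the assumption that GpuAbstraction.h provides vendor-neutral gpuSetDevice/gpuDeviceReset aliases (names assumed here, error checking omitted):
// Minimal RAII sketch of a GpuRuntime-style session object (assumed, not the shipped header).
struct GpuRuntimeSketch final
{
  GpuRuntimeSketch() { gpuSetDevice( 0 ); } // gpuSetDevice: assumed alias of cudaSetDevice / hipSetDevice
  ~GpuRuntimeSketch() { gpuDeviceReset(); } // gpuDeviceReset: assumed alias of cudaDeviceReset / hipDeviceReset
  GpuRuntimeSketch( const GpuRuntimeSketch& ) = delete; // one GPU session per process lifetime
  GpuRuntimeSketch& operator=( const GpuRuntimeSketch& ) = delete;
};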
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
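The vendor-neutral gpu* calls introduced throughout this patch (gpuMemcpyToSymbol in the CPPProcess.cc hunks, the runtime calls behind GpuRuntime) presumably resolve through the new GpuAbstraction.h symlinked into each P* directory, whose contents are not reproduced here. A plausible minimal sketch of that mapping, error checking omitted: cudaMemcpyToSymbol, hipMemcpyToSymbol and HIP_SYMBOL are real runtime entry points, the rest is assumed; as in the complex-type cascade below, a plain C++ build is the fallback when neither compiler macro is defined.
// Sketch of a GpuAbstraction.h-style mapping (assumed shape, not the actual header).
#if defined __CUDACC__ // nvcc: CUDA build
#define MGONGPUCPP_GPUIMPL
#define gpuSetDevice cudaSetDevice
#define gpuDeviceReset cudaDeviceReset
#define gpuMemcpyToSymbol( symbol, src, count ) cudaMemcpyToSymbol( symbol, src, count )
#elif defined __HIPCC__ // hipcc: HIP build
#define MGONGPUCPP_GPUIMPL
#define gpuSetDevice hipSetDevice
#define gpuDeviceReset hipDeviceReset
#define gpuMemcpyToSymbol( symbol, src, count ) hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, count )
#endif // no GPU: MGONGPUCPP_GPUIMPL stays undefined and the host code paths are compiled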
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc index 866433ae8b..bc38d1f109 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g g > t t~ d d~ WEIGHTED<=4 @2 // Process: g g > t t~ s s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 +929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, -1 }, { 1, 1, 1, -1, 1, 1 }, { 1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1171,12 +1172,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1324,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1350,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1370,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1384,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1417,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1627,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h index f26b60c5bb..c411623fc8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index 1be98364ee..a17bd3518e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ g d WEIGHTED<=4 @2 // Process: g s > t t~ g s WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { 0, -2, -6, -2, -2, 6, 16, 0, 6, 16, 48, 16 }, { -2, 0, -2, -6, 6, -2, 0, 16, 16, 6, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 +929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { 1, -1, 1, -1, -1, 1 }, { 1, -1, 1, -1, 1, -1 }, { 1, -1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1171,12 +1172,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1324,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1350,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1370,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1384,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1417,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1627,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h index 853175b477..9c820a5ddb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc index dfb05016f5..6a53d09c8e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ g d~ WEIGHTED<=4 @2 // Process: g s~ > t t~ g s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { -2, 0, 0, 16, -2, -6, 6, -2, 16, 6, 48, 16 }, { 0, -2, 16, 0, -6, -2, -2, 6, 6, 16, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 +929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, -1 }, { 1, 1, 1, -1, 1, 1 }, { 1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1171,12 +1172,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1324,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1350,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1370,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1384,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1417,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1627,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h index e60cb5b6d7..a5a285b22d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
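Stepping back to the '00 GpuInit' step a few hunks above: it instantiates the new GpuRuntime in place of CudaRuntime. Its definition lives behind the GpuRuntime.h symlink and is not reproduced in this excerpt; the sketch below captures the RAII behaviour the new comment describes (device selection on construction, device reset on destruction), with gpuSetDevice/gpuDeviceReset assumed to come from the same gpu* abstraction layer and checkGpu as in the earlier sketch.

    // GpuRuntime.h, sketched: RAII wrapper replacing CudaRuntime (assumed shape)
    #include <cstdio>
    struct GpuRuntime final
    {
      explicit GpuRuntime( const bool debug = true )
        : m_debug( debug ) { setUp( m_debug ); }
      ~GpuRuntime() { tearDown( m_debug ); }
      GpuRuntime( const GpuRuntime& ) = delete;
      GpuRuntime& operator=( const GpuRuntime& ) = delete;
      // Constructor path: bind this process to device 0
      // (cudaSetDevice on CUDA, hipSetDevice on HIP)
      static void setUp( const bool debug )
      {
        checkGpu( gpuSetDevice( 0 ) );
        if( debug ) printf( "__GpuRuntime: calling gpuSetDevice(0)\n" );
      }
      // Destructor path: reset the device so profilers flush their buffers
      // (cudaDeviceReset on CUDA, hipDeviceReset on HIP)
      static void tearDown( const bool debug )
      {
        if( debug ) printf( "__GpuRuntime: calling gpuDeviceReset()\n" );
        checkGpu( gpuDeviceReset() );
      }
    private:
      const bool m_debug;
    };

With this shape the call site above, GpuRuntime GpuRuntime( debug ), keeps working unchanged on both backends; only the runtime library underneath changes.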
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc index ecef3e57ca..fedf955b6a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -51,7 +50,7 @@ // Process: c s > t t~ c s WEIGHTED<=4 @2 // Process: d s > t t~ d s WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -85,7 +84,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -95,7 +94,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -123,13 +122,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -156,7 +155,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -192,7 +191,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -205,8 +204,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -387,7 +388,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -444,7 +445,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -503,7 +504,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -598,8 +599,8 @@ namespace mg5amcCpu { -1, -1, 1, -1, -1, 1 }, { -1, -1, 1, -1, 1, -1 }, { -1, -1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -641,9 +642,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -681,7 +682,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
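The cHel/cIPD hunks above repeat one pattern: a single logical symbol backed by GPU constant memory in device builds and by file-scope static storage in the C++ build, filled through gpuMemcpyToSymbol or plain memcpy respectively. Condensed into a self-contained sketch (the array sizes here are illustrative placeholders, not the values of this process):

    // One symbol, two storage backends (sketch; sizes are placeholders)
    #include <cstring>
    constexpr int ncomb = 16; // illustrative only
    constexpr int npar = 4;   // illustrative only
    #ifdef MGONGPUCPP_GPUIMPL
    __device__ __constant__ short cHel[ncomb][npar]; // GPU constant memory (issue #39)
    #else
    static short cHel[ncomb][npar]; // C++ emulation in file-scope static memory
    #endif
    void copyHelicities( const short tHel[ncomb][npar] )
    {
    #ifdef MGONGPUCPP_GPUIMPL
      gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); // CUDA or HIP underneath
    #else
      memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); // plain host copy
    #endif
    }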
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -746,12 +747,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -772,7 +773,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -898,9 +899,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -924,7 +925,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -944,7 +945,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -958,9 +959,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -988,7 +992,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1198,7 +1202,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h index 5329710b87..8c84687f8a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -112,7 +112,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -125,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -155,7 +155,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
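One motivation behind the HostBuffer/PinnedHostBuffer split that keeps appearing in these hunks: GPU builds allocate the host side of each buffer as page-locked (pinned) memory, which speeds up the HToD/DToH copies timed in steps 1c, 2c/2d and 3b. The plugin's real buffer classes are not shown in this excerpt; a sketch of the pinned variant follows, assuming gpuMallocHost/gpuFreeHost wrappers over cudaMallocHost/hipHostMalloc in the abstraction layer.

    // PinnedHostBuffer, sketched: page-locked host memory for fast copies
    #include <cstddef>
    template<typename T>
    class PinnedHostBuffer
    {
    public:
      explicit PinnedHostBuffer( const std::size_t nevt )
        : m_size( nevt )
      {
        checkGpu( gpuMallocHost( (void**)&m_data, m_size * sizeof( T ) ) );
      }
      ~PinnedHostBuffer() { gpuFreeHost( m_data ); } // no checkGpu in a destructor
      T* data() { return m_data; }
      std::size_t size() const { return m_size; }
    private:
      T* m_data;
      const std::size_t m_size;
    };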
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
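For context on these CUCOMPLEX/THRUST/CXS/STX tags: the complex arithmetic type cxtype is chosen at compile time from the same MGONGPU_*CXTYPE macros, and the HIP branch only offers the plugin's own cxsmpl class (hence the new CXS tag). A sketch of the plausible selection logic, with fptype fixed to double and the cxsmpl template only forward-declared for brevity (assumed shape, not verbatim mgOnGpuConfig.h):

    // cxtype selection, sketched
    typedef double fptype; // double precision shown for brevity
    namespace mgOnGpu { template<typename FP> class cxsmpl; } // plugin's own complex class
    #ifdef __CUDACC__ // CUDA: cuComplex, thrust::complex or cxsmpl
    #if defined MGONGPU_CUCXTYPE_THRUST
    #include <thrust/complex.h>
    typedef thrust::complex<fptype> cxtype;
    #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX
    #include <cuComplex.h>
    typedef cuDoubleComplex cxtype;
    #elif defined MGONGPU_CUCXTYPE_CXSMPL
    typedef mgOnGpu::cxsmpl<fptype> cxtype;
    #endif
    #elif defined __HIPCC__ // HIP: only the simple class is wired up so far
    typedef mgOnGpu::cxsmpl<fptype> cxtype;
    #else // C++ build
    #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX
    #include <complex>
    typedef std::complex<fptype> cxtype;
    #else
    typedef mgOnGpu::cxsmpl<fptype> cxtype;
    #endif
    #endif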
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc index e4f9dee3a2..fc99b3bfae 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -57,7 +56,7 @@ // Process: s c~ > t t~ s c~ WEIGHTED<=4 @2 // Process: s d~ > t t~ s d~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -91,7 +90,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -101,7 +100,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -129,13 +128,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -162,7 +161,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -198,7 +197,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -211,8 +210,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -393,7 +394,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -450,7 +451,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -509,7 +510,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -604,8 +605,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, -1 }, { -1, 1, 1, -1, 1, 1 }, { -1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -647,9 +648,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -687,7 +688,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -752,12 +753,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -778,7 +779,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -904,9 +905,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -930,7 +931,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -950,7 +951,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -964,9 +965,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -994,7 +998,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1204,7 +1208,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h index 391789dc81..da747c3465 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -118,7 +118,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -131,7 +131,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -161,7 +161,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
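//--------------------------------------------------------------------------
// A minimal sketch of the random-number default selection above, assuming
// only the standard __CUDACC__ / MGONGPU_HAS_NO_CURAND macros (the enum and
// function names here are illustrative, not part of the patch):
enum class RndMode { CommonRandom, CurandHost, CurandDevice };
inline RndMode defaultRndMode()
{
#ifdef MGONGPU_HAS_NO_CURAND
  return RndMode::CommonRandom; // only option without curand (e.g. HIP/AMD builds)
#elif defined __CUDACC__
  return RndMode::CurandDevice; // default on NVidia GPUs: curand on the device
#else
  return RndMode::CurandHost; // default on CPUs: curand on the host
#endif
}
//--------------------------------------------------------------------------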
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
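//--------------------------------------------------------------------------
// A minimal sketch of the backend tag logic added above ("CUD:"/"HIP:"/"CPP:"),
// assuming only the standard compiler macros; the helper name is hypothetical:
inline const char* backendTag()
{
#ifdef __CUDACC__
  return "CUD:";
#elif defined __HIPCC__
  return "HIP:";
#else
  return "CPP:";
#endif
}
//--------------------------------------------------------------------------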
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc index 302d63e31d..97912e5855 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d > t t~ d d WEIGHTED<=4 @2 // Process: s s > t t~ s s WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -497,7 +498,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -554,7 +555,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -613,7 +614,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -708,8 +709,8 @@ namespace mg5amcCpu { -1, -1, 1, -1, -1, 1 }, { -1, -1, 1, -1, 1, -1 }, { -1, -1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -751,9 +752,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -791,7 +792,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
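//--------------------------------------------------------------------------
// The gpuMemcpyToSymbol calls introduced above come from the new
// GpuAbstraction.h layer. A minimal sketch of what such a mapping can look
// like, assuming checkCuda/checkHip error wrappers exist elsewhere; this is
// an illustration, not the verbatim header:
#ifdef __CUDACC__
#define gpuMemcpyToSymbol( symbol, src, count ) checkCuda( cudaMemcpyToSymbol( symbol, src, count ) )
#elif defined __HIPCC__
#define gpuMemcpyToSymbol( symbol, src, count ) checkHip( hipMemcpyToSymbol( symbol, src, count ) )
#endif
//--------------------------------------------------------------------------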
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -856,12 +857,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -882,7 +883,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1008,9 +1009,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1034,7 +1035,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1054,7 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1068,9 +1069,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1098,7 +1102,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1308,7 +1312,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h index 2d95f4b170..d8232ea652 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
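//--------------------------------------------------------------------------
// The "#error Internal error ..." branch above is a compile-time consistency
// check: a HIP build that still has curand enabled fails loudly instead of
// silently selecting an unsupported default. The bare pattern, as a
// standalone guard (sketch only):
#if defined __HIPCC__ && !defined MGONGPU_HAS_NO_CURAND
#error "Inconsistent build: HIP builds must define MGONGPU_HAS_NO_CURAND"
#endif
//--------------------------------------------------------------------------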
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
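//--------------------------------------------------------------------------
// A minimal RAII sketch of the GpuRuntime instantiated in the "00 GpuInit"
// step of this file, assuming the CUDA behaviour described in its comments
// (cudaSetDevice(0) on construction, cudaDeviceReset() on destruction);
// the class name and the checkCuda wrapper are assumptions, not the
// verbatim GpuRuntime.h:
struct GpuRuntimeSketch
{
  GpuRuntimeSketch( bool debug = false ) : m_debug( debug )
  {
#ifdef __CUDACC__
    checkCuda( cudaSetDevice( 0 ) ); // bind the first NVidia device
#endif
  }
  ~GpuRuntimeSketch()
  {
#ifdef __CUDACC__
    cudaDeviceReset(); // tear down the CUDA context at application exit
#endif
  }
  bool m_debug;
};
//--------------------------------------------------------------------------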
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc index d0be5131af..be2315b035 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -57,7 +56,7 @@ // Process: s s~ > t t~ c c~ WEIGHTED<=4 @2 // Process: s s~ > t t~ d d~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -91,7 +90,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -101,7 +100,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -129,13 +128,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -162,7 +161,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -198,7 +197,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -211,8 +210,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -393,7 +394,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -450,7 +451,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -509,7 +510,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -604,8 +605,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, -1 }, { -1, 1, 1, -1, 1, 1 }, { -1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -647,9 +648,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -687,7 +688,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
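//--------------------------------------------------------------------------
// On the __NVCC__ vs MGONGPUCPP_GPUIMPL remark above: __NVCC__ flags the
// nvcc compiler driver, __CUDACC__ flags CUDA compilation proper, while
// MGONGPUCPP_GPUIMPL is the plugin's own "building a GPU implementation"
// switch covering both CUDA and HIP. An illustrative definition consistent
// with its use in these diffs (assumption: the real one lives in
// mgOnGpuConfig.h):
#if ( defined __CUDACC__ || defined __HIPCC__ ) && !defined MGONGPUCPP_GPUIMPL
#define MGONGPUCPP_GPUIMPL
#endif
//--------------------------------------------------------------------------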
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -752,12 +753,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -778,7 +779,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -904,9 +905,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -930,7 +931,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -950,7 +951,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -964,9 +965,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -994,7 +998,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1204,7 +1208,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h index 14490d782f..71fdc6e547 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -118,7 +118,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -131,7 +131,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -161,7 +161,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
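//--------------------------------------------------------------------------
// A minimal sketch of the dual-namespace pattern selected above: the same
// translation unit compiles into mg5amcGpu for GPU builds and mg5amcCpu for
// C++ builds, so both object files can be linked into one binary without
// symbol clashes (the function body is illustrative):
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  inline const char* backendName()
  {
#ifdef MGONGPUCPP_GPUIMPL
    return "GPU";
#else
    return "CPU";
#endif
  }
}
//--------------------------------------------------------------------------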
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index 3a2178d534..c83b7be449 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ g g WEIGHTED<=4 @2 // Process: s s~ > t t~ g g WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { 16, -2, 0, 0, 0, 0, -2, 16, 16, 6, 48, 16 }, { 0, 0, 16, -2, -2, 16, 0, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 +929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, 1 }, { -1, 1, 1, -1, 1, -1 }, { -1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1171,12 +1172,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1324,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1350,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1370,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1384,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1417,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1627,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h index 1543c29649..e9a24f516d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
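The gpuMemcpyToSymbol calls that replace checkCuda( cudaMemcpyToSymbol( ... ) ) in the CPPProcess.cc hunks earlier in this patch are provided by the new GpuAbstraction.h header, symlinked into each subprocess directory. Its contents are not shown in these hunks; a hedged sketch of the dispatch it plausibly provides, inferred from the call sites (checkHip is a hypothetical name here, HIP_SYMBOL is the standard HIP macro):

// Assumed shape only: the real GpuAbstraction.h may differ in names and error checking.
#if defined __CUDACC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkCuda( cudaMemcpyToSymbol( symbol, src, bytes ) )
#elif defined __HIPCC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkHip( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes ) )
#endif
// On the host-only path these call sites never compile: they sit inside MGONGPUCPP_GPUIMPL blocks,
// whose #else branches use plain memcpy into file-scope static memory instead.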
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
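All of the "Memory buffers" hunks earlier in this file switch on the same condition: pageable host buffers for CPU-only builds, pinned (page-locked) host buffers when a GPU backend is compiled in, because page-locked memory allows faster, async-capable DMA transfers between host and device. A standalone CUDA sketch of that difference, independent of the plugin's buffer classes:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
int main()
{
  const size_t bytes = 1024 * sizeof( double );
  double* pageable = (double*)malloc( bytes ); // what a HostBuffer-style class would wrap
  double* pinned = nullptr;
  cudaMallocHost( (void**)&pinned, bytes ); // page-locked: what a PinnedHostBuffer-style class would wrap
  double* device = nullptr;
  cudaMalloc( (void**)&device, bytes );
  cudaMemcpy( device, pinned, bytes, cudaMemcpyHostToDevice ); // pinned source enables faster, async-capable DMA
  cudaFree( device );
  cudaFreeHost( pinned );
  free( pageable );
  return 0;
}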
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc index 70fbbee59f..3ecdb48914 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ d d~ WEIGHTED<=4 @2 // Process: s s~ > t t~ s s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -497,7 +498,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -554,7 +555,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -613,7 +614,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -708,8 +709,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, -1 }, { -1, 1, 1, -1, 1, 1 }, { -1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -751,9 +752,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -791,7 +792,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -856,12 +857,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -882,7 +883,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1008,9 +1009,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1034,7 +1035,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1054,7 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1068,9 +1069,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1098,7 +1102,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1308,7 +1312,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h index 58cece5c62..d8d3d481ea 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
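The comment just above states the key constraint for bridge mode: the Bridge kernels consume host-side momenta buffers, so device-side Rambo sampling cannot be combined with it. A tiny illustration of that invariant (this is not code from the patch, only the constraint the comment asserts):

#include <cassert>
enum class RamboSamplingMode{ RamboHost = 1, RamboDevice = 2 };
int main()
{
  const bool bridge = true; // as if --bridge were passed on the command line
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice;
  if( bridge ) rmbsmp = RamboSamplingMode::RamboHost; // "NB Bridge implies RamboHost!"
  assert( !( bridge && rmbsmp == RamboSamplingMode::RamboDevice ) );
  return 0;
}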
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
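// The "00 GpuInit" step above instantiates a GpuRuntime in place of the old
// CudaRuntime. A minimal sketch of such a RAII helper, assuming hypothetical
// gpuSetDevice/gpuDeviceReset aliases from GpuAbstraction.h (illustrative
// only; the actual GpuRuntime.h shipped by this patch may differ):
#include <iostream>
struct GpuRuntime
{
  GpuRuntime( bool debug = true ) : m_debug( debug ) { setUp( m_debug ); }
  ~GpuRuntime() { tearDown( m_debug ); }
  static void setUp( bool debug ) // select device 0 up front (cudaSetDevice or hipSetDevice)
  {
    gpuSetDevice( 0 );
    if( debug ) std::cout << "GpuRuntime: selected device 0" << std::endl;
  }
  static void tearDown( bool debug ) // flush profiler data on exit (cudaDeviceReset or hipDeviceReset)
  {
    if( debug ) std::cout << "GpuRuntime: resetting device" << std::endl;
    gpuDeviceReset();
  }
  const bool m_debug;
};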
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index 7df13a2341..e21d1f0c48 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -51,7 +50,7 @@ // Process: c~ s~ > t t~ c~ s~ WEIGHTED<=4 @2 // Process: d~ s~ > t t~ d~ s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -85,7 +84,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -95,7 +94,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -123,13 +122,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -156,7 +155,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -192,7 +191,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both 
dependent and independent couplings @@ -205,8 +204,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -387,7 +388,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -444,7 +445,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -503,7 +504,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -598,8 +599,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -641,9 +642,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -681,7 +682,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
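// The hunks above swap checkCuda( cudaMemcpyToSymbol( ... ) ) for a generic
// gpuMemcpyToSymbol( ... ). A plausible GpuAbstraction.h mapping, assuming a
// checkGpu error-checking wrapper (a sketch, not the actual header; note that
// HIP expects device symbols to be wrapped in its HIP_SYMBOL macro):
#ifdef __CUDACC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( cudaMemcpyToSymbol( symbol, src, bytes ) )
#elif defined __HIPCC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes ) )
#endif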
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -746,12 +747,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -772,7 +773,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -898,9 +899,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -924,7 +925,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -944,7 +945,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -958,9 +959,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -988,7 +992,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1198,7 +1202,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h index 6bd3135c3c..901c6dfcc9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -112,7 +112,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -125,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -155,7 +155,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc index 
3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
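// As in the hunks above, every HostBuffer* becomes a PinnedHostBuffer* when a
// GPU implementation is active: pinned (page-locked) host memory speeds up the
// HToD/DToH copies that are timed later in this file. A hedged sketch of the
// distinction, with hypothetical names (the actual MemoryBuffers.h classes may
// differ), assuming gpuMallocHost/gpuFreeHost aliases for cudaMallocHost and
// cudaFreeHost or their HIP equivalents:
#include <cstddef>
template<typename T>
struct ExampleHostBuffer // pageable host memory (CPU-only builds)
{
  ExampleHostBuffer( std::size_t n ) : data( new T[n] ) {}
  ~ExampleHostBuffer() { delete[] data; }
  T* data;
};
template<typename T>
struct ExamplePinnedHostBuffer // page-locked host memory (GPU builds)
{
  ExamplePinnedHostBuffer( std::size_t n ) { gpuMallocHost( (void**)&data, n * sizeof( T ) ); }
  ~ExamplePinnedHostBuffer() { gpuFreeHost( data ); }
  T* data;
};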
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index f464c27160..527b1d3c8f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d~ d~ > t t~ d~ d~ WEIGHTED<=4 @2 // Process: s~ s~ > t t~ s~ s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both 
dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -497,7 +498,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -554,7 +555,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -613,7 +614,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -708,8 +709,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -751,9 +752,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -791,7 +792,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
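// The "[Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!]" comment above is
// worth spelling out, since three different kinds of guard are now in play
// (an illustrative summary, not an exhaustive one):
//   __NVCC__           - the compiler is nvcc, even when building plain .cc host code
//   __CUDACC__         - nvcc is compiling this translation unit as CUDA
//   __HIPCC__          - hipcc is compiling this translation unit as HIP
//   MGONGPUCPP_GPUIMPL - some GPU implementation (CUDA or HIP) is active
// e.g. a compiler banner only cares about the compiler, not the code path:
#include <string>
inline std::string compilerBanner()
{
#ifdef __NVCC__
  return "built by nvcc";
#elif defined __HIPCC__
  return "built by hipcc";
#else
  return "built by a plain C++ compiler";
#endif
}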
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -856,12 +857,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -882,7 +883,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1008,9 +1009,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1034,7 +1035,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1054,7 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1068,9 +1069,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1098,7 +1102,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1308,7 +1312,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h index 4e53fa1250..c2ca443c0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc index 
3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# NB: for CUDA builds, -x cu (compile .cc files as CUDA) is included via CCBUILDRULEFLAGS in the %_cu.o rule above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edge case for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
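The library split above is the core build pattern of this patch: every SubProcesses source is compiled twice from the same .cc file, once with $(CXX) into the mg5amc_$(processid_short)_cpp library and once with $(GPUCC) into the mg5amc_$(processid_short)_cuda library. The two object sets can later be linked into a single executable because each source selects a different namespace per build. A minimal sketch of that dual-namespace layout in CUDA/C++ (illustrative only; the real sources apply the same guard to the full CPPProcess and kernel classes):

  // dual_build_sketch.cc - compiled once as C++ by $(CXX) and once as CUDA/HIP via $(GPUCC)
  #include "mgOnGpuConfig.h" // defines MGONGPUCPP_GPUIMPL in GPU builds only
  #ifdef MGONGPUCPP_GPUIMPL
  namespace mg5amcGpu // GPU build: symbols land in the *_cuda library
  #else
  namespace mg5amcCpu // CPU build: symbols land in the *_cpp library
  #endif
  {
    int dummyCounter = 0; // same source, two distinct mangled symbols, no ODR clash
  }

Because the namespaces differ, identically named classes from the two builds never collide at link time, which is what allows the test executable below to link both runTest.o and runTest_cu.o.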
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
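The fbridge.cc hunks above replace the CUDA-specific CudaRuntime::setUp()/tearDown() pair with the vendor-neutral GpuRuntime calls around each Bridge lifetime, and check_sa.cc earlier in this patch instantiates a GpuRuntime object for the same purpose. The idiom being generalised is a small RAII guard of the kind sketched below; this is a sketch only, since GpuRuntime.h itself is not shown in this patch, and the gpuSetDevice alias is assumed here in the same spirit as the gpuDeviceReset alias used in runTest.cc:

  // Sketch of the RAII idiom behind GpuRuntime (assumed interface, not the committed header)
  #ifdef __CUDACC__
  #define gpuSetDevice cudaSetDevice // assumed alias
  #define gpuDeviceReset cudaDeviceReset
  #elif defined __HIPCC__
  #include <hip/hip_runtime.h>
  #define gpuSetDevice hipSetDevice // assumed alias
  #define gpuDeviceReset hipDeviceReset
  #endif
  struct GpuRuntimeSketch
  {
    GpuRuntimeSketch() { setUp(); }              // select the device when the guard is created...
    ~GpuRuntimeSketch() { tearDown(); }          // ...and reset it when the guard goes out of scope
    static void setUp() { gpuSetDevice( 0 ); }   // use device #0, once per process
    static void tearDown() { gpuDeviceReset(); } // needed by cuda-memcheck --leak-check full
  };

Instantiating one such guard at the top of main gives the same setUp/tearDown bracketing that fbridgecreate_/fbridgedelete_ perform explicitly for the Fortran bridge.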
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. 
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the correct build-rule flags for the GPU compiler (nvcc needs '-x cu' to compile .cc files as CUDA; hipcc does not) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h index 80032e528b..55d03f1252 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
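Earlier in this patch, the kernel launches in RamboSamplingKernels.cc were rewritten from the CUDA-only triple-chevron syntax, e.g. getMomentaInitialDevice<<<m_gpublocks, m_gputhreads>>>(...), to the vendor-neutral gpuLaunchKernel(...) call. The definition of that wrapper is not part of the hunks shown here; under that caveat, its behaviour can be sketched in CUDA/C++ as follows (an assumed illustration, not the committed implementation):

  // Sketch of a CUDA/HIP launch wrapper in the spirit of gpuLaunchKernel (assumed, for illustration)
  #ifdef __CUDACC__
  template<typename Kernel, typename... Args>
  void gpuLaunchKernelSketch( Kernel kernel, int blocks, int threads, Args... args )
  {
    kernel<<<blocks, threads>>>( args... ); // native CUDA launch syntax
  }
  #elif defined __HIPCC__
  #include <hip/hip_runtime.h>
  template<typename Kernel, typename... Args>
  void gpuLaunchKernelSketch( Kernel kernel, int blocks, int threads, Args... args )
  {
    hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, args... ); // 0 bytes of dynamic shared memory, default stream
  }
  #endif

With a wrapper of this shape, a call like gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ) compiles unchanged under both nvcc and hipcc, which is what the RamboSamplingKernels hunks rely on.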
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, use curand, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
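For reference, the dispatch idiom introduced in mgOnGpuConfig.h above can be exercised in isolation. The following minimal sketch is illustrative only and is not part of the patch: the backend() helper and the main() driver are invented for this example. It compiles as plain C++ with g++, as CUDA with nvcc, or as HIP with hipcc, and shows the intent of the change: downstream code branches once on the unified MGONGPUCPP_GPUIMPL macro instead of repeating the __CUDACC__/__HIPCC__ pair in every header, and the same test selects between the mg5amcGpu and mg5amcCpu namespaces, as the patch does.

// Minimal standalone sketch of the MGONGPUCPP_GPUIMPL dispatch idiom
// (illustrative only; backend() and main() are invented for this example).
#include <iostream>

#ifdef __CUDACC__
#define MGONGPUCPP_GPUIMPL cuda // CUDA build (nvcc)
#elif defined __HIPCC__
#define MGONGPUCPP_GPUIMPL hip // HIP build (hipcc)
#else
#undef MGONGPUCPP_GPUIMPL // CPU build (plain C++)
#endif

// One test selects the GPU or CPU namespace, mirroring the patch.
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  inline const char* backend()
  {
#ifdef __CUDACC__
    return "CUDA";
#elif defined __HIPCC__
    return "HIP";
#else
    return "C++";
#endif
  }
}

int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  std::cout << "GPU implementation: " << mg5amcGpu::backend() << std::endl;
#else
  std::cout << "CPU implementation: " << mg5amcCpu::backend() << std::endl;
#endif
  return 0;
}

Compiled with g++ this prints "CPU implementation: C++"; under nvcc or hipcc the GPU branch is taken instead, without any source change.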
#ifndef MGONGPUCXTYPES_H
#define MGONGPUCXTYPES_H 1
@@ -19,7 +19,7 @@
#include
// Complex type in cuda: thrust or cucomplex or cxsmpl
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
#if defined MGONGPU_CUCXTYPE_THRUST
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661)
@@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */
using mgOnGpu::cxsmpl;
// Printout to stream for user defined types
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -92,7 +92,7 @@ namespace mg5amcCpu
inline __host__ std::ostream&
operator<<( std::ostream& out, const cxsmpl& c )
{
- out << std::complex( c.real(), c.imag() );
+ out << std::complex( c.real(), c.imag() );
return out;
}
@@ -215,14 +215,14 @@ namespace mg5amcCpu
//==========================================================================
// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
// --- Type definitions (complex type: cxtype)
-#ifdef __CUDACC__ // cuda
+#ifdef MGONGPUCPP_GPUIMPL // cuda
#if defined MGONGPU_CUCXTYPE_THRUST
typedef thrust::complex cxtype;
#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -255,7 +255,7 @@ namespace mg5amcCpu
//==========================================================================
// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -307,7 +307,7 @@ namespace mg5amcCpu
//==========================================================================
-#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust
+#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust
//------------------------------
// CUDA - using thrust::complex
@@ -343,11 +343,11 @@ namespace mg5amcCpu
return c;
}
-#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST
+#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST
//==========================================================================
-#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex
+#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex
//------------------------------
// CUDA - using cuComplex
@@ -562,11 +562,11 @@ namespace mg5amcCpu
return cxmake( c.real(), c.imag() );
}
-#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX
//==========================================================================
-#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex
+#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex
//------------------------------
// C++ - using std::complex
@@ -610,7 +610,7 @@ namespace mg5amcCpu
}
#endif
-#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX
+#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX
//==========================================================================
@@ -633,7 +633,7 @@
//==========================================================================
// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h
index 905c97d700..fa3a02664b 100644
--- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h
+++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUFPTYPES_H
#define MGONGPUFPTYPES_H 1
@@ -12,7 +12,7 @@
#include
// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL // cuda
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -20,7 +20,7 @@ namespace mg5amcCpu
{
//==========================================================================
-#ifdef __CUDACC__ // cuda
+#ifdef MGONGPUCPP_GPUIMPL // cuda
//------------------------------
// Floating point types - Cuda
@@ -64,11 +64,11 @@ namespace mg5amcCpu
#endif
}
-#endif // #ifdef __CUDACC__
+#endif // #ifdef MGONGPUCPP_GPUIMPL
//==========================================================================
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
//------------------------------
// Floating point types - C++
@@ -92,7 +92,7 @@ namespace mg5amcCpu
return std::sqrt( f );
}
-#endif // #ifndef __CUDACC__
+#endif // #ifndef MGONGPUCPP_GPUIMPL
//==========================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h
index e1299ba81e..cdae04326b 100644
--- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h
@@ -32,7 +32,7 @@
#endif
// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -131,7 +131,7 @@ namespace mg5amcCpu
#endif
#endif
-#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__)
+#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL)
const int neppV = 1;
@@ -153,13 +153,13 @@
//==========================================================================
// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
// Printout to stream for user defined types
@@ -805,11 +805,11 @@ namespace mg5amcCpu
#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-#endif // #ifndef __CUDACC__
+#endif // #ifndef MGONGPUCPP_GPUIMPL
//==========================================================================
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
//------------------------------
// Vector types - CUDA
@@ -853,12 +853,12 @@ namespace mg5amcCpu
return mask;
}
-#endif // #ifdef __CUDACC__
+#endif // #ifdef MGONGPUCPP_GPUIMPL
//==========================================================================
// Scalar-or-vector types: scalar in CUDA, vector or scalar in C++
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
typedef bool bool_sv;
typedef fptype fptype_sv;
typedef fptype2 fptype2_sv;
@@ -879,7 +879,7 @@ namespace mg5amcCpu
#endif
// Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++
-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); }
#elif defined MGONGPU_CPPSIMD
inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000
diff --git a/epochX/cudacpp/pp_tt012j.mad/src/rambo.h b/epochX/cudacpp/pp_tt012j.mad/src/rambo.h
index e02ea52496..cd7e1008ea 100644
--- a/epochX/cudacpp/pp_tt012j.mad/src/rambo.h
+++ b/epochX/cudacpp/pp_tt012j.mad/src/rambo.h
@@ -4,7 +4,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
//==========================================================================
#include "mgOnGpuConfig.h"
@@ -18,7 +18,7 @@
#include
// Simplified rambo version for 2 to N (with N>=2) processes with massless particles
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
@@ -83,7 +83,7 @@ namespace mg5amcCpu
static bool first = true;
if( first )
{
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
if constexpr( M_ACCESS::isOnDevice() ) // avoid
{
const int ievt0 = 0;
@@ -166,7 +166,7 @@ namespace mg5amcCpu
wt = po2log;
if( nparf != 2 )
wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1];
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
// issue warnings if weight is too small or too large
static int iwarn[5] = { 0, 0, 0, 0, 0 };
if( wt < -180. )

From 445426e63d02a841d0b0dcf49afa3fa80d459eb2 Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Thu, 25 Jan 2024 18:14:42 +0100
Subject: [PATCH 509/509] [jthip24] *** COMPLETE SYNC OF JTHIP24 AND JT774 ***

copy over the codegen logs into jthip24 from jt774 (currently commit
464703b6f6e96f7b3585663e41ec435b709e2cc5 on Thu Jan 25 18:08:40 2024 +0100)
git checkout jt774 $(git ls-tree --name-only HEAD */CODEGEN*txt)
*** NB Now all processes in the repo are the same as in jt774 (including codegen logs) ***
*** NB Now jthip24 is identical to jt774, except that jthip24 also contains extra files in .github/workflows and in tools for CI and profiling ***
git diff jthip24 jt774 --name-only
.github/workflows/a100_profiler.yml
.github/workflows/c-cpp.yml
.github/workflows/mi250x_profiler.yml
.github/workflows/sycl.yml
.github/workflows/v100s_profiler.yml
tools/profiling/README.md
tools/profiling/buildCUDAProcess.sh
tools/profiling/buildSYCLProcess.sh
tools/profiling/container-README.md
tools/profiling/containerSetup.sh
tools/profiling/evaluation.py
tools/profiling/performanceProfiler.py
tools/profiling/profileconfig.ini
tools/profiling/sendData.py
---
.../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 16 +++---
.../CODEGEN_cudacpp_ee_mumu_log.txt | 10 ++--
.../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 16 +++---
.../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 12 ++---
.../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 22 ++++----
.../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 18 +++----
.../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 12 ++---
.../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 22 ++++----
.../CODEGEN_cudacpp_gg_ttgg_log.txt | 16 +++---
.../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 22 ++++----
.../CODEGEN_cudacpp_gg_ttggg_log.txt | 14 ++---
.../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 22 ++++----
.../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 12 ++---
.../CODEGEN_cudacpp_heft_gg_h_log.txt | 10 ++--
.../CODEGEN_mad_pp_tt012j_log.txt | 54 +++++++++----------
15 files changed, 139 insertions(+), 139 deletions(-)

diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
index 3be3e9348e..dd0f31341f 100644
--- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
+++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
@@ -62,7 +62,7 @@ generate e+ e- > mu+ mu-
No model currently active, so we import the Standard Model
INFO: load particles
INFO: load vertices
-DEBUG: model prefixing takes 0.005505561828613281 
+DEBUG: model prefixing takes 0.005403280258178711 
INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions 
DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -174,7 +174,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1
INFO: Processing color information for process: e+ e- > mu+ mu- @1
INFO: Creating files in directory P1_epem_mupmum
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -191,19 +191,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1
INFO: Finding symmetric diagrams for subprocess group epem_mupmum
Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s
-Wrote files for 8 helas calls in 0.097 s
+Wrote files for 8 helas calls in 0.098 s
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates FFV2 routines
ALOHA: aloha creates FFV4 routines
-ALOHA: aloha creates 3 routines in 0.199 s
+ALOHA: aloha creates 3 routines in 0.200 s
DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates FFV2 routines
ALOHA: aloha creates FFV4 routines
ALOHA: aloha creates FFV2_4 routines
-ALOHA: aloha creates 7 routines in 0.253 s
+ALOHA: aloha creates 7 routines in 0.537 s
FFV1
FFV1
FFV2
@@ -248,9 +248,9 @@ Type "launch" to generate events from this process, or see
Run "open index.html" to see more information about this process.
quit
-real 0m1.874s
-user 0m1.628s
-sys 0m0.225s
+real 0m2.147s
+user 0m1.627s
+sys 0m0.231s
Code generation completed in 2 seconds
************************************************************
* *
diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
index 26e1484575..20d35a4a26 100644
--- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
+++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
@@ -62,7 +62,7 @@ generate e+ e- > mu+ mu-
No model currently active, so we import the Standard Model
INFO: load particles
INFO: load vertices
-DEBUG: model prefixing takes 0.005408763885498047 
+DEBUG: model prefixing takes 0.005757331848144531 
INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions 
DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -181,7 +181,7 @@ ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates FFV2 routines
ALOHA: aloha creates FFV4 routines
ALOHA: aloha creates FFV2_4 routines
-ALOHA: aloha creates 4 routines in 0.266 s
+ALOHA: aloha creates 4 routines in 0.267 s
FFV1
FFV1
FFV2
@@ -200,7 +200,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/.
quit
-real 0m0.649s
-user 0m0.586s
-sys 0m0.056s
+real 0m0.662s
+user 0m0.596s
+sys 0m0.051s
Code generation completed in 1 seconds
diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
index e35d15e679..75c84e12fb 100644
--- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
+++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~
No model currently active, so we import the Standard Model
INFO: load particles
INFO: load vertices
-DEBUG: model prefixing takes 0.00555729866027832 
+DEBUG: model prefixing takes 0.005261659622192383 
INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions 
DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1
INFO: Processing color information for process: g g > t t~ @1
INFO: Creating files in directory P1_gg_ttx
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1
INFO: Finding symmetric diagrams for subprocess group gg_ttx
Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s
-Wrote files for 10 helas calls in 0.101 s
+Wrote files for 10 helas calls in 0.100 s
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 set of routines with options: P0
ALOHA: aloha creates FFV1 routines
-ALOHA: aloha creates 2 routines in 0.143 s
+ALOHA: aloha creates 2 routines in 0.144 s
DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 set of routines with options: P0
ALOHA: aloha creates FFV1 routines
-ALOHA: aloha creates 4 routines in 0.130 s
+ALOHA: aloha creates 4 routines in 0.131 s
VVV1
FFV1
FFV1
@@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see
Run "open index.html" to see more information about this process.
quit
-real 0m1.692s
-user 0m1.443s
-sys 0m0.239s
+real 0m1.690s
+user 0m1.458s
+sys 0m0.220s
Code generation completed in 2 seconds
************************************************************
* *
diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt
index d541a897ed..5542e5323b 100644
--- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt
+++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~
No model currently active, so we import the Standard Model
INFO: load particles
INFO: load vertices
-DEBUG: model prefixing takes 0.0053708553314208984 
+DEBUG: model prefixing takes 0.005713224411010742 
INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions 
DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -180,7 +180,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 set of routines with options: P0
ALOHA: aloha creates FFV1 routines
-ALOHA: aloha creates 2 routines in 0.144 s
+ALOHA: aloha creates 2 routines in 0.145 s
VVV1
FFV1
FFV1
@@ -195,7 +195,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/.
quit
-real 0m0.545s
-user 0m0.478s
-sys 0m0.059s
-Code generation completed in 0 seconds
+real 0m0.623s
+user 0m0.466s
+sys 0m0.061s
+Code generation completed in 1 seconds
diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt
index 0e50ba9321..f38b6ec6e6 100644
--- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt
+++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~
No model currently active, so we import the Standard Model
INFO: load particles
INFO: load vertices
-DEBUG: model prefixing takes 0.005617380142211914 
+DEBUG: model prefixing takes 0.005505561828613281 
INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions 
DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -163,7 +163,7 @@ INFO: Please specify coupling orders to bypass this step.
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED
INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2
INFO: Process has 16 diagrams
-1 processes with 16 diagrams generated in 0.019 s
+1 processes with 16 diagrams generated in 0.020 s
Total: 2 processes with 19 diagrams
output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
Load PLUGIN.CUDACPP_OUTPUT
@@ -185,7 +185,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1
INFO: Processing color information for process: g g > t t~ @1
INFO: Creating files in directory P2_gg_ttxg
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -202,7 +202,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2
INFO: Finding symmetric diagrams for subprocess group gg_ttxg
INFO: Creating files in directory P1_gg_ttx
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1
INFO: Finding symmetric diagrams for subprocess group gg_ttx
-Generated helas calls for 2 subprocesses (19 diagrams) in 0.043 s
-Wrote files for 46 helas calls in 0.239 s
+Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s
+Wrote files for 46 helas calls in 0.243 s
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 routines
ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates VVVV1 set of routines with options: P0
ALOHA: aloha creates VVVV3 set of routines with options: P0
ALOHA: aloha creates VVVV4 set of routines with options: P0
-ALOHA: aloha creates 5 routines in 0.329 s
+ALOHA: aloha creates 5 routines in 0.324 s
DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 routines
@@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates VVVV1 set of routines with options: P0
ALOHA: aloha creates VVVV3 set of routines with options: P0
ALOHA: aloha creates VVVV4 set of routines with options: P0
-ALOHA: aloha creates 10 routines in 0.312 s
+ALOHA: aloha creates 10 routines in 0.308 s
VVV1
VVV1
FFV1
@@ -283,9 +283,9 @@ Type "launch" to generate events from this process, or see
Run "open index.html" to see more information about this process.
quit
-real 0m2.298s
-user 0m2.019s
-sys 0m0.276s
+real 0m2.484s
+user 0m2.030s
+sys 0m0.256s
Code generation completed in 3 seconds
************************************************************
* *
diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt
index 527b74cf99..00ae96c5fb 100644
--- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt
+++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g
No model currently active, so we import the Standard Model
INFO: load particles
INFO: load vertices
-DEBUG: model prefixing takes 0.005304574966430664 
+DEBUG: model prefixing takes 0.0055010318756103516 
INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions 
DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED
INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1
INFO: Process has 16 diagrams
-1 processes with 16 diagrams generated in 0.021 s
+1 processes with 16 diagrams generated in 0.022 s
Total: 1 processes with 16 diagrams
output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
Load PLUGIN.CUDACPP_OUTPUT
@@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1
INFO: Processing color information for process: g g > t t~ g @1
INFO: Creating files in directory P1_gg_ttxg
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -190,8 +190,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1
INFO: Finding symmetric diagrams for subprocess group gg_ttxg
-Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s
-Wrote files for 36 helas calls in 0.149 s
+Generated helas calls for 1 subprocesses (16 diagrams) in 0.039 s
+Wrote files for 36 helas calls in 0.184 s
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 routines
ALOHA: aloha creates FFV1 routines
@@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates VVVV1 set of routines with options: P0
ALOHA: aloha creates VVVV3 set of routines with options: P0
ALOHA: aloha creates VVVV4 set of routines with options: P0
-ALOHA: aloha creates 10 routines in 0.308 s
+ALOHA: aloha creates 10 routines in 0.310 s
VVV1
VVV1
FFV1
@@ -252,9 +252,9 @@ Type "launch" to generate events from this process, or see
Run "open index.html" to see more information about this process.
quit
-real 0m2.265s
-user 0m1.926s
-sys 0m0.245s
+real 0m2.571s
+user 0m1.941s
+sys 0m0.238s
Code generation completed in 2 seconds
************************************************************
* *
diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
index a4e93bc7e3..ee1a51555d 100644
--- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
+++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g
No model currently active, so we import the Standard Model
INFO: load particles
INFO: load vertices
-DEBUG: model prefixing takes 0.005494117736816406 
+DEBUG: model prefixing takes 0.0054416656494140625 
INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions 
DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM
FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h
FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc
INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/.
-Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s
+Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s
DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 routines
@@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates VVVV1 set of routines with options: P0
ALOHA: aloha creates VVVV3 set of routines with options: P0
ALOHA: aloha creates VVVV4 set of routines with options: P0
-ALOHA: aloha creates 5 routines in 0.320 s
+ALOHA: aloha creates 5 routines in 0.345 s
VVV1
VVV1
FFV1
@@ -203,7 +203,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/.
quit
-real 0m0.790s
-user 0m0.706s
-sys 0m0.056s
+real 0m0.803s
+user 0m0.731s
+sys 0m0.066s
Code generation completed in 1 seconds
diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt
index 88b6b72cf1..3a2b1ad647 100644
--- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt
+++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g g
No model currently active, so we import the Standard Model
INFO: load particles
INFO: load vertices
-DEBUG: model prefixing takes 0.0054705142974853516 
+DEBUG: model prefixing takes 0.0053348541259765625 
INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions 
DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED
INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1
INFO: Process has 123 diagrams
-1 processes with 123 diagrams generated in 0.157 s
+1 processes with 123 diagrams generated in 0.156 s
Total: 1 processes with 123 diagrams
output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
Load PLUGIN.CUDACPP_OUTPUT
@@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1
INFO: Processing color information for process: g g > t t~ g g @1
INFO: Creating files in directory P1_gg_ttxgg
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -190,15 +190,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1
INFO: Finding symmetric diagrams for subprocess group gg_ttxgg
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.429 s
-Wrote files for 222 helas calls in 0.688 s
+Generated helas calls for 1 subprocesses (123 diagrams) in 0.437 s
+Wrote files for 222 helas calls in 0.735 s
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 routines
ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates VVVV1 routines
ALOHA: aloha creates VVVV3 routines
ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 5 routines in 0.331 s
+ALOHA: aloha creates 5 routines in 0.441 s
DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 routines
@@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates VVVV1 routines
ALOHA: aloha creates VVVV3 routines
ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 10 routines in 0.315 s
+ALOHA: aloha creates 10 routines in 0.309 s
VVV1
VVV1
FFV1
@@ -255,10 +255,10 @@ Type "launch" to generate events from this process, or see
Run "open index.html" to see more information about this process.
quit
-real 0m3.278s
-user 0m3.005s
-sys 0m0.263s
-Code generation completed in 4 seconds
+real 0m3.582s
+user 0m3.061s
+sys 0m0.243s
+Code generation completed in 3 seconds
************************************************************
* *
* W E L C O M E to *
diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
index 3f8e2f83ed..1b6c420503 100644
--- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
+++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g g
No model currently active, so we import the Standard Model
INFO: load particles
INFO: load vertices
-DEBUG: model prefixing takes 0.005356311798095703 
+DEBUG: model prefixing takes 0.005376100540161133 
INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions 
DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED
INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1
INFO: Process has 123 diagrams
-1 processes with 123 diagrams generated in 0.159 s
+1 processes with 123 diagrams generated in 0.156 s
Total: 1 processes with 123 diagrams
output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg
Load PLUGIN.CUDACPP_OUTPUT
@@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM
FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h
FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc
INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/.
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.420 s
+Generated helas calls for 1 subprocesses (123 diagrams) in 0.427 s
DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 routines
@@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates VVVV1 routines
ALOHA: aloha creates VVVV3 routines
ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 5 routines in 0.315 s
+ALOHA: aloha creates 5 routines in 0.319 s
VVV1
VVV1
FFV1
@@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/.
quit
-real 0m1.429s
-user 0m1.363s
-sys 0m0.054s
-Code generation completed in 2 seconds
+real 0m1.461s
+user 0m1.381s
+sys 0m0.050s
+Code generation completed in 1 seconds
diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
index 1163910eb2..f222e5a6b5 100644
--- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
+++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g g g
No model currently active, so we import the Standard Model
INFO: load particles
INFO: load vertices
-DEBUG: model prefixing takes 0.005507230758666992 
+DEBUG: model prefixing takes 0.005517005920410156 
INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions 
DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED
INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1
INFO: Process has 1240 diagrams
-1 processes with 1240 diagrams generated in 1.909 s
+1 processes with 1240 diagrams generated in 1.861 s
Total: 1 processes with 1240 diagrams
output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
Load PLUGIN.CUDACPP_OUTPUT
@@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxggg
INFO: Computing Color-Flow optimization [15120 term]
INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -192,15 +192,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1
INFO: Finding symmetric diagrams for subprocess group gg_ttxggg
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.589 s
-Wrote files for 2281 helas calls in 18.549 s
+Generated helas calls for 1 subprocesses (1240 diagrams) in 6.528 s
+Wrote files for 2281 helas calls in 18.450 s
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 routines
ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates VVVV1 routines
ALOHA: aloha creates VVVV3 routines
ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 5 routines in 0.321 s
+ALOHA: aloha creates 5 routines in 0.314 s
DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 routines
@@ -208,7 +208,7 @@ ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates VVVV1 routines
ALOHA: aloha creates VVVV3 routines
ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 10 routines in 0.314 s
+ALOHA: aloha creates 10 routines in 0.309 s
VVV1
VVV1
FFV1
@@ -257,10 +257,10 @@ Type "launch" to generate events from this process, or see
Run "open index.html" to see more information about this process.
quit
-real 0m29.295s
-user 0m28.742s
-sys 0m0.442s
-Code generation completed in 29 seconds
+real 0m29.049s
+user 0m28.554s
+sys 0m0.393s
+Code generation completed in 30 seconds
************************************************************
* *
* W E L C O M E to *
diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
index 3eb5706f27..2720870321 100644
--- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
+++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g g g
No model currently active, so we import the Standard Model
INFO: load particles
INFO: load vertices
-DEBUG: model prefixing takes 0.005807638168334961 
+DEBUG: model prefixing takes 0.005664825439453125 
INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions 
DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED
INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1
INFO: Process has 1240 diagrams
-1 processes with 1240 diagrams generated in 1.890 s
+1 processes with 1240 diagrams generated in 1.872 s
Total: 1 processes with 1240 diagrams
output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg
Load PLUGIN.CUDACPP_OUTPUT
@@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM
FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h
FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc
INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/.
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.499 s
+Generated helas calls for 1 subprocesses (1240 diagrams) in 6.609 s
DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 routines
@@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates VVVV1 routines
ALOHA: aloha creates VVVV3 routines
ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 5 routines in 0.347 s
+ALOHA: aloha creates 5 routines in 0.345 s
VVV1
VVV1
FFV1
@@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/.
quit
-real 0m12.939s
-user 0m12.788s
-sys 0m0.093s
+real 0m12.978s
+user 0m12.813s
+sys 0m0.111s
Code generation completed in 13 seconds
diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt
index d46fea2318..bb803498ee 100644
--- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt
+++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
define q = u c d s u~ c~ d~ s~
INFO: load particles
INFO: load vertices
-DEBUG: model prefixing takes 0.005537271499633789 
+DEBUG: model prefixing takes 0.005455732345581055 
INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions 
DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams.
INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams.
INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams.
INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams.
-8 processes with 40 diagrams generated in 0.078 s
+8 processes with 40 diagrams generated in 0.077 s
Total: 8 processes with 40 diagrams
output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
Load PLUGIN.CUDACPP_OUTPUT
@@ -198,7 +198,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~
INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1
INFO: Creating files in directory P1_gu_ttxu
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -215,7 +215,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1
INFO: Finding symmetric diagrams for subprocess group gu_ttxu
INFO: Creating files in directory P1_gux_ttxux
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -230,17 +230,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1
INFO: Finding symmetric diagrams for subprocess group gux_ttxux
-Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s
-Wrote files for 32 helas calls in 0.219 s
+Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s
+Wrote files for 32 helas calls in 0.216 s
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates VVV1 routines
-ALOHA: aloha creates 2 routines in 0.145 s
+ALOHA: aloha creates 2 routines in 0.143 s
DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates VVV1 routines
-ALOHA: aloha creates 4 routines in 0.136 s
+ALOHA: aloha creates 4 routines in 0.130 s
FFV1
FFV1
FFV1
@@ -294,9 +294,9 @@ Type "launch" to generate events from this process, or see
Run "open index.html" to see more information about this process.
quit
-real 0m1.960s
-user 0m1.693s
-sys 0m0.242s
+real 0m1.916s
+user 0m1.672s
+sys 0m0.240s
Code generation completed in 2 seconds
************************************************************
* *
diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
index 93395d9159..5a07808142 100644
--- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
+++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
define q = u c d s u~ c~ d~ s~
INFO: load particles
INFO: load vertices
-DEBUG: model prefixing takes 0.005349397659301758 
+DEBUG: model prefixing takes 0.005926370620727539 
INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions 
DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams.
INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams.
INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams.
INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams.
-8 processes with 40 diagrams generated in 0.077 s
+8 processes with 40 diagrams generated in 0.082 s
Total: 8 processes with 40 diagrams
output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq
Load PLUGIN.CUDACPP_OUTPUT
@@ -211,7 +211,7 @@ Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates VVV1 routines
-ALOHA: aloha creates 2 routines in 0.142 s
+ALOHA: aloha creates 2 routines in 0.179 s
FFV1
FFV1
FFV1
@@ -227,7 +227,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/.
quit
-real 0m0.645s
-user 0m0.577s
-sys 0m0.059s
+real 0m1.076s
+user 0m0.601s
+sys 0m0.061s
Code generation completed in 1 seconds
diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt
index c56a4ed162..9bac4b3aae 100644
--- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt
+++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt
@@ -153,7 +153,7 @@ Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s
DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVS3 routines
-ALOHA: aloha creates 1 routines in 0.065 s
+ALOHA: aloha creates 1 routines in 0.060 s
VVS3
FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h
INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/.
@@ -165,7 +165,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory
INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/.
quit
-real 0m0.431s
-user 0m0.373s
-sys 0m0.054s
-Code generation completed in 0 seconds
+real 0m0.414s
+user 0m0.350s
+sys 0m0.059s
+Code generation completed in 1 seconds
diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
index bfc7dc3052..adfd21027c 100644
--- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
+++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
define j = p
INFO: load particles
INFO: load vertices
-DEBUG: model prefixing takes 0.005826234817504883 
+DEBUG: model prefixing takes 0.0053827762603759766 
INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions 
DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -172,7 +172,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~
INFO: Process c~ c > t t~ added to mirror process c c~ > t t~
INFO: Process d~ d > t t~ added to mirror process d d~ > t t~
INFO: Process s~ s > t t~ added to mirror process s s~ > t t~
-5 processes with 7 diagrams generated in 0.031 s
+5 processes with 7 diagrams generated in 0.029 s
Total: 5 processes with 7 diagrams
add process p p > t t~ j @1
INFO: Checking for minimal orders which gives processes.
@@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~
INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g
INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~
INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g
-13 processes with 76 diagrams generated in 0.144 s
+13 processes with 76 diagrams generated in 0.136 s
Total: 18 processes with 83 diagrams
add process p p > t t~ j j @2
INFO: Checking for minimal orders which gives processes.
@@ -378,7 +378,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~
INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~
INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~
INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams.
-65 processes with 1119 diagrams generated in 1.923 s
+65 processes with 1119 diagrams generated in 1.869 s
Total: 83 processes with 1202 diagrams
output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
Load PLUGIN.CUDACPP_OUTPUT
@@ -497,7 +497,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED
INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2
INFO: Creating files in directory P2_gg_ttxgg
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -514,7 +514,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2
INFO: Finding symmetric diagrams for subprocess group gg_ttxgg
INFO: Creating files in directory P2_gg_ttxuux
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -531,7 +531,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2
INFO: Finding symmetric diagrams for subprocess group gg_ttxuux
INFO: Creating files in directory P2_gu_ttxgu
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -548,7 +548,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2
INFO: Finding symmetric diagrams for subprocess group gu_ttxgu
INFO: Creating files in directory P2_gux_ttxgux
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -565,7 +565,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2
INFO: Finding symmetric diagrams for subprocess group gux_ttxgux
INFO: Creating files in directory P2_uux_ttxgg
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -582,7 +582,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2
INFO: Finding symmetric diagrams for subprocess group uux_ttxgg
INFO: Creating files in directory P1_gg_ttxg
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -599,7 +599,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1
INFO: Finding symmetric diagrams for subprocess group gg_ttxg
INFO: Creating files in directory P2_uu_ttxuu
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -616,7 +616,7 @@ INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2
INFO: Finding symmetric diagrams for subprocess group uu_ttxuu
INFO: Creating files in directory P2_uux_ttxuux
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -633,7 +633,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2
INFO: Finding symmetric diagrams for subprocess group uux_ttxuux
INFO: Creating files in directory P2_uxux_ttxuxux
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -650,7 +650,7 @@ INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2
INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux
INFO: Creating files in directory P2_uc_ttxuc
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -667,7 +667,7 @@ INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2
INFO: Finding symmetric diagrams for subprocess group uc_ttxuc
INFO: Creating files in directory P2_uux_ttxccx
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -684,7 +684,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2
INFO: Finding symmetric diagrams for subprocess group uux_ttxccx
INFO: Creating files in directory P2_ucx_ttxucx
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -701,7 +701,7 @@ INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2
INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx
INFO: Creating files in directory P2_uxcx_ttxuxcx
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -718,7 +718,7 @@ INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2
INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx
INFO: Creating files in directory P1_gu_ttxu
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -735,7 +735,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1
INFO: Finding symmetric diagrams for subprocess group gu_ttxu
INFO: Creating files in directory P1_gux_ttxux
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -752,7 +752,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1
INFO: Finding symmetric diagrams for subprocess group gux_ttxux
INFO: Creating files in directory P1_uux_ttxg
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -769,7 +769,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1
INFO: Finding symmetric diagrams for subprocess group uux_ttxg
INFO: Creating files in directory P0_gg_ttx
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -786,7 +786,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2
INFO: Finding symmetric diagrams for subprocess group gg_ttx
INFO: Creating files in directory P0_uux_ttx
DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
INFO: Creating files in directory .
FileWriter for ././CPPProcess.h
FileWriter for ././CPPProcess.cc
@@ -801,8 +801,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2
INFO: Finding symmetric diagrams for subprocess group uux_ttx
-Generated helas calls for 18 subprocesses (372 diagrams) in 1.345 s
-Wrote files for 810 helas calls in 3.264 s
+Generated helas calls for 18 subprocesses (372 diagrams) in 1.297 s
+Wrote files for 810 helas calls in 3.533 s
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 routines
ALOHA: aloha creates FFV1 routines
@@ -1028,9 +1028,9 @@ Type "launch" to generate events from this process, or see
Run "open index.html" to see more information about this process.
quit
-real 0m9.088s
-user 0m8.481s
-sys 0m0.545s
+real 0m9.184s
+user 0m8.370s
+sys 0m0.508s
Code generation completed in 9 seconds
************************************************************
* *